yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.parse
  21 import urllib.request
  22 import xml.etree.ElementTree
  23
  24 from ..compat import functools  # isort: split
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..networking.impersonate import ImpersonateTarget
  41 from ..utils import (
  42     IDENTITY,
  43     JSON_LD_RE,
  44     NO_DEFAULT,
  45     ExtractorError,
  46     FormatSorter,
  47     GeoRestrictedError,
  48     GeoUtils,
  49     LenientJSONDecoder,
  50     Popen,
  51     RegexNotFoundError,
  52     RetryManager,
  53     UnsupportedError,
  54     age_restricted,
  55     base_url,
  56     bug_reports_message,
  57     classproperty,
  58     clean_html,
  59     deprecation_warning,
  60     determine_ext,
  61     dict_get,
  62     encode_data_uri,
  63     error_to_compat_str,
  64     extract_attributes,
  65     filter_dict,
  66     fix_xml_ampersands,
  67     float_or_none,
  68     format_field,
  69     int_or_none,
  70     join_nonempty,
  71     js_to_json,
  72     mimetype2ext,
  73     netrc_from_content,
  74     orderedSet,
  75     parse_bitrate,
  76     parse_codecs,
  77     parse_duration,
  78     parse_iso8601,
  79     parse_m3u8_attributes,
  80     parse_resolution,
  81     sanitize_filename,
  82     sanitize_url,
  83     smuggle_url,
  84     str_or_none,
  85     str_to_int,
  86     strip_or_none,
  87     traverse_obj,
  88     truncate_string,
  89     try_call,
  90     try_get,
  91     unescapeHTML,
  92     unified_strdate,
  93     unified_timestamp,
  94     url_basename,
  95     url_or_none,
  96     urlhandle_detect_ext,
  97     urljoin,
  98     variadic,
  99     xpath_element,
 100     xpath_text,
 101     xpath_with_ns,
 102 )
 103
 104
 105 class InfoExtractor:
 106     """Information Extractor class.
 107
 108     Information extractors are the classes that, given a URL, extract
 109     information about the video (or videos) the URL refers to. This
 110     information includes the real video URL, the video title, author and
 111     others. The information is stored in a dictionary which is then
 112     passed to the YoutubeDL. The YoutubeDL processes this
 113     information possibly downloading the video to the file system, among
 114     other possible outcomes.
 115
 116     The type field determines the type of the result.
 117     By far the most common value (and the default if _type is missing) is
 118     "video", which indicates a single video.
 119
 120     For a video, the dictionaries must include the following fields:
 121
 122     id:             Video identifier.
 123     title:          Video title, unescaped. Set to an empty string if video has
 124                     no title as opposed to "None" which signifies that the
 125                     extractor failed to obtain a title
 126
 127     Additionally, it must contain either a formats entry or a url one:
 128
 129     formats:        A list of dictionaries for each format available, ordered
 130                     from worst to best quality.
 131
 132                     Potential fields:
 133                     * url        The mandatory URL representing the media:
 134                                    for plain file media - HTTP URL of this file,
 135                                    for RTMP - RTMP URL,
 136                                    for HLS - URL of the M3U8 media playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH
 139                                      - HTTP URL to plain file media (in case of
 140                                        unfragmented media)
 141                                      - URL of the MPD manifest or base URL
 142                                        representing the media if MPD manifest
 143                                        is parsed from a string (in case of
 144                                        fragmented media)
 145                                    for MSS - URL of the ISM manifest.
 146                     * request_data  Data to send in POST request to the URL
 147                     * manifest_url
 148                                  The URL of the manifest file in case of
 149                                  fragmented media:
 150                                    for HLS - URL of the M3U8 master playlist,
 151                                    for HDS - URL of the F4M manifest,
 152                                    for DASH - URL of the MPD manifest,
 153                                    for MSS - URL of the ISM manifest.
 154                     * manifest_stream_number  (For internal use only)
 155                                  The index of the stream in the manifest file
 156                     * ext        Will be calculated from URL if missing
 157                     * format     A human-readable description of the format
 158                                  ("mp4 container with h264/opus").
 159                                  Calculated from the format_id, width, height.
 160                                  and format_note fields if missing.
 161                     * format_id  A short description of the format
 162                                  ("mp4_h264_opus" or "19").
 163                                 Technically optional, but strongly recommended.
 164                     * format_note Additional info about the format
 165                                  ("3D" or "DASH video")
 166                     * width      Width of the video, if known
 167                     * height     Height of the video, if known
 168                     * aspect_ratio  Aspect ratio of the video, if known
 169                                  Automatically calculated from width and height
 170                     * resolution Textual description of width and height
 171                                  Automatically calculated from width and height
 172                     * dynamic_range The dynamic range of the video. One of:
 173                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 174                     * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
 175                     * abr        Average audio bitrate in kbps (1000 bits/sec)
 176                     * acodec     Name of the audio codec in use
 177                     * asr        Audio sampling rate in Hertz
 178                     * audio_channels  Number of audio channels
 179                     * vbr        Average video bitrate in kbps (1000 bits/sec)
 180                     * fps        Frame rate
 181                     * vcodec     Name of the video codec in use
 182                     * container  Name of the container format
 183                     * filesize   The number of bytes, if known in advance
 184                     * filesize_approx  An estimate for the number of bytes
 185                     * player_url SWF Player URL (used for rtmpdump).
 186                     * protocol   The protocol that will be used for the actual
 187                                  download, lower-case. One of "http", "https" or
 188                                  one of the protocols defined in downloader.PROTOCOL_MAP
 189                     * fragment_base_url
 190                                  Base URL for fragments. Each fragment's path
 191                                  value (if present) will be relative to
 192                                  this URL.
 193                     * fragments  A list of fragments of a fragmented media.
 194                                  Each fragment entry must contain either an url
 195                                  or a path. If an url is present it should be
 196                                  considered by a client. Otherwise both path and
 197                                  fragment_base_url must be present. Here is
 198                                  the list of all potential fields:
 199                                  * "url" - fragment's URL
 200                                  * "path" - fragment's path relative to
 201                                             fragment_base_url
 202                                  * "duration" (optional, int or float)
 203                                  * "filesize" (optional, int)
 204                     * is_from_start  Is a live format that can be downloaded
 205                                 from the start. Boolean
 206                     * preference Order number of this format. If this field is
 207                                  present and not None, the formats get sorted
 208                                  by this field, regardless of all other values.
 209                                  -1 for default (order by other properties),
 210                                  -2 or smaller for less than default.
 211                                  < -1000 to hide the format (if there is
 212                                     another one which is strictly better)
 213                     * language   Language code, e.g. "de" or "en-US".
 214                     * language_preference  Is this in the language mentioned in
 215                                  the URL?
 216                                  10 if it's what the URL is about,
 217                                  -1 for default (don't know),
 218                                  -10 otherwise, other values reserved for now.
 219                     * quality    Order number of the video quality of this
 220                                  format, irrespective of the file format.
 221                                  -1 for default (order by other properties),
 222                                  -2 or smaller for less than default.
 223                     * source_preference  Order number for this video source
 224                                   (quality takes higher priority)
 225                                  -1 for default (order by other properties),
 226                                  -2 or smaller for less than default.
 227                     * http_headers  A dictionary of additional HTTP headers
 228                                  to add to the request.
 229                     * stretched_ratio  If given and not 1, indicates that the
 230                                  video's pixels are not square.
 231                                  width : height ratio as float.
 232                     * no_resume  The server does not support resuming the
 233                                  (HTTP or RTMP) download. Boolean.
 234                     * has_drm    True if the format has DRM and cannot be downloaded.
 235                                  'maybe' if the format may have DRM and has to be tested before download.
 236                     * extra_param_to_segment_url  A query string to append to each
 237                                  fragment's URL, or to update each existing query string
 238                                  with. Only applied by the native HLS/DASH downloaders.
 239                     * hls_aes    A dictionary of HLS AES-128 decryption information
 240                                  used by the native HLS downloader to override the
 241                                  values in the media playlist when an '#EXT-X-KEY' tag
 242                                  is present in the playlist:
 243                                  * uri  The URI from which the key will be downloaded
 244                                  * key  The key (as hex) used to decrypt fragments.
 245                                         If `key` is given, any key URI will be ignored
 246                                  * iv   The IV (as hex) used to decrypt fragments
 247                     * downloader_options  A dictionary of downloader options
 248                                  (For internal use only)
 249                                  * http_chunk_size Chunk size for HTTP downloads
 250                                  * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
 251                                  * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
 252                     * is_dash_periods  Whether the format is a result of merging
 253                                  multiple DASH periods.
 254                     RTMP formats can also have the additional fields: page_url,
 255                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 256                     rtmp_protocol, rtmp_real_time
 257
 258     url:            Final video URL.
 259     ext:            Video filename extension.
 260     format:         The video format, defaults to ext (used for --get-format)
 261     player_url:     SWF Player URL (used for rtmpdump).
 262
 263     The following fields are optional:
 264
 265     direct:         True if a direct video file was given (must only be set by GenericIE)
 266     alt_title:      A secondary title of the video.
 267     display_id:     An alternative identifier for the video, not necessarily
 268                     unique, but available before title. Typically, id is
 269                     something like "4234987", title "Dancing naked mole rats",
 270                     and display_id "dancing-naked-mole-rats"
 271     thumbnails:     A list of dictionaries, with the following entries:
 272                         * "id" (optional, string) - Thumbnail format ID
 273                         * "url"
 274                         * "preference" (optional, int) - quality of the image
 275                         * "width" (optional, int)
 276                         * "height" (optional, int)
 277                         * "resolution" (optional, string "{width}x{height}",
 278                                         deprecated)
 279                         * "filesize" (optional, int)
 280                         * "http_headers" (dict) - HTTP headers for the request
 281     thumbnail:      Full URL to a video thumbnail image.
 282     description:    Full video description.
 283     uploader:       Full name of the video uploader.
 284     license:        License name the video is licensed under.
 285     creators:       List of creators of the video.
 286     timestamp:      UNIX timestamp of the moment the video was uploaded
 287     upload_date:    Video upload date in UTC (YYYYMMDD).
 288                     If not explicitly set, calculated from timestamp
 289     release_timestamp: UNIX timestamp of the moment the video was released.
 290                     If it is not clear whether to use timestamp or this, use the former
 291     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 292                     If not explicitly set, calculated from release_timestamp
 293     release_year:   Year (YYYY) as integer when the video or album was released.
 294                     To be used if no exact release date is known.
 295                     If not explicitly set, calculated from release_date.
 296     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 297     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 298                     If not explicitly set, calculated from modified_timestamp
 299     uploader_id:    Nickname or id of the video uploader.
 300     uploader_url:   Full URL to a personal webpage of the video uploader.
 301     channel:        Full name of the channel the video is uploaded on.
 302                     Note that channel fields may or may not repeat uploader
 303                     fields. This depends on a particular extractor.
 304     channel_id:     Id of the channel.
 305     channel_url:    Full URL to a channel webpage.
 306     channel_follower_count: Number of followers of the channel.
 307     channel_is_verified: Whether the channel is verified on the platform.
 308     location:       Physical location where the video was filmed.
 309     subtitles:      The available subtitles as a dictionary in the format
 310                     {tag: subformats}. "tag" is usually a language code, and
 311                     "subformats" is a list sorted from lower to higher
 312                     preference, each element is a dictionary with the "ext"
 313                     entry and one of:
 314                         * "data": The subtitles file contents
 315                         * "url": A URL pointing to the subtitles file
 316                     It can optionally also have:
 317                         * "name": Name or description of the subtitles
 318                         * "http_headers": A dictionary of additional HTTP headers
 319                                   to add to the request.
 320                     "ext" will be calculated from URL if missing
 321     automatic_captions: Like 'subtitles'; contains automatically generated
 322                     captions instead of normal subtitles
 323     duration:       Length of the video in seconds, as an integer or float.
 324     view_count:     How many users have watched the video on the platform.
 325     concurrent_view_count: How many users are currently watching the video on the platform.
 326     like_count:     Number of positive ratings of the video
 327     dislike_count:  Number of negative ratings of the video
 328     repost_count:   Number of reposts of the video
 329     average_rating: Average rating give by users, the scale used depends on the webpage
 330     comment_count:  Number of comments on the video
 331     comments:       A list of comments, each with one or more of the following
 332                     properties (all but one of text or html optional):
 333                         * "author" - human-readable name of the comment author
 334                         * "author_id" - user ID of the comment author
 335                         * "author_thumbnail" - The thumbnail of the comment author
 336                         * "author_url" - The url to the comment author's page
 337                         * "author_is_verified" - Whether the author is verified
 338                                                  on the platform
 339                         * "author_is_uploader" - Whether the comment is made by
 340                                                  the video uploader
 341                         * "id" - Comment ID
 342                         * "html" - Comment as HTML
 343                         * "text" - Plain text of the comment
 344                         * "timestamp" - UNIX timestamp of comment
 345                         * "parent" - ID of the comment this one is replying to.
 346                                      Set to "root" to indicate that this is a
 347                                      comment to the original video.
 348                         * "like_count" - Number of positive ratings of the comment
 349                         * "dislike_count" - Number of negative ratings of the comment
 350                         * "is_favorited" - Whether the comment is marked as
 351                                            favorite by the video uploader
 352                         * "is_pinned" - Whether the comment is pinned to
 353                                         the top of the comments
 354     age_limit:      Age restriction for the video, as an integer (years)
 355     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 356                     should allow to get the same result again. (It will be set
 357                     by YoutubeDL if it's missing)
 358     categories:     A list of categories that the video falls in, for example
 359                     ["Sports", "Berlin"]
 360     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 361     cast:           A list of the video cast
 362     is_live:        True, False, or None (=unknown). Whether this video is a
 363                     live stream that goes on instead of a fixed-length video.
 364     was_live:       True, False, or None (=unknown). Whether this video was
 365                     originally a live stream.
 366     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 367                     or 'post_live' (was live, but VOD is not yet processed)
 368                     If absent, automatically set from is_live, was_live
 369     start_time:     Time in seconds where the reproduction should start, as
 370                     specified in the URL.
 371     end_time:       Time in seconds where the reproduction should end, as
 372                     specified in the URL.
 373     chapters:       A list of dictionaries, with the following entries:
 374                         * "start_time" - The start time of the chapter in seconds
 375                         * "end_time" - The end time of the chapter in seconds
 376                         * "title" (optional, string)
 377     heatmap:        A list of dictionaries, with the following entries:
 378                         * "start_time" - The start time of the data point in seconds
 379                         * "end_time" - The end time of the data point in seconds
 380                         * "value" - The normalized value of the data point (float between 0 and 1)
 381     playable_in_embed: Whether this video is allowed to play in embedded
 382                     players on other sites. Can be True (=always allowed),
 383                     False (=never allowed), None (=unknown), or a string
 384                     specifying the criteria for embedability; e.g. 'whitelist'
 385     availability:   Under what condition the video is available. One of
 386                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 387                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 388                     to set it
 389     media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
 390     _old_archive_ids: A list of old archive ids needed for backward compatibility
 391     _format_sort_fields: A list of fields to use for sorting formats
 392     __post_extractor: A function to be called just before the metadata is
 393                     written to either disk, logger or console. The function
 394                     must return a dict which will be added to the info_dict.
 395                     This is usefull for additional information that is
 396                     time-consuming to extract. Note that the fields thus
 397                     extracted will not be available to output template and
 398                     match_filter. So, only "comments" and "comment_count" are
 399                     currently allowed to be extracted via this method.
 400
 401     The following fields should only be used when the video belongs to some logical
 402     chapter or section:
 403
 404     chapter:        Name or title of the chapter the video belongs to.
 405     chapter_number: Number of the chapter the video belongs to, as an integer.
 406     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 407
 408     The following fields should only be used when the video is an episode of some
 409     series, programme or podcast:
 410
 411     series:         Title of the series or programme the video episode belongs to.
 412     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 413     season:         Title of the season the video episode belongs to.
 414     season_number:  Number of the season the video episode belongs to, as an integer.
 415     season_id:      Id of the season the video episode belongs to, as a unicode string.
 416     episode:        Title of the video episode. Unlike mandatory video title field,
 417                     this field should denote the exact title of the video episode
 418                     without any kind of decoration.
 419     episode_number: Number of the video episode within a season, as an integer.
 420     episode_id:     Id of the video episode, as a unicode string.
 421
 422     The following fields should only be used when the media is a track or a part of
 423     a music album:
 424
 425     track:          Title of the track.
 426     track_number:   Number of the track within an album or a disc, as an integer.
 427     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 428                     as a unicode string.
 429     artists:        List of artists of the track.
 430     composers:      List of composers of the piece.
 431     genres:         List of genres of the track.
 432     album:          Title of the album the track belongs to.
 433     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 434     album_artists:  List of all artists appeared on the album.
 435                     E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
 436                     Useful for splits and compilations.
 437     disc_number:    Number of the disc or other physical medium the track belongs to,
 438                     as an integer.
 439
 440     The following fields should only be set for clips that should be cut from the original video:
 441
 442     section_start:  Start time of the section in seconds
 443     section_end:    End time of the section in seconds
 444
 445     The following fields should only be set for storyboards:
 446     rows:           Number of rows in each storyboard fragment, as an integer
 447     columns:        Number of columns in each storyboard fragment, as an integer
 448
 449     The following fields are deprecated and should not be set by new code:
 450     composer:       Use "composers" instead.
 451                     Composer(s) of the piece, comma-separated.
 452     artist:         Use "artists" instead.
 453                     Artist(s) of the track, comma-separated.
 454     genre:          Use "genres" instead.
 455                     Genre(s) of the track, comma-separated.
 456     album_artist:   Use "album_artists" instead.
 457                     All artists appeared on the album, comma-separated.
 458     creator:        Use "creators" instead.
 459                     The creator of the video.
 460
 461     Unless mentioned otherwise, the fields should be Unicode strings.
 462
 463     Unless mentioned otherwise, None is equivalent to absence of information.
 464
 465
 466     _type "playlist" indicates multiple videos.
 467     There must be a key "entries", which is a list, an iterable, or a PagedList
 468     object, each element of which is a valid dictionary by this specification.
 469
 470     Additionally, playlists can have "id", "title", and any other relevant
 471     attributes with the same semantics as videos (see above).
 472
 473     It can also have the following optional fields:
 474
 475     playlist_count: The total number of videos in a playlist. If not given,
 476                     YoutubeDL tries to calculate it from "entries"
 477
 478
 479     _type "multi_video" indicates that there are multiple videos that
 480     form a single show, for examples multiple acts of an opera or TV episode.
 481     It must have an entries key like a playlist and contain all the keys
 482     required for a video at the same time.
 483
 484
 485     _type "url" indicates that the video must be extracted from another
 486     location, possibly by a different extractor. Its only required key is:
 487     "url" - the next URL to extract.
 488     The key "ie_key" can be set to the class name (minus the trailing "IE",
 489     e.g. "Youtube") if the extractor class is known in advance.
 490     Additionally, the dictionary may have any properties of the resolved entity
 491     known in advance, for example "title" if the title of the referred video is
 492     known ahead of time.
 493
 494
 495     _type "url_transparent" entities have the same specification as "url", but
 496     indicate that the given additional information is more precise than the one
 497     associated with the resolved URL.
 498     This is useful when a site employs a video service that hosts the video and
 499     its technical metadata, but that video service does not embed a useful
 500     title, description etc.
 501
 502
 503     Subclasses of this should also be added to the list of extractors and
 504     should define _VALID_URL as a regexp or a Sequence of regexps, and
 505     re-define the _real_extract() and (optionally) _real_initialize() methods.
 506
 507     Subclasses may also override suitable() if necessary, but ensure the function
 508     signature is preserved and that this function imports everything it needs
 509     (except other extractors), so that lazy_extractors works correctly.
 510
 511     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 512     the HTML of Generic webpages. It may also override _extract_embed_urls
 513     or _extract_from_webpage as necessary. While these are normally classmethods,
 514     _extract_from_webpage is allowed to be an instance method.
 515
 516     _extract_from_webpage may raise self.StopExtraction() to stop further
 517     processing of the webpage and obtain exclusive rights to it. This is useful
 518     when the extractor cannot reliably be matched using just the URL,
 519     e.g. invidious/peertube instances
 520
 521     Embed-only extractors can be defined by setting _VALID_URL = False.
 522
 523     To support username + password (or netrc) login, the extractor must define a
 524     _NETRC_MACHINE and re-define _perform_login(username, password) and
 525     (optionally) _initialize_pre_login() methods. The _perform_login method will
 526     be called between _initialize_pre_login and _real_initialize if credentials
 527     are passed by the user. In cases where it is necessary to have the login
 528     process as part of the extraction rather than initialization, _perform_login
 529     can be left undefined.
 530
 531     _GEO_BYPASS attribute may be set to False in order to disable
 532     geo restriction bypass mechanisms for a particular extractor.
 533     Though it won't disable explicit geo restriction bypass based on
 534     country code provided with geo_bypass_country.
 535
 536     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 537     countries for this extractor. One of these countries will be used by
 538     geo restriction bypass mechanism right away in order to bypass
 539     geo restriction, of course, if the mechanism is not disabled.
 540
 541     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 542     IP blocks in CIDR notation for this extractor. One of these IP blocks
 543     will be used by geo restriction bypass mechanism similarly
 544     to _GEO_COUNTRIES.
 545
 546     The _ENABLED attribute should be set to False for IEs that
 547     are disabled by default and must be explicitly enabled.
 548
 549     The _WORKING attribute should be set to False for broken IEs
 550     in order to warn the users and skip the tests.
 551     """
 552
 553     _ready = False
 554     _downloader = None
 555     _x_forwarded_for_ip = None
 556     _GEO_BYPASS = True
 557     _GEO_COUNTRIES = None
 558     _GEO_IP_BLOCKS = None
 559     _WORKING = True
 560     _ENABLED = True
 561     _NETRC_MACHINE = None
 562     IE_DESC = None
 563     SEARCH_KEY = None
 564     _VALID_URL = None
 565     _EMBED_REGEX = []
 566
 567     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 568         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 569         return {
 570             None: '',
 571             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 572             'password': f'Use {password_hint}',
 573             'cookies': (
 574                 'Use --cookies-from-browser or --cookies for the authentication. '
 575                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 576         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 577
 578     def __init__(self, downloader=None):
 579         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 580         If a downloader is not passed during initialization,
 581         it must be set using "set_downloader()" before "extract()" is called"""
 582         self._ready = False
 583         self._x_forwarded_for_ip = None
 584         self._printed_messages = set()
 585         self.set_downloader(downloader)
 586
 587     @classmethod
 588     def _match_valid_url(cls, url):
 589         if cls._VALID_URL is False:
 590             return None
 591         # This does not use has/getattr intentionally - we want to know whether
 592         # we have cached the regexp for *this* class, whereas getattr would also
 593         # match the superclass
 594         if '_VALID_URL_RE' not in cls.__dict__:
 595             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 596         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 597
 598     @classmethod
 599     def suitable(cls, url):
 600         """Receives a URL and returns True if suitable for this IE."""
 601         # This function must import everything it needs (except other extractors),
 602         # so that lazy_extractors works correctly
 603         return cls._match_valid_url(url) is not None
 604
 605     @classmethod
 606     def _match_id(cls, url):
 607         return cls._match_valid_url(url).group('id')
 608
 609     @classmethod
 610     def get_temp_id(cls, url):
 611         try:
 612             return cls._match_id(url)
 613         except (IndexError, AttributeError):
 614             return None
 615
 616     @classmethod
 617     def working(cls):
 618         """Getter method for _WORKING."""
 619         return cls._WORKING
 620
 621     @classmethod
 622     def supports_login(cls):
 623         return bool(cls._NETRC_MACHINE)
 624
 625     def initialize(self):
 626         """Initializes an instance (authentication, etc)."""
 627         self._printed_messages = set()
 628         self._initialize_geo_bypass({
 629             'countries': self._GEO_COUNTRIES,
 630             'ip_blocks': self._GEO_IP_BLOCKS,
 631         })
 632         if not self._ready:
 633             self._initialize_pre_login()
 634             if self.supports_login():
 635                 username, password = self._get_login_info()
 636                 if username:
 637                     self._perform_login(username, password)
 638             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 639                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 640             self._real_initialize()
 641             self._ready = True
 642
 643     def _initialize_geo_bypass(self, geo_bypass_context):
 644         """
 645         Initialize geo restriction bypass mechanism.
 646
 647         This method is used to initialize geo bypass mechanism based on faking
 648         X-Forwarded-For HTTP header. A random country from provided country list
 649         is selected and a random IP belonging to this country is generated. This
 650         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 651         HTTP requests.
 652
 653         This method will be used for initial geo bypass mechanism initialization
 654         during the instance initialization with _GEO_COUNTRIES and
 655         _GEO_IP_BLOCKS.
 656
 657         You may also manually call it from extractor's code if geo bypass
 658         information is not available beforehand (e.g. obtained during
 659         extraction) or due to some other reason. In this case you should pass
 660         this information in geo bypass context passed as first argument. It may
 661         contain following fields:
 662
 663         countries:  List of geo unrestricted countries (similar
 664                     to _GEO_COUNTRIES)
 665         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 666                     (similar to _GEO_IP_BLOCKS)
 667
 668         """
 669         if not self._x_forwarded_for_ip:
 670
 671             # Geo bypass mechanism is explicitly disabled by user
 672             if not self.get_param('geo_bypass', True):
 673                 return
 674
 675             if not geo_bypass_context:
 676                 geo_bypass_context = {}
 677
 678             # Backward compatibility: previously _initialize_geo_bypass
 679             # expected a list of countries, some 3rd party code may still use
 680             # it this way
 681             if isinstance(geo_bypass_context, (list, tuple)):
 682                 geo_bypass_context = {
 683                     'countries': geo_bypass_context,
 684                 }
 685
 686             # The whole point of geo bypass mechanism is to fake IP
 687             # as X-Forwarded-For HTTP header based on some IP block or
 688             # country code.
 689
 690             # Path 1: bypassing based on IP block in CIDR notation
 691
 692             # Explicit IP block specified by user, use it right away
 693             # regardless of whether extractor is geo bypassable or not
 694             ip_block = self.get_param('geo_bypass_ip_block', None)
 695
 696             # Otherwise use random IP block from geo bypass context but only
 697             # if extractor is known as geo bypassable
 698             if not ip_block:
 699                 ip_blocks = geo_bypass_context.get('ip_blocks')
 700                 if self._GEO_BYPASS and ip_blocks:
 701                     ip_block = random.choice(ip_blocks)
 702
 703             if ip_block:
 704                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 705                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 706                 return
 707
 708             # Path 2: bypassing based on country code
 709
 710             # Explicit country code specified by user, use it right away
 711             # regardless of whether extractor is geo bypassable or not
 712             country = self.get_param('geo_bypass_country', None)
 713
 714             # Otherwise use random country code from geo bypass context but
 715             # only if extractor is known as geo bypassable
 716             if not country:
 717                 countries = geo_bypass_context.get('countries')
 718                 if self._GEO_BYPASS and countries:
 719                     country = random.choice(countries)
 720
 721             if country:
 722                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 723                 self._downloader.write_debug(
 724                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 725
 726     def extract(self, url):
 727         """Extracts URL information and returns it in list of dicts."""
 728         try:
 729             for _ in range(2):
 730                 try:
 731                     self.initialize()
 732                     self.to_screen('Extracting URL: %s' % (
 733                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 734                     ie_result = self._real_extract(url)
 735                     if ie_result is None:
 736                         return None
 737                     if self._x_forwarded_for_ip:
 738                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 739                     subtitles = ie_result.get('subtitles') or {}
 740                     if 'no-live-chat' in self.get_param('compat_opts'):
 741                         for lang in ('live_chat', 'comments', 'danmaku'):
 742                             subtitles.pop(lang, None)
 743                     return ie_result
 744                 except GeoRestrictedError as e:
 745                     if self.__maybe_fake_ip_and_retry(e.countries):
 746                         continue
 747                     raise
 748         except UnsupportedError:
 749             raise
 750         except ExtractorError as e:
 751             e.video_id = e.video_id or self.get_temp_id(url)
 752             e.ie = e.ie or self.IE_NAME
 753             e.traceback = e.traceback or sys.exc_info()[2]
 754             raise
 755         except IncompleteRead as e:
 756             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 757         except (KeyError, StopIteration) as e:
 758             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 759
 760     def __maybe_fake_ip_and_retry(self, countries):
 761         if (not self.get_param('geo_bypass_country', None)
 762                 and self._GEO_BYPASS
 763                 and self.get_param('geo_bypass', True)
 764                 and not self._x_forwarded_for_ip
 765                 and countries):
 766             country_code = random.choice(countries)
 767             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 768             if self._x_forwarded_for_ip:
 769                 self.report_warning(
 770                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 771                     % (self._x_forwarded_for_ip, country_code.upper()))
 772                 return True
 773         return False
 774
 775     def set_downloader(self, downloader):
 776         """Sets a YoutubeDL instance as the downloader for this IE."""
 777         self._downloader = downloader
 778
 779     @property
 780     def cache(self):
 781         return self._downloader.cache
 782
 783     @property
 784     def cookiejar(self):
 785         return self._downloader.cookiejar
 786
 787     def _initialize_pre_login(self):
 788         """ Initialization before login. Redefine in subclasses."""
 789         pass
 790
 791     def _perform_login(self, username, password):
 792         """ Login with username and password. Redefine in subclasses."""
 793         pass
 794
 795     def _real_initialize(self):
 796         """Real initialization process. Redefine in subclasses."""
 797         pass
 798
 799     def _real_extract(self, url):
 800         """Real extraction process. Redefine in subclasses."""
 801         raise NotImplementedError('This method must be implemented by subclasses')
 802
 803     @classmethod
 804     def ie_key(cls):
 805         """A string for getting the InfoExtractor with get_info_extractor"""
 806         return cls.__name__[:-2]
 807
 808     @classproperty
 809     def IE_NAME(cls):
 810         return cls.__name__[:-2]
 811
 812     @staticmethod
 813     def __can_accept_status_code(err, expected_status):
 814         assert isinstance(err, HTTPError)
 815         if expected_status is None:
 816             return False
 817         elif callable(expected_status):
 818             return expected_status(err.status) is True
 819         else:
 820             return err.status in variadic(expected_status)
 821
 822     def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
 823         if isinstance(url_or_request, urllib.request.Request):
 824             self._downloader.deprecation_warning(
 825                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 826                 'Use yt_dlp.networking.common.Request instead.')
 827             url_or_request = urllib_req_to_req(url_or_request)
 828         elif not isinstance(url_or_request, Request):
 829             url_or_request = Request(url_or_request)
 830
 831         url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
 832         return url_or_request
 833
 834     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
 835                          headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
 836         """
 837         Return the response handle.
 838
 839         See _download_webpage docstring for arguments specification.
 840         """
 841         if not self._downloader._first_webpage_request:
 842             sleep_interval = self.get_param('sleep_interval_requests') or 0
 843             if sleep_interval > 0:
 844                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 845                 time.sleep(sleep_interval)
 846         else:
 847             self._downloader._first_webpage_request = False
 848
 849         if note is None:
 850             self.report_download_webpage(video_id)
 851         elif note is not False:
 852             if video_id is None:
 853                 self.to_screen(str(note))
 854             else:
 855                 self.to_screen(f'{video_id}: {note}')
 856
 857         # Some sites check X-Forwarded-For HTTP header in order to figure out
 858         # the origin of the client behind proxy. This allows bypassing geo
 859         # restriction by faking this header's value to IP that belongs to some
 860         # geo unrestricted country. We will do so once we encounter any
 861         # geo restriction error.
 862         if self._x_forwarded_for_ip:
 863             headers = (headers or {}).copy()
 864             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 865
 866         extensions = {}
 867
 868         if impersonate in (True, ''):
 869             impersonate = ImpersonateTarget()
 870         requested_targets = [
 871             t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
 872             for t in variadic(impersonate)
 873         ] if impersonate else []
 874
 875         available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
 876         if available_target:
 877             extensions['impersonate'] = available_target
 878         elif requested_targets:
 879             message = 'The extractor is attempting impersonation, but '
 880             message += (
 881                 'no impersonate target is available' if not str(impersonate)
 882                 else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
 883             info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
 884                         'for information on installing the required dependencies')
 885             if require_impersonation:
 886                 raise ExtractorError(f'{message}; {info_msg}', expected=True)
 887             self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
 888
 889         try:
 890             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
 891         except network_exceptions as err:
 892             if isinstance(err, HTTPError):
 893                 if self.__can_accept_status_code(err, expected_status):
 894                     return err.response
 895
 896             if errnote is False:
 897                 return False
 898             if errnote is None:
 899                 errnote = 'Unable to download webpage'
 900
 901             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 902             if fatal:
 903                 raise ExtractorError(errmsg, cause=err)
 904             else:
 905                 self.report_warning(errmsg)
 906                 return False
 907
 908     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 909                                  encoding=None, data=None, headers={}, query={}, expected_status=None,
 910                                  impersonate=None, require_impersonation=False):
 911         """
 912         Return a tuple (page content as string, URL handle).
 913
 914         Arguments:
 915         url_or_request -- plain text URL as a string or
 916             a yt_dlp.networking.Request object
 917         video_id -- Video/playlist/item identifier (string)
 918
 919         Keyword arguments:
 920         note -- note printed before downloading (string)
 921         errnote -- note printed in case of an error (string)
 922         fatal -- flag denoting whether error should be considered fatal,
 923             i.e. whether it should cause ExtractionError to be raised,
 924             otherwise a warning will be reported and extraction continued
 925         encoding -- encoding for a page content decoding, guessed automatically
 926             when not explicitly specified
 927         data -- POST data (bytes)
 928         headers -- HTTP headers (dict)
 929         query -- URL query (dict)
 930         expected_status -- allows to accept failed HTTP requests (non 2xx
 931             status code) by explicitly specifying a set of accepted status
 932             codes. Can be any of the following entities:
 933                 - an integer type specifying an exact failed status code to
 934                   accept
 935                 - a list or a tuple of integer types specifying a list of
 936                   failed status codes to accept
 937                 - a callable accepting an actual failed status code and
 938                   returning True if it should be accepted
 939             Note that this argument does not affect success status codes (2xx)
 940             which are always accepted.
 941         impersonate -- the impersonate target. Can be any of the following entities:
 942                 - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
 943                 - a string in the format of CLIENT[:OS]
 944                 - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
 945                 - a boolean value; True means any impersonate target is sufficient
 946         require_impersonation -- flag to toggle whether the request should raise an error
 947             if impersonation is not possible (bool, default: False)
 948         """
 949
 950         # Strip hashes from the URL (#1038)
 951         if isinstance(url_or_request, str):
 952             url_or_request = url_or_request.partition('#')[0]
 953
 954         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
 955                                      headers=headers, query=query, expected_status=expected_status,
 956                                      impersonate=impersonate, require_impersonation=require_impersonation)
 957         if urlh is False:
 958             assert not fatal
 959             return False
 960         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
 961                                              encoding=encoding, data=data)
 962         return (content, urlh)
 963
 964     @staticmethod
 965     def _guess_encoding_from_content(content_type, webpage_bytes):
 966         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 967         if m:
 968             encoding = m.group(1)
 969         else:
 970             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 971                           webpage_bytes[:1024])
 972             if m:
 973                 encoding = m.group(1).decode('ascii')
 974             elif webpage_bytes.startswith(b'\xff\xfe'):
 975                 encoding = 'utf-16'
 976             else:
 977                 encoding = 'utf-8'
 978
 979         return encoding
 980
 981     def __check_blocked(self, content):
 982         first_block = content[:512]
 983         if ('<title>Access to this site is blocked</title>' in content
 984                 and 'Websense' in first_block):
 985             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 986             blocked_iframe = self._html_search_regex(
 987                 r'<iframe src="([^"]+)"', content,
 988                 'Websense information URL', default=None)
 989             if blocked_iframe:
 990                 msg += ' Visit %s for more details' % blocked_iframe
 991             raise ExtractorError(msg, expected=True)
 992         if '<title>The URL you requested has been blocked</title>' in first_block:
 993             msg = (
 994                 'Access to this webpage has been blocked by Indian censorship. '
 995                 'Use a VPN or proxy server (with --proxy) to route around it.')
 996             block_msg = self._html_search_regex(
 997                 r'</h1><p>(.*?)</p>',
 998                 content, 'block message', default=None)
 999             if block_msg:
1000                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
1001             raise ExtractorError(msg, expected=True)
1002         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
1003                 and 'blocklist.rkn.gov.ru' in content):
1004             raise ExtractorError(
1005                 'Access to this webpage has been blocked by decision of the Russian government. '
1006                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
1007                 expected=True)
1008
1009     def _request_dump_filename(self, url, video_id, data=None):
1010         if data is not None:
1011             data = hashlib.md5(data).hexdigest()
1012         basen = join_nonempty(video_id, data, url, delim='_')
1013         trim_length = self.get_param('trim_file_name') or 240
1014         if len(basen) > trim_length:
1015             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
1016             basen = basen[:trim_length - len(h)] + h
1017         filename = sanitize_filename(f'{basen}.dump', restricted=True)
1018         # Working around MAX_PATH limitation on Windows (see
1019         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
1020         if compat_os_name == 'nt':
1021             absfilepath = os.path.abspath(filename)
1022             if len(absfilepath) > 259:
1023                 filename = fR'\\?\{absfilepath}'
1024         return filename
1025
1026     def __decode_webpage(self, webpage_bytes, encoding, headers):
1027         if not encoding:
1028             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
1029         try:
1030             return webpage_bytes.decode(encoding, 'replace')
1031         except LookupError:
1032             return webpage_bytes.decode('utf-8', 'replace')
1033
1034     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
1035                               prefix=None, encoding=None, data=None):
1036         webpage_bytes = urlh.read()
1037         if prefix is not None:
1038             webpage_bytes = prefix + webpage_bytes
1039         if self.get_param('dump_intermediate_pages', False):
1040             self.to_screen('Dumping request to ' + urlh.url)
1041             dump = base64.b64encode(webpage_bytes).decode('ascii')
1042             self._downloader.to_screen(dump)
1043         if self.get_param('write_pages'):
1044             if isinstance(url_or_request, Request):
1045                 data = self._create_request(url_or_request, data).data
1046             filename = self._request_dump_filename(urlh.url, video_id, data)
1047             self.to_screen(f'Saving request to {filename}')
1048             with open(filename, 'wb') as outf:
1049                 outf.write(webpage_bytes)
1050
1051         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
1052         self.__check_blocked(content)
1053
1054         return content
1055
1056     def __print_error(self, errnote, fatal, video_id, err):
1057         if fatal:
1058             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1059         elif errnote:
1060             self.report_warning(f'{video_id}: {errnote}: {err}')
1061
1062     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1063         if transform_source:
1064             xml_string = transform_source(xml_string)
1065         try:
1066             return compat_etree_fromstring(xml_string.encode('utf-8'))
1067         except xml.etree.ElementTree.ParseError as ve:
1068             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1069
1070     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1071         try:
1072             return json.loads(
1073                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1074         except ValueError as ve:
1075             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1076
1077     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1078         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1079
1080     def __create_download_methods(name, parser, note, errnote, return_value):
1081
1082         def parse(ie, content, *args, errnote=errnote, **kwargs):
1083             if parser is None:
1084                 return content
1085             if errnote is False:
1086                 kwargs['errnote'] = errnote
1087             # parser is fetched by name so subclasses can override it
1088             return getattr(ie, parser)(content, *args, **kwargs)
1089
1090         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1091                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
1092                             impersonate=None, require_impersonation=False):
1093             res = self._download_webpage_handle(
1094                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1095                 data=data, headers=headers, query=query, expected_status=expected_status,
1096                 impersonate=impersonate, require_impersonation=require_impersonation)
1097             if res is False:
1098                 return res
1099             content, urlh = res
1100             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1101
1102         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1103                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
1104                              impersonate=None, require_impersonation=False):
1105             if self.get_param('load_pages'):
1106                 url_or_request = self._create_request(url_or_request, data, headers, query)
1107                 filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
1108                 self.to_screen(f'Loading request from {filename}')
1109                 try:
1110                     with open(filename, 'rb') as dumpf:
1111                         webpage_bytes = dumpf.read()
1112                 except OSError as e:
1113                     self.report_warning(f'Unable to load request from disk: {e}')
1114                 else:
1115                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1116                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1117             kwargs = {
1118                 'note': note,
1119                 'errnote': errnote,
1120                 'transform_source': transform_source,
1121                 'fatal': fatal,
1122                 'encoding': encoding,
1123                 'data': data,
1124                 'headers': headers,
1125                 'query': query,
1126                 'expected_status': expected_status,
1127                 'impersonate': impersonate,
1128                 'require_impersonation': require_impersonation,
1129             }
1130             if parser is None:
1131                 kwargs.pop('transform_source')
1132             # The method is fetched by name so subclasses can override _download_..._handle
1133             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1134             return res if res is False else res[0]
1135
1136         def impersonate(func, name, return_value):
1137             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1138             func.__doc__ = f'''
1139                 @param transform_source     Apply this transformation before parsing
1140                 @returns                    {return_value}
1141
1142                 See _download_webpage_handle docstring for other arguments specification
1143             '''
1144
1145         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1146         impersonate(download_content, f'_download_{name}', f'{return_value}')
1147         return download_handle, download_content
1148
1149     _download_xml_handle, _download_xml = __create_download_methods(
1150         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1151     _download_json_handle, _download_json = __create_download_methods(
1152         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1153     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1154         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1155     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1156
1157     def _download_webpage(
1158             self, url_or_request, video_id, note=None, errnote=None,
1159             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1160         """
1161         Return the data of the page as a string.
1162
1163         Keyword arguments:
1164         tries -- number of tries
1165         timeout -- sleep interval between tries
1166
1167         See _download_webpage_handle docstring for other arguments specification.
1168         """
1169
1170         R''' # NB: These are unused; should they be deprecated?
1171         if tries != 1:
1172             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1173         if timeout is NO_DEFAULT:
1174             timeout = 5
1175         else:
1176             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1177         '''
1178
1179         try_count = 0
1180         while True:
1181             try:
1182                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1183             except IncompleteRead as e:
1184                 try_count += 1
1185                 if try_count >= tries:
1186                     raise e
1187                 self._sleep(timeout, video_id)
1188
1189     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1190         idstr = format_field(video_id, None, '%s: ')
1191         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1192         if only_once:
1193             if f'WARNING: {msg}' in self._printed_messages:
1194                 return
1195             self._printed_messages.add(f'WARNING: {msg}')
1196         self._downloader.report_warning(msg, *args, **kwargs)
1197
1198     def to_screen(self, msg, *args, **kwargs):
1199         """Print msg to screen, prefixing it with '[ie_name]'"""
1200         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1201
1202     def write_debug(self, msg, *args, **kwargs):
1203         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1204
1205     def get_param(self, name, default=None, *args, **kwargs):
1206         if self._downloader:
1207             return self._downloader.params.get(name, default, *args, **kwargs)
1208         return default
1209
1210     def report_drm(self, video_id, partial=NO_DEFAULT):
1211         if partial is not NO_DEFAULT:
1212             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1213         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1214
1215     def report_extraction(self, id_or_name):
1216         """Report information extraction."""
1217         self.to_screen('%s: Extracting information' % id_or_name)
1218
1219     def report_download_webpage(self, video_id):
1220         """Report webpage download."""
1221         self.to_screen('%s: Downloading webpage' % video_id)
1222
1223     def report_age_confirmation(self):
1224         """Report attempt to confirm age."""
1225         self.to_screen('Confirming age')
1226
1227     def report_login(self):
1228         """Report attempt to log in."""
1229         self.to_screen('Logging in')
1230
1231     def raise_login_required(
1232             self, msg='This video is only available for registered users',
1233             metadata_available=False, method=NO_DEFAULT):
1234         if metadata_available and (
1235                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1236             self.report_warning(msg)
1237             return
1238         msg += format_field(self._login_hint(method), None, '. %s')
1239         raise ExtractorError(msg, expected=True)
1240
1241     def raise_geo_restricted(
1242             self, msg='This video is not available from your location due to geo restriction',
1243             countries=None, metadata_available=False):
1244         if metadata_available and (
1245                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1246             self.report_warning(msg)
1247         else:
1248             raise GeoRestrictedError(msg, countries=countries)
1249
1250     def raise_no_formats(self, msg, expected=False, video_id=None):
1251         if expected and (
1252                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1253             self.report_warning(msg, video_id)
1254         elif isinstance(msg, ExtractorError):
1255             raise msg
1256         else:
1257             raise ExtractorError(msg, expected=expected, video_id=video_id)
1258
1259     # Methods for following #608
1260     @staticmethod
1261     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1262         """Returns a URL that points to a page that should be processed"""
1263         if ie is not None:
1264             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1265         if video_id is not None:
1266             kwargs['id'] = video_id
1267         if video_title is not None:
1268             kwargs['title'] = video_title
1269         return {
1270             **kwargs,
1271             '_type': 'url_transparent' if url_transparent else 'url',
1272             'url': url,
1273         }
1274
1275     @classmethod
1276     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1277                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1278         return cls.playlist_result(
1279             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1280             playlist_id, playlist_title, **kwargs)
1281
1282     @staticmethod
1283     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1284         """Returns a playlist"""
1285         if playlist_id:
1286             kwargs['id'] = playlist_id
1287         if playlist_title:
1288             kwargs['title'] = playlist_title
1289         if playlist_description is not None:
1290             kwargs['description'] = playlist_description
1291         return {
1292             **kwargs,
1293             '_type': 'multi_video' if multi_video else 'playlist',
1294             'entries': entries,
1295         }
1296
1297     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1298         """
1299         Perform a regex search on the given string, using a single or a list of
1300         patterns returning the first matching group.
1301         In case of failure return a default value or raise a WARNING or a
1302         RegexNotFoundError, depending on fatal, specifying the field name.
1303         """
1304         if string is None:
1305             mobj = None
1306         elif isinstance(pattern, (str, re.Pattern)):
1307             mobj = re.search(pattern, string, flags)
1308         else:
1309             for p in pattern:
1310                 mobj = re.search(p, string, flags)
1311                 if mobj:
1312                     break
1313
1314         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1315
1316         if mobj:
1317             if group is None:
1318                 # return the first matching group
1319                 return next(g for g in mobj.groups() if g is not None)
1320             elif isinstance(group, (list, tuple)):
1321                 return tuple(mobj.group(g) for g in group)
1322             else:
1323                 return mobj.group(group)
1324         elif default is not NO_DEFAULT:
1325             return default
1326         elif fatal:
1327             raise RegexNotFoundError('Unable to extract %s' % _name)
1328         else:
1329             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1330             return None
1331
1332     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1333                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1334         """Searches string for the JSON object specified by start_pattern"""
1335         # NB: end_pattern is only used to reduce the size of the initial match
1336         if default is NO_DEFAULT:
1337             default, has_default = {}, False
1338         else:
1339             fatal, has_default = False, True
1340
1341         json_string = self._search_regex(
1342             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1343             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1344         if not json_string:
1345             return default
1346
1347         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1348         try:
1349             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1350         except ExtractorError as e:
1351             if fatal:
1352                 raise ExtractorError(
1353                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1354             elif not has_default:
1355                 self.report_warning(
1356                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1357         return default
1358
1359     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1360         """
1361         Like _search_regex, but strips HTML tags and unescapes entities.
1362         """
1363         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1364         if isinstance(res, tuple):
1365             return tuple(map(clean_html, res))
1366         return clean_html(res)
1367
1368     def _get_netrc_login_info(self, netrc_machine=None):
1369         netrc_machine = netrc_machine or self._NETRC_MACHINE
1370
1371         cmd = self.get_param('netrc_cmd')
1372         if cmd:
1373             cmd = cmd.replace('{}', netrc_machine)
1374             self.to_screen(f'Executing command: {cmd}')
1375             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1376             if ret != 0:
1377                 raise OSError(f'Command returned error code {ret}')
1378             info = netrc_from_content(stdout).authenticators(netrc_machine)
1379
1380         elif self.get_param('usenetrc', False):
1381             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1382             if os.path.isdir(netrc_file):
1383                 netrc_file = os.path.join(netrc_file, '.netrc')
1384             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1385
1386         else:
1387             return None, None
1388         if not info:
1389             self.to_screen(f'No authenticators for {netrc_machine}')
1390             return None, None
1391
1392         self.write_debug(f'Using netrc for {netrc_machine} authentication')
1393         return info[0], info[2]
1394
1395     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1396         """
1397         Get the login info as (username, password)
1398         First look for the manually specified credentials using username_option
1399         and password_option as keys in params dictionary. If no such credentials
1400         are available try the netrc_cmd if it is defined or look in the
1401         netrc file using the netrc_machine or _NETRC_MACHINE value.
1402         If there's no info available, return (None, None)
1403         """
1404
1405         username = self.get_param(username_option)
1406         if username is not None:
1407             password = self.get_param(password_option)
1408         else:
1409             try:
1410                 username, password = self._get_netrc_login_info(netrc_machine)
1411             except (OSError, netrc.NetrcParseError) as err:
1412                 self.report_warning(f'Failed to parse .netrc: {err}')
1413                 return None, None
1414         return username, password
1415
1416     def _get_tfa_info(self, note='two-factor verification code'):
1417         """
1418         Get the two-factor authentication info
1419         TODO - asking the user will be required for sms/phone verify
1420         currently just uses the command line option
1421         If there's no info available, return None
1422         """
1423
1424         tfa = self.get_param('twofactor')
1425         if tfa is not None:
1426             return tfa
1427
1428         return getpass.getpass('Type %s and press [Return]: ' % note)
1429
1430     # Helper functions for extracting OpenGraph info
1431     @staticmethod
1432     def _og_regexes(prop):
1433         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1434         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1435                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1436         template = r'<meta[^>]+?%s[^>]+?%s'
1437         return [
1438             template % (property_re, content_re),
1439             template % (content_re, property_re),
1440         ]
1441
1442     @staticmethod
1443     def _meta_regex(prop):
1444         return r'''(?isx)<meta
1445                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1446                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1447
1448     def _og_search_property(self, prop, html, name=None, **kargs):
1449         prop = variadic(prop)
1450         if name is None:
1451             name = 'OpenGraph %s' % prop[0]
1452         og_regexes = []
1453         for p in prop:
1454             og_regexes.extend(self._og_regexes(p))
1455         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1456         if escaped is None:
1457             return None
1458         return unescapeHTML(escaped)
1459
1460     def _og_search_thumbnail(self, html, **kargs):
1461         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1462
1463     def _og_search_description(self, html, **kargs):
1464         return self._og_search_property('description', html, fatal=False, **kargs)
1465
1466     def _og_search_title(self, html, *, fatal=False, **kargs):
1467         return self._og_search_property('title', html, fatal=fatal, **kargs)
1468
1469     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1470         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1471         if secure:
1472             regexes = self._og_regexes('video:secure_url') + regexes
1473         return self._html_search_regex(regexes, html, name, **kargs)
1474
1475     def _og_search_url(self, html, **kargs):
1476         return self._og_search_property('url', html, **kargs)
1477
1478     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1479         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1480
1481     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1482         name = variadic(name)
1483         if display_name is None:
1484             display_name = name[0]
1485         return self._html_search_regex(
1486             [self._meta_regex(n) for n in name],
1487             html, display_name, fatal=fatal, group='content', **kwargs)
1488
1489     def _dc_search_uploader(self, html):
1490         return self._html_search_meta('dc.creator', html, 'uploader')
1491
1492     @staticmethod
1493     def _rta_search(html):
1494         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1495         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1496                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1497                      html):
1498             return 18
1499
1500         # And then there are the jokers who advertise that they use RTA, but actually don't.
1501         AGE_LIMIT_MARKERS = [
1502             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1503             r'>[^<]*you acknowledge you are at least (\d+) years old',
1504             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1505         ]
1506
1507         age_limit = 0
1508         for marker in AGE_LIMIT_MARKERS:
1509             mobj = re.search(marker, html)
1510             if mobj:
1511                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1512         return age_limit
1513
1514     def _media_rating_search(self, html):
1515         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1516         rating = self._html_search_meta('rating', html)
1517
1518         if not rating:
1519             return None
1520
1521         RATING_TABLE = {
1522             'safe for kids': 0,
1523             'general': 8,
1524             '14 years': 14,
1525             'mature': 17,
1526             'restricted': 19,
1527         }
1528         return RATING_TABLE.get(rating.lower())
1529
1530     def _family_friendly_search(self, html):
1531         # See http://schema.org/VideoObject
1532         family_friendly = self._html_search_meta(
1533             'isFamilyFriendly', html, default=None)
1534
1535         if not family_friendly:
1536             return None
1537
1538         RATING_TABLE = {
1539             '1': 0,
1540             'true': 0,
1541             '0': 18,
1542             'false': 18,
1543         }
1544         return RATING_TABLE.get(family_friendly.lower())
1545
1546     def _twitter_search_player(self, html):
1547         return self._html_search_meta('twitter:player', html,
1548                                       'twitter card player')
1549
1550     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1551         """Yield all json ld objects in the html"""
1552         if default is not NO_DEFAULT:
1553             fatal = False
1554         for mobj in re.finditer(JSON_LD_RE, html):
1555             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1556             for json_ld in variadic(json_ld_item):
1557                 if isinstance(json_ld, dict):
1558                     yield json_ld
1559
1560     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1561         """Search for a video in any json ld in the html"""
1562         if default is not NO_DEFAULT:
1563             fatal = False
1564         info = self._json_ld(
1565             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1566             video_id, fatal=fatal, expected_type=expected_type)
1567         if info:
1568             return info
1569         if default is not NO_DEFAULT:
1570             return default
1571         elif fatal:
1572             raise RegexNotFoundError('Unable to extract JSON-LD')
1573         else:
1574             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1575             return {}
1576
1577     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1578         if isinstance(json_ld, str):
1579             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1580         if not json_ld:
1581             return {}
1582         info = {}
1583
1584         INTERACTION_TYPE_MAP = {
1585             'CommentAction': 'comment',
1586             'AgreeAction': 'like',
1587             'DisagreeAction': 'dislike',
1588             'LikeAction': 'like',
1589             'DislikeAction': 'dislike',
1590             'ListenAction': 'view',
1591             'WatchAction': 'view',
1592             'ViewAction': 'view',
1593         }
1594
1595         def is_type(e, *expected_types):
1596             type = variadic(traverse_obj(e, '@type'))
1597             return any(x in type for x in expected_types)
1598
1599         def extract_interaction_type(e):
1600             interaction_type = e.get('interactionType')
1601             if isinstance(interaction_type, dict):
1602                 interaction_type = interaction_type.get('@type')
1603             return str_or_none(interaction_type)
1604
1605         def extract_interaction_statistic(e):
1606             interaction_statistic = e.get('interactionStatistic')
1607             if isinstance(interaction_statistic, dict):
1608                 interaction_statistic = [interaction_statistic]
1609             if not isinstance(interaction_statistic, list):
1610                 return
1611             for is_e in interaction_statistic:
1612                 if not is_type(is_e, 'InteractionCounter'):
1613                     continue
1614                 interaction_type = extract_interaction_type(is_e)
1615                 if not interaction_type:
1616                     continue
1617                 # For interaction count some sites provide string instead of
1618                 # an integer (as per spec) with non digit characters (e.g. ",")
1619                 # so extracting count with more relaxed str_to_int
1620                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1621                 if interaction_count is None:
1622                     continue
1623                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1624                 if not count_kind:
1625                     continue
1626                 count_key = '%s_count' % count_kind
1627                 if info.get(count_key) is not None:
1628                     continue
1629                 info[count_key] = interaction_count
1630
1631         def extract_chapter_information(e):
1632             chapters = [{
1633                 'title': part.get('name'),
1634                 'start_time': part.get('startOffset'),
1635                 'end_time': part.get('endOffset'),
1636             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1637             for idx, (last_c, current_c, next_c) in enumerate(zip(
1638                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1639                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1640                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1641                 if None in current_c.values():
1642                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1643                     return
1644             if chapters:
1645                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1646                 info['chapters'] = chapters
1647
1648         def extract_video_object(e):
1649             author = e.get('author')
1650             info.update({
1651                 'url': url_or_none(e.get('contentUrl')),
1652                 'ext': mimetype2ext(e.get('encodingFormat')),
1653                 'title': unescapeHTML(e.get('name')),
1654                 'description': unescapeHTML(e.get('description')),
1655                 'thumbnails': [{'url': unescapeHTML(url)}
1656                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1657                                if url_or_none(url)],
1658                 'duration': parse_duration(e.get('duration')),
1659                 'timestamp': unified_timestamp(e.get('uploadDate')),
1660                 # author can be an instance of 'Organization' or 'Person' types.
1661                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1662                 # however some websites are using 'Text' type instead.
1663                 # 1. https://schema.org/VideoObject
1664                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1665                 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1666                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1667                 'tbr': int_or_none(e.get('bitrate')),
1668                 'width': int_or_none(e.get('width')),
1669                 'height': int_or_none(e.get('height')),
1670                 'view_count': int_or_none(e.get('interactionCount')),
1671                 'tags': try_call(lambda: e.get('keywords').split(',')),
1672             })
1673             if is_type(e, 'AudioObject'):
1674                 info.update({
1675                     'vcodec': 'none',
1676                     'abr': int_or_none(e.get('bitrate')),
1677                 })
1678             extract_interaction_statistic(e)
1679             extract_chapter_information(e)
1680
1681         def traverse_json_ld(json_ld, at_top_level=True):
1682             for e in variadic(json_ld):
1683                 if not isinstance(e, dict):
1684                     continue
1685                 if at_top_level and '@context' not in e:
1686                     continue
1687                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1688                     traverse_json_ld(e['@graph'], at_top_level=False)
1689                     continue
1690                 if expected_type is not None and not is_type(e, expected_type):
1691                     continue
1692                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1693                 if rating is not None:
1694                     info['average_rating'] = rating
1695                 if is_type(e, 'TVEpisode', 'Episode'):
1696                     episode_name = unescapeHTML(e.get('name'))
1697                     info.update({
1698                         'episode': episode_name,
1699                         'episode_number': int_or_none(e.get('episodeNumber')),
1700                         'description': unescapeHTML(e.get('description')),
1701                     })
1702                     if not info.get('title') and episode_name:
1703                         info['title'] = episode_name
1704                     part_of_season = e.get('partOfSeason')
1705                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1706                         info.update({
1707                             'season': unescapeHTML(part_of_season.get('name')),
1708                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1709                         })
1710                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1711                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1712                         info['series'] = unescapeHTML(part_of_series.get('name'))
1713                 elif is_type(e, 'Movie'):
1714                     info.update({
1715                         'title': unescapeHTML(e.get('name')),
1716                         'description': unescapeHTML(e.get('description')),
1717                         'duration': parse_duration(e.get('duration')),
1718                         'timestamp': unified_timestamp(e.get('dateCreated')),
1719                     })
1720                 elif is_type(e, 'Article', 'NewsArticle'):
1721                     info.update({
1722                         'timestamp': parse_iso8601(e.get('datePublished')),
1723                         'title': unescapeHTML(e.get('headline')),
1724                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1725                     })
1726                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1727                         extract_video_object(e['video'][0])
1728                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1729                         extract_video_object(e['subjectOf'][0])
1730                 elif is_type(e, 'VideoObject', 'AudioObject'):
1731                     extract_video_object(e)
1732                     if expected_type is None:
1733                         continue
1734                     else:
1735                         break
1736                 video = e.get('video')
1737                 if is_type(video, 'VideoObject'):
1738                     extract_video_object(video)
1739                 if expected_type is None:
1740                     continue
1741                 else:
1742                     break
1743
1744         traverse_json_ld(json_ld)
1745         return filter_dict(info)
1746
1747     def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
1748         if default == '{}':
1749             self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
1750             default = {}
1751         if default is not NO_DEFAULT:
1752             fatal = False
1753
1754         return self._search_json(
1755             r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
1756             video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
1757
1758     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1759         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1760         rectx = re.escape(context_name)
1761         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1762         js, arg_keys, arg_vals = self._search_regex(
1763             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1764             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1765             default=NO_DEFAULT if fatal else (None, None, None))
1766         if js is None:
1767             return {}
1768
1769         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1770             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1771
1772         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1773         return traverse_obj(ret, traverse) or {}
1774
1775     @staticmethod
1776     def _hidden_inputs(html):
1777         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1778         hidden_inputs = {}
1779         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1780             attrs = extract_attributes(input)
1781             if not input:
1782                 continue
1783             if attrs.get('type') not in ('hidden', 'submit'):
1784                 continue
1785             name = attrs.get('name') or attrs.get('id')
1786             value = attrs.get('value')
1787             if name and value is not None:
1788                 hidden_inputs[name] = value
1789         return hidden_inputs
1790
1791     def _form_hidden_inputs(self, form_id, html):
1792         form = self._search_regex(
1793             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1794             html, '%s form' % form_id, group='form')
1795         return self._hidden_inputs(form)
1796
1797     @classproperty(cache=True)
1798     def FormatSort(cls):
1799         class FormatSort(FormatSorter):
1800             def __init__(ie, *args, **kwargs):
1801                 super().__init__(ie._downloader, *args, **kwargs)
1802
1803         deprecation_warning(
1804             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1805             'Use yt_dlp.utils.FormatSorter instead')
1806         return FormatSort
1807
1808     def _sort_formats(self, formats, field_preference=[]):
1809         if not field_preference:
1810             self._downloader.deprecation_warning(
1811                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1812             return
1813         self._downloader.deprecation_warning(
1814             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1815             'Return _format_sort_fields in the info_dict instead')
1816         if formats:
1817             formats[0]['__sort_fields'] = field_preference
1818
1819     def _check_formats(self, formats, video_id):
1820         if formats:
1821             formats[:] = filter(
1822                 lambda f: self._is_valid_url(
1823                     f['url'], video_id,
1824                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1825                 formats)
1826
1827     @staticmethod
1828     def _remove_duplicate_formats(formats):
1829         format_urls = set()
1830         unique_formats = []
1831         for f in formats:
1832             if f['url'] not in format_urls:
1833                 format_urls.add(f['url'])
1834                 unique_formats.append(f)
1835         formats[:] = unique_formats
1836
1837     def _is_valid_url(self, url, video_id, item='video', headers={}):
1838         url = self._proto_relative_url(url, scheme='http:')
1839         # For now assume non HTTP(S) URLs always valid
1840         if not (url.startswith('http://') or url.startswith('https://')):
1841             return True
1842         try:
1843             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1844             return True
1845         except ExtractorError as e:
1846             self.to_screen(
1847                 '%s: %s URL is invalid, skipping: %s'
1848                 % (video_id, item, error_to_compat_str(e.cause)))
1849             return False
1850
1851     def http_scheme(self):
1852         """ Either "http:" or "https:", depending on the user's preferences """
1853         return (
1854             'http:'
1855             if self.get_param('prefer_insecure', False)
1856             else 'https:')
1857
1858     def _proto_relative_url(self, url, scheme=None):
1859         scheme = scheme or self.http_scheme()
1860         assert scheme.endswith(':')
1861         return sanitize_url(url, scheme=scheme[:-1])
1862
1863     def _sleep(self, timeout, video_id, msg_template=None):
1864         if msg_template is None:
1865             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1866         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1867         self.to_screen(msg)
1868         time.sleep(timeout)
1869
1870     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1871                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1872                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1873         if self.get_param('ignore_no_formats_error'):
1874             fatal = False
1875
1876         res = self._download_xml_handle(
1877             manifest_url, video_id, 'Downloading f4m manifest',
1878             'Unable to download f4m manifest',
1879             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1880             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1881             transform_source=transform_source,
1882             fatal=fatal, data=data, headers=headers, query=query)
1883         if res is False:
1884             return []
1885
1886         manifest, urlh = res
1887         manifest_url = urlh.url
1888
1889         return self._parse_f4m_formats(
1890             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1891             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1892
1893     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1894                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1895                            fatal=True, m3u8_id=None):
1896         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1897             return []
1898
1899         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1900         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1901         if akamai_pv is not None and ';' in akamai_pv.text:
1902             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1903             if playerVerificationChallenge.strip() != '':
1904                 return []
1905
1906         formats = []
1907         manifest_version = '1.0'
1908         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1909         if not media_nodes:
1910             manifest_version = '2.0'
1911             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1912         # Remove unsupported DRM protected media from final formats
1913         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1914         media_nodes = remove_encrypted_media(media_nodes)
1915         if not media_nodes:
1916             return formats
1917
1918         manifest_base_url = get_base_url(manifest)
1919
1920         bootstrap_info = xpath_element(
1921             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1922             'bootstrap info', default=None)
1923
1924         vcodec = None
1925         mime_type = xpath_text(
1926             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1927             'base URL', default=None)
1928         if mime_type and mime_type.startswith('audio/'):
1929             vcodec = 'none'
1930
1931         for i, media_el in enumerate(media_nodes):
1932             tbr = int_or_none(media_el.attrib.get('bitrate'))
1933             width = int_or_none(media_el.attrib.get('width'))
1934             height = int_or_none(media_el.attrib.get('height'))
1935             format_id = join_nonempty(f4m_id, tbr or i)
1936             # If <bootstrapInfo> is present, the specified f4m is a
1937             # stream-level manifest, and only set-level manifests may refer to
1938             # external resources.  See section 11.4 and section 4 of F4M spec
1939             if bootstrap_info is None:
1940                 media_url = None
1941                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1942                 if manifest_version == '2.0':
1943                     media_url = media_el.attrib.get('href')
1944                 if media_url is None:
1945                     media_url = media_el.attrib.get('url')
1946                 if not media_url:
1947                     continue
1948                 manifest_url = (
1949                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1950                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1951                 # If media_url is itself a f4m manifest do the recursive extraction
1952                 # since bitrates in parent manifest (this one) and media_url manifest
1953                 # may differ leading to inability to resolve the format by requested
1954                 # bitrate in f4m downloader
1955                 ext = determine_ext(manifest_url)
1956                 if ext == 'f4m':
1957                     f4m_formats = self._extract_f4m_formats(
1958                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1959                         transform_source=transform_source, fatal=fatal)
1960                     # Sometimes stream-level manifest contains single media entry that
1961                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1962                     # At the same time parent's media entry in set-level manifest may
1963                     # contain it. We will copy it from parent in such cases.
1964                     if len(f4m_formats) == 1:
1965                         f = f4m_formats[0]
1966                         f.update({
1967                             'tbr': f.get('tbr') or tbr,
1968                             'width': f.get('width') or width,
1969                             'height': f.get('height') or height,
1970                             'format_id': f.get('format_id') if not tbr else format_id,
1971                             'vcodec': vcodec,
1972                         })
1973                     formats.extend(f4m_formats)
1974                     continue
1975                 elif ext == 'm3u8':
1976                     formats.extend(self._extract_m3u8_formats(
1977                         manifest_url, video_id, 'mp4', preference=preference,
1978                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1979                     continue
1980             formats.append({
1981                 'format_id': format_id,
1982                 'url': manifest_url,
1983                 'manifest_url': manifest_url,
1984                 'ext': 'flv' if bootstrap_info is not None else None,
1985                 'protocol': 'f4m',
1986                 'tbr': tbr,
1987                 'width': width,
1988                 'height': height,
1989                 'vcodec': vcodec,
1990                 'preference': preference,
1991                 'quality': quality,
1992             })
1993         return formats
1994
1995     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1996         return {
1997             'format_id': join_nonempty(m3u8_id, 'meta'),
1998             'url': m3u8_url,
1999             'ext': ext,
2000             'protocol': 'm3u8',
2001             'preference': preference - 100 if preference else -100,
2002             'quality': quality,
2003             'resolution': 'multiple',
2004             'format_note': 'Quality selection URL',
2005         }
2006
2007     def _report_ignoring_subs(self, name):
2008         self.report_warning(bug_reports_message(
2009             f'Ignoring subtitle tracks found in the {name} manifest; '
2010             'if any subtitle tracks are missing,'
2011         ), only_once=True)
2012
2013     def _extract_m3u8_formats(self, *args, **kwargs):
2014         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2015         if subs:
2016             self._report_ignoring_subs('HLS')
2017         return fmts
2018
2019     def _extract_m3u8_formats_and_subtitles(
2020             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2021             preference=None, quality=None, m3u8_id=None, note=None,
2022             errnote=None, fatal=True, live=False, data=None, headers={},
2023             query={}):
2024
2025         if self.get_param('ignore_no_formats_error'):
2026             fatal = False
2027
2028         if not m3u8_url:
2029             if errnote is not False:
2030                 errnote = errnote or 'Failed to obtain m3u8 URL'
2031                 if fatal:
2032                     raise ExtractorError(errnote, video_id=video_id)
2033                 self.report_warning(f'{errnote}{bug_reports_message()}')
2034             return [], {}
2035
2036         res = self._download_webpage_handle(
2037             m3u8_url, video_id,
2038             note='Downloading m3u8 information' if note is None else note,
2039             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2040             fatal=fatal, data=data, headers=headers, query=query)
2041
2042         if res is False:
2043             return [], {}
2044
2045         m3u8_doc, urlh = res
2046         m3u8_url = urlh.url
2047
2048         return self._parse_m3u8_formats_and_subtitles(
2049             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2050             preference=preference, quality=quality, m3u8_id=m3u8_id,
2051             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2052             headers=headers, query=query, video_id=video_id)
2053
2054     def _parse_m3u8_formats_and_subtitles(
2055             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2056             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2057             errnote=None, fatal=True, data=None, headers={}, query={},
2058             video_id=None):
2059         formats, subtitles = [], {}
2060         has_drm = HlsFD._has_drm(m3u8_doc)
2061
2062         def format_url(url):
2063             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2064
2065         if self.get_param('hls_split_discontinuity', False):
2066             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2067                 if not m3u8_doc:
2068                     if not manifest_url:
2069                         return []
2070                     m3u8_doc = self._download_webpage(
2071                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2072                         note=False, errnote='Failed to download m3u8 playlist information')
2073                     if m3u8_doc is False:
2074                         return []
2075                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2076
2077         else:
2078             def _extract_m3u8_playlist_indices(*args, **kwargs):
2079                 return [None]
2080
2081         # References:
2082         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2083         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2084         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2085
2086         # We should try extracting formats only from master playlists [1, 4.3.4],
2087         # i.e. playlists that describe available qualities. On the other hand
2088         # media playlists [1, 4.3.3] should be returned as is since they contain
2089         # just the media without qualities renditions.
2090         # Fortunately, master playlist can be easily distinguished from media
2091         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2092         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2093         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2094         # media playlist and MUST NOT appear in master playlist thus we can
2095         # clearly detect media playlist with this criterion.
2096
2097         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2098             formats = [{
2099                 'format_id': join_nonempty(m3u8_id, idx),
2100                 'format_index': idx,
2101                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2102                 'ext': ext,
2103                 'protocol': entry_protocol,
2104                 'preference': preference,
2105                 'quality': quality,
2106                 'has_drm': has_drm,
2107             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2108
2109             return formats, subtitles
2110
2111         groups = {}
2112         last_stream_inf = {}
2113
2114         def extract_media(x_media_line):
2115             media = parse_m3u8_attributes(x_media_line)
2116             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2117             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2118             if not (media_type and group_id and name):
2119                 return
2120             groups.setdefault(group_id, []).append(media)
2121             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2122             if media_type == 'SUBTITLES':
2123                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2124                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2125                 # However, lack of URI has been spotted in the wild.
2126                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2127                 if not media.get('URI'):
2128                     return
2129                 url = format_url(media['URI'])
2130                 sub_info = {
2131                     'url': url,
2132                     'ext': determine_ext(url),
2133                 }
2134                 if sub_info['ext'] == 'm3u8':
2135                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2136                     # files may contain is WebVTT:
2137                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2138                     sub_info['ext'] = 'vtt'
2139                     sub_info['protocol'] = 'm3u8_native'
2140                 lang = media.get('LANGUAGE') or 'und'
2141                 subtitles.setdefault(lang, []).append(sub_info)
2142             if media_type not in ('VIDEO', 'AUDIO'):
2143                 return
2144             media_url = media.get('URI')
2145             if media_url:
2146                 manifest_url = format_url(media_url)
2147                 formats.extend({
2148                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2149                     'format_note': name,
2150                     'format_index': idx,
2151                     'url': manifest_url,
2152                     'manifest_url': m3u8_url,
2153                     'language': media.get('LANGUAGE'),
2154                     'ext': ext,
2155                     'protocol': entry_protocol,
2156                     'preference': preference,
2157                     'quality': quality,
2158                     'has_drm': has_drm,
2159                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2160                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2161
2162         def build_stream_name():
2163             # Despite specification does not mention NAME attribute for
2164             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2165             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2166             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2167             stream_name = last_stream_inf.get('NAME')
2168             if stream_name:
2169                 return stream_name
2170             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2171             # from corresponding rendition group
2172             stream_group_id = last_stream_inf.get('VIDEO')
2173             if not stream_group_id:
2174                 return
2175             stream_group = groups.get(stream_group_id)
2176             if not stream_group:
2177                 return stream_group_id
2178             rendition = stream_group[0]
2179             return rendition.get('NAME') or stream_group_id
2180
2181         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2182         # chance to detect video only formats when EXT-X-STREAM-INF tags
2183         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2184         for line in m3u8_doc.splitlines():
2185             if line.startswith('#EXT-X-MEDIA:'):
2186                 extract_media(line)
2187
2188         for line in m3u8_doc.splitlines():
2189             if line.startswith('#EXT-X-STREAM-INF:'):
2190                 last_stream_inf = parse_m3u8_attributes(line)
2191             elif line.startswith('#') or not line.strip():
2192                 continue
2193             else:
2194                 tbr = float_or_none(
2195                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2196                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2197                 manifest_url = format_url(line.strip())
2198
2199                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2200                     format_id = [m3u8_id, None, idx]
2201                     # Bandwidth of live streams may differ over time thus making
2202                     # format_id unpredictable. So it's better to keep provided
2203                     # format_id intact.
2204                     if not live:
2205                         stream_name = build_stream_name()
2206                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2207                     f = {
2208                         'format_id': join_nonempty(*format_id),
2209                         'format_index': idx,
2210                         'url': manifest_url,
2211                         'manifest_url': m3u8_url,
2212                         'tbr': tbr,
2213                         'ext': ext,
2214                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2215                         'protocol': entry_protocol,
2216                         'preference': preference,
2217                         'quality': quality,
2218                         'has_drm': has_drm,
2219                     }
2220                     resolution = last_stream_inf.get('RESOLUTION')
2221                     if resolution:
2222                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2223                         if mobj:
2224                             f['width'] = int(mobj.group('width'))
2225                             f['height'] = int(mobj.group('height'))
2226                     # Unified Streaming Platform
2227                     mobj = re.search(
2228                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2229                     if mobj:
2230                         abr, vbr = mobj.groups()
2231                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2232                         f.update({
2233                             'vbr': vbr,
2234                             'abr': abr,
2235                         })
2236                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2237                     f.update(codecs)
2238                     audio_group_id = last_stream_inf.get('AUDIO')
2239                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2240                     # references a rendition group MUST have a CODECS attribute.
2241                     # However, this is not always respected. E.g. [2]
2242                     # contains EXT-X-STREAM-INF tag which references AUDIO
2243                     # rendition group but does not have CODECS and despite
2244                     # referencing an audio group it represents a complete
2245                     # (with audio and video) format. So, for such cases we will
2246                     # ignore references to rendition groups and treat them
2247                     # as complete formats.
2248                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2249                         audio_group = groups.get(audio_group_id)
2250                         if audio_group and audio_group[0].get('URI'):
2251                             # TODO: update acodec for audio only formats with
2252                             # the same GROUP-ID
2253                             f['acodec'] = 'none'
2254                     if not f.get('ext'):
2255                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2256                     formats.append(f)
2257
2258                     # for DailyMotion
2259                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2260                     if progressive_uri:
2261                         http_f = f.copy()
2262                         del http_f['manifest_url']
2263                         http_f.update({
2264                             'format_id': f['format_id'].replace('hls-', 'http-'),
2265                             'protocol': 'http',
2266                             'url': progressive_uri,
2267                         })
2268                         formats.append(http_f)
2269
2270                 last_stream_inf = {}
2271         return formats, subtitles
2272
2273     def _extract_m3u8_vod_duration(
2274             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2275
2276         m3u8_vod = self._download_webpage(
2277             m3u8_vod_url, video_id,
2278             note='Downloading m3u8 VOD manifest' if note is None else note,
2279             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2280             fatal=False, data=data, headers=headers, query=query)
2281
2282         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2283
2284     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2285         if '#EXT-X-ENDLIST' not in m3u8_vod:
2286             return None
2287
2288         return int(sum(
2289             float(line[len('#EXTINF:'):].split(',')[0])
2290             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2291
2292     def _extract_mpd_vod_duration(
2293             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2294
2295         mpd_doc = self._download_xml(
2296             mpd_url, video_id,
2297             note='Downloading MPD VOD manifest' if note is None else note,
2298             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2299             fatal=False, data=data, headers=headers, query=query)
2300         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2301             return None
2302         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2303
2304     @staticmethod
2305     def _xpath_ns(path, namespace=None):
2306         if not namespace:
2307             return path
2308         out = []
2309         for c in path.split('/'):
2310             if not c or c == '.':
2311                 out.append(c)
2312             else:
2313                 out.append('{%s}%s' % (namespace, c))
2314         return '/'.join(out)
2315
2316     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2317         if self.get_param('ignore_no_formats_error'):
2318             fatal = False
2319
2320         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2321         if res is False:
2322             assert not fatal
2323             return [], {}
2324         smil, urlh = res
2325
2326         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2327                                                       namespace=self._parse_smil_namespace(smil))
2328
2329     def _extract_smil_formats(self, *args, **kwargs):
2330         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2331         if subs:
2332             self._report_ignoring_subs('SMIL')
2333         return fmts
2334
2335     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2336         res = self._download_smil(smil_url, video_id, fatal=fatal)
2337         if res is False:
2338             return {}
2339
2340         smil, urlh = res
2341         smil_url = urlh.url
2342
2343         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2344
2345     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2346         return self._download_xml_handle(
2347             smil_url, video_id, 'Downloading SMIL file',
2348             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2349
2350     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2351         namespace = self._parse_smil_namespace(smil)
2352
2353         formats, subtitles = self._parse_smil_formats_and_subtitles(
2354             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2355
2356         video_id = os.path.splitext(url_basename(smil_url))[0]
2357         title = None
2358         description = None
2359         upload_date = None
2360         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2361             name = meta.attrib.get('name')
2362             content = meta.attrib.get('content')
2363             if not name or not content:
2364                 continue
2365             if not title and name == 'title':
2366                 title = content
2367             elif not description and name in ('description', 'abstract'):
2368                 description = content
2369             elif not upload_date and name == 'date':
2370                 upload_date = unified_strdate(content)
2371
2372         thumbnails = [{
2373             'id': image.get('type'),
2374             'url': image.get('src'),
2375             'width': int_or_none(image.get('width')),
2376             'height': int_or_none(image.get('height')),
2377         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2378
2379         return {
2380             'id': video_id,
2381             'title': title or video_id,
2382             'description': description,
2383             'upload_date': upload_date,
2384             'thumbnails': thumbnails,
2385             'formats': formats,
2386             'subtitles': subtitles,
2387         }
2388
2389     def _parse_smil_namespace(self, smil):
2390         return self._search_regex(
2391             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2392
2393     def _parse_smil_formats(self, *args, **kwargs):
2394         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2395         if subs:
2396             self._report_ignoring_subs('SMIL')
2397         return fmts
2398
2399     def _parse_smil_formats_and_subtitles(
2400             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2401         base = smil_url
2402         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2403             b = meta.get('base') or meta.get('httpBase')
2404             if b:
2405                 base = b
2406                 break
2407
2408         formats, subtitles = [], {}
2409         rtmp_count = 0
2410         http_count = 0
2411         m3u8_count = 0
2412         imgs_count = 0
2413
2414         srcs = set()
2415         media = itertools.chain.from_iterable(
2416             smil.findall(self._xpath_ns(arg, namespace))
2417             for arg in ['.//video', './/audio', './/media'])
2418         for medium in media:
2419             src = medium.get('src')
2420             if not src or src in srcs:
2421                 continue
2422             srcs.add(src)
2423
2424             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2425             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2426             width = int_or_none(medium.get('width'))
2427             height = int_or_none(medium.get('height'))
2428             proto = medium.get('proto')
2429             ext = medium.get('ext')
2430             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2431                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2432             streamer = medium.get('streamer') or base
2433
2434             if proto == 'rtmp' or streamer.startswith('rtmp'):
2435                 rtmp_count += 1
2436                 formats.append({
2437                     'url': streamer,
2438                     'play_path': src,
2439                     'ext': 'flv',
2440                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2441                     'tbr': bitrate,
2442                     'filesize': filesize,
2443                     'width': width,
2444                     'height': height,
2445                 })
2446                 if transform_rtmp_url:
2447                     streamer, src = transform_rtmp_url(streamer, src)
2448                     formats[-1].update({
2449                         'url': streamer,
2450                         'play_path': src,
2451                     })
2452                 continue
2453
2454             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2455             src_url = src_url.strip()
2456
2457             if proto == 'm3u8' or src_ext == 'm3u8':
2458                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2459                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2460                 self._merge_subtitles(m3u8_subs, target=subtitles)
2461                 if len(m3u8_formats) == 1:
2462                     m3u8_count += 1
2463                     m3u8_formats[0].update({
2464                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2465                         'tbr': bitrate,
2466                         'width': width,
2467                         'height': height,
2468                     })
2469                 formats.extend(m3u8_formats)
2470             elif src_ext == 'f4m':
2471                 f4m_url = src_url
2472                 if not f4m_params:
2473                     f4m_params = {
2474                         'hdcore': '3.2.0',
2475                         'plugin': 'flowplayer-3.2.0.1',
2476                     }
2477                 f4m_url += '&' if '?' in f4m_url else '?'
2478                 f4m_url += urllib.parse.urlencode(f4m_params)
2479                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2480             elif src_ext == 'mpd':
2481                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2482                     src_url, video_id, mpd_id='dash', fatal=False)
2483                 formats.extend(mpd_formats)
2484                 self._merge_subtitles(mpd_subs, target=subtitles)
2485             elif re.search(r'\.ism/[Mm]anifest', src_url):
2486                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2487                     src_url, video_id, ism_id='mss', fatal=False)
2488                 formats.extend(ism_formats)
2489                 self._merge_subtitles(ism_subs, target=subtitles)
2490             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2491                 http_count += 1
2492                 formats.append({
2493                     'url': src_url,
2494                     'ext': ext or src_ext or 'flv',
2495                     'format_id': 'http-%d' % (bitrate or http_count),
2496                     'tbr': bitrate,
2497                     'filesize': filesize,
2498                     'width': width,
2499                     'height': height,
2500                 })
2501
2502         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2503             src = medium.get('src')
2504             if not src or src in srcs:
2505                 continue
2506             srcs.add(src)
2507
2508             imgs_count += 1
2509             formats.append({
2510                 'format_id': 'imagestream-%d' % (imgs_count),
2511                 'url': src,
2512                 'ext': mimetype2ext(medium.get('type')),
2513                 'acodec': 'none',
2514                 'vcodec': 'none',
2515                 'width': int_or_none(medium.get('width')),
2516                 'height': int_or_none(medium.get('height')),
2517                 'format_note': 'SMIL storyboards',
2518             })
2519
2520         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2521         self._merge_subtitles(smil_subs, target=subtitles)
2522
2523         return formats, subtitles
2524
2525     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2526         urls = []
2527         subtitles = {}
2528         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2529             src = textstream.get('src')
2530             if not src or src in urls:
2531                 continue
2532             urls.append(src)
2533             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2534             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2535             subtitles.setdefault(lang, []).append({
2536                 'url': src,
2537                 'ext': ext,
2538             })
2539         return subtitles
2540
2541     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2542         res = self._download_xml_handle(
2543             xspf_url, playlist_id, 'Downloading xpsf playlist',
2544             'Unable to download xspf manifest', fatal=fatal)
2545         if res is False:
2546             return []
2547
2548         xspf, urlh = res
2549         xspf_url = urlh.url
2550
2551         return self._parse_xspf(
2552             xspf, playlist_id, xspf_url=xspf_url,
2553             xspf_base_url=base_url(xspf_url))
2554
2555     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2556         NS_MAP = {
2557             'xspf': 'http://xspf.org/ns/0/',
2558             's1': 'http://static.streamone.nl/player/ns/0',
2559         }
2560
2561         entries = []
2562         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2563             title = xpath_text(
2564                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2565             description = xpath_text(
2566                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2567             thumbnail = xpath_text(
2568                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2569             duration = float_or_none(
2570                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2571
2572             formats = []
2573             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2574                 format_url = urljoin(xspf_base_url, location.text)
2575                 if not format_url:
2576                     continue
2577                 formats.append({
2578                     'url': format_url,
2579                     'manifest_url': xspf_url,
2580                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2581                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2582                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2583                 })
2584
2585             entries.append({
2586                 'id': playlist_id,
2587                 'title': title,
2588                 'description': description,
2589                 'thumbnail': thumbnail,
2590                 'duration': duration,
2591                 'formats': formats,
2592             })
2593         return entries
2594
2595     def _extract_mpd_formats(self, *args, **kwargs):
2596         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2597         if subs:
2598             self._report_ignoring_subs('DASH')
2599         return fmts
2600
2601     def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2602         periods = self._extract_mpd_periods(*args, **kwargs)
2603         return self._merge_mpd_periods(periods)
2604
2605     def _extract_mpd_periods(
2606             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2607             fatal=True, data=None, headers={}, query={}):
2608
2609         if self.get_param('ignore_no_formats_error'):
2610             fatal = False
2611
2612         res = self._download_xml_handle(
2613             mpd_url, video_id,
2614             note='Downloading MPD manifest' if note is None else note,
2615             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2616             fatal=fatal, data=data, headers=headers, query=query)
2617         if res is False:
2618             return []
2619         mpd_doc, urlh = res
2620         if mpd_doc is None:
2621             return []
2622
2623         # We could have been redirected to a new url when we retrieved our mpd file.
2624         mpd_url = urlh.url
2625         mpd_base_url = base_url(mpd_url)
2626
2627         return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2628
2629     def _parse_mpd_formats(self, *args, **kwargs):
2630         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2631         if subs:
2632             self._report_ignoring_subs('DASH')
2633         return fmts
2634
2635     def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2636         periods = self._parse_mpd_periods(*args, **kwargs)
2637         return self._merge_mpd_periods(periods)
2638
2639     def _merge_mpd_periods(self, periods):
2640         """
2641         Combine all formats and subtitles from an MPD manifest into a single list,
2642         by concatenate streams with similar formats.
2643         """
2644         formats, subtitles = {}, {}
2645         for period in periods:
2646             for f in period['formats']:
2647                 assert 'is_dash_periods' not in f, 'format already processed'
2648                 f['is_dash_periods'] = True
2649                 format_key = tuple(v for k, v in f.items() if k not in (
2650                     ('format_id', 'fragments', 'manifest_stream_number')))
2651                 if format_key not in formats:
2652                     formats[format_key] = f
2653                 elif 'fragments' in f:
2654                     formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2655
2656             if subtitles and period['subtitles']:
2657                 self.report_warning(bug_reports_message(
2658                     'Found subtitles in multiple periods in the DASH manifest; '
2659                     'if part of the subtitles are missing,'
2660                 ), only_once=True)
2661
2662             for sub_lang, sub_info in period['subtitles'].items():
2663                 subtitles.setdefault(sub_lang, []).extend(sub_info)
2664
2665         return list(formats.values()), subtitles
2666
2667     def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2668         """
2669         Parse formats from MPD manifest.
2670         References:
2671          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2672             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2673          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2674         """
2675         if not self.get_param('dynamic_mpd', True):
2676             if mpd_doc.get('type') == 'dynamic':
2677                 return [], {}
2678
2679         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2680
2681         def _add_ns(path):
2682             return self._xpath_ns(path, namespace)
2683
2684         def is_drm_protected(element):
2685             return element.find(_add_ns('ContentProtection')) is not None
2686
2687         def extract_multisegment_info(element, ms_parent_info):
2688             ms_info = ms_parent_info.copy()
2689
2690             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2691             # common attributes and elements.  We will only extract relevant
2692             # for us.
2693             def extract_common(source):
2694                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2695                 if segment_timeline is not None:
2696                     s_e = segment_timeline.findall(_add_ns('S'))
2697                     if s_e:
2698                         ms_info['total_number'] = 0
2699                         ms_info['s'] = []
2700                         for s in s_e:
2701                             r = int(s.get('r', 0))
2702                             ms_info['total_number'] += 1 + r
2703                             ms_info['s'].append({
2704                                 't': int(s.get('t', 0)),
2705                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2706                                 'd': int(s.attrib['d']),
2707                                 'r': r,
2708                             })
2709                 start_number = source.get('startNumber')
2710                 if start_number:
2711                     ms_info['start_number'] = int(start_number)
2712                 timescale = source.get('timescale')
2713                 if timescale:
2714                     ms_info['timescale'] = int(timescale)
2715                 segment_duration = source.get('duration')
2716                 if segment_duration:
2717                     ms_info['segment_duration'] = float(segment_duration)
2718
2719             def extract_Initialization(source):
2720                 initialization = source.find(_add_ns('Initialization'))
2721                 if initialization is not None:
2722                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2723
2724             segment_list = element.find(_add_ns('SegmentList'))
2725             if segment_list is not None:
2726                 extract_common(segment_list)
2727                 extract_Initialization(segment_list)
2728                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2729                 if segment_urls_e:
2730                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2731             else:
2732                 segment_template = element.find(_add_ns('SegmentTemplate'))
2733                 if segment_template is not None:
2734                     extract_common(segment_template)
2735                     media = segment_template.get('media')
2736                     if media:
2737                         ms_info['media'] = media
2738                     initialization = segment_template.get('initialization')
2739                     if initialization:
2740                         ms_info['initialization'] = initialization
2741                     else:
2742                         extract_Initialization(segment_template)
2743             return ms_info
2744
2745         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2746         stream_numbers = collections.defaultdict(int)
2747         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2748             period_entry = {
2749                 'id': period.get('id', f'period-{period_idx}'),
2750                 'formats': [],
2751                 'subtitles': collections.defaultdict(list),
2752             }
2753             period_duration = parse_duration(period.get('duration')) or mpd_duration
2754             period_ms_info = extract_multisegment_info(period, {
2755                 'start_number': 1,
2756                 'timescale': 1,
2757             })
2758             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2759                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2760                 for representation in adaptation_set.findall(_add_ns('Representation')):
2761                     representation_attrib = adaptation_set.attrib.copy()
2762                     representation_attrib.update(representation.attrib)
2763                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2764                     mime_type = representation_attrib['mimeType']
2765                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2766
2767                     codec_str = representation_attrib.get('codecs', '')
2768                     # Some kind of binary subtitle found in some youtube livestreams
2769                     if mime_type == 'application/x-rawcc':
2770                         codecs = {'scodec': codec_str}
2771                     else:
2772                         codecs = parse_codecs(codec_str)
2773                     if content_type not in ('video', 'audio', 'text'):
2774                         if mime_type == 'image/jpeg':
2775                             content_type = mime_type
2776                         elif codecs.get('vcodec', 'none') != 'none':
2777                             content_type = 'video'
2778                         elif codecs.get('acodec', 'none') != 'none':
2779                             content_type = 'audio'
2780                         elif codecs.get('scodec', 'none') != 'none':
2781                             content_type = 'text'
2782                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2783                             content_type = 'text'
2784                         else:
2785                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2786                             continue
2787
2788                     base_url = ''
2789                     for element in (representation, adaptation_set, period, mpd_doc):
2790                         base_url_e = element.find(_add_ns('BaseURL'))
2791                         if try_call(lambda: base_url_e.text) is not None:
2792                             base_url = base_url_e.text + base_url
2793                             if re.match(r'^https?://', base_url):
2794                                 break
2795                     if mpd_base_url and base_url.startswith('/'):
2796                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2797                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2798                         if not mpd_base_url.endswith('/'):
2799                             mpd_base_url += '/'
2800                         base_url = mpd_base_url + base_url
2801                     representation_id = representation_attrib.get('id')
2802                     lang = representation_attrib.get('lang')
2803                     url_el = representation.find(_add_ns('BaseURL'))
2804                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2805                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2806                     if representation_id is not None:
2807                         format_id = representation_id
2808                     else:
2809                         format_id = content_type
2810                     if mpd_id:
2811                         format_id = mpd_id + '-' + format_id
2812                     if content_type in ('video', 'audio'):
2813                         f = {
2814                             'format_id': format_id,
2815                             'manifest_url': mpd_url,
2816                             'ext': mimetype2ext(mime_type),
2817                             'width': int_or_none(representation_attrib.get('width')),
2818                             'height': int_or_none(representation_attrib.get('height')),
2819                             'tbr': float_or_none(bandwidth, 1000),
2820                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2821                             'fps': int_or_none(representation_attrib.get('frameRate')),
2822                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2823                             'format_note': 'DASH %s' % content_type,
2824                             'filesize': filesize,
2825                             'container': mimetype2ext(mime_type) + '_dash',
2826                             **codecs
2827                         }
2828                     elif content_type == 'text':
2829                         f = {
2830                             'ext': mimetype2ext(mime_type),
2831                             'manifest_url': mpd_url,
2832                             'filesize': filesize,
2833                         }
2834                     elif content_type == 'image/jpeg':
2835                         # See test case in VikiIE
2836                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2837                         f = {
2838                             'format_id': format_id,
2839                             'ext': 'mhtml',
2840                             'manifest_url': mpd_url,
2841                             'format_note': 'DASH storyboards (jpeg)',
2842                             'acodec': 'none',
2843                             'vcodec': 'none',
2844                         }
2845                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2846                         f['has_drm'] = True
2847                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2848
2849                     def prepare_template(template_name, identifiers):
2850                         tmpl = representation_ms_info[template_name]
2851                         if representation_id is not None:
2852                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2853                         # First of, % characters outside $...$ templates
2854                         # must be escaped by doubling for proper processing
2855                         # by % operator string formatting used further (see
2856                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2857                         t = ''
2858                         in_template = False
2859                         for c in tmpl:
2860                             t += c
2861                             if c == '$':
2862                                 in_template = not in_template
2863                             elif c == '%' and not in_template:
2864                                 t += c
2865                         # Next, $...$ templates are translated to their
2866                         # %(...) counterparts to be used with % operator
2867                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2868                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2869                         t.replace('$$', '$')
2870                         return t
2871
2872                     # @initialization is a regular template like @media one
2873                     # so it should be handled just the same way (see
2874                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2875                     if 'initialization' in representation_ms_info:
2876                         initialization_template = prepare_template(
2877                             'initialization',
2878                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2879                             # $Time$ shall not be included for @initialization thus
2880                             # only $Bandwidth$ remains
2881                             ('Bandwidth', ))
2882                         representation_ms_info['initialization_url'] = initialization_template % {
2883                             'Bandwidth': bandwidth,
2884                         }
2885
2886                     def location_key(location):
2887                         return 'url' if re.match(r'^https?://', location) else 'path'
2888
2889                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2890
2891                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2892                         media_location_key = location_key(media_template)
2893
2894                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2895                         # can't be used at the same time
2896                         if '%(Number' in media_template and 's' not in representation_ms_info:
2897                             segment_duration = None
2898                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2899                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2900                                 representation_ms_info['total_number'] = int(math.ceil(
2901                                     float_or_none(period_duration, segment_duration, default=0)))
2902                             representation_ms_info['fragments'] = [{
2903                                 media_location_key: media_template % {
2904                                     'Number': segment_number,
2905                                     'Bandwidth': bandwidth,
2906                                 },
2907                                 'duration': segment_duration,
2908                             } for segment_number in range(
2909                                 representation_ms_info['start_number'],
2910                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2911                         else:
2912                             # $Number*$ or $Time$ in media template with S list available
2913                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2914                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2915                             representation_ms_info['fragments'] = []
2916                             segment_time = 0
2917                             segment_d = None
2918                             segment_number = representation_ms_info['start_number']
2919
2920                             def add_segment_url():
2921                                 segment_url = media_template % {
2922                                     'Time': segment_time,
2923                                     'Bandwidth': bandwidth,
2924                                     'Number': segment_number,
2925                                 }
2926                                 representation_ms_info['fragments'].append({
2927                                     media_location_key: segment_url,
2928                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2929                                 })
2930
2931                             for num, s in enumerate(representation_ms_info['s']):
2932                                 segment_time = s.get('t') or segment_time
2933                                 segment_d = s['d']
2934                                 add_segment_url()
2935                                 segment_number += 1
2936                                 for r in range(s.get('r', 0)):
2937                                     segment_time += segment_d
2938                                     add_segment_url()
2939                                     segment_number += 1
2940                                 segment_time += segment_d
2941                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2942                         # No media template,
2943                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2944                         # or any YouTube dashsegments video
2945                         fragments = []
2946                         segment_index = 0
2947                         timescale = representation_ms_info['timescale']
2948                         for s in representation_ms_info['s']:
2949                             duration = float_or_none(s['d'], timescale)
2950                             for r in range(s.get('r', 0) + 1):
2951                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2952                                 fragments.append({
2953                                     location_key(segment_uri): segment_uri,
2954                                     'duration': duration,
2955                                 })
2956                                 segment_index += 1
2957                         representation_ms_info['fragments'] = fragments
2958                     elif 'segment_urls' in representation_ms_info:
2959                         # Segment URLs with no SegmentTimeline
2960                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2961                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2962                         fragments = []
2963                         segment_duration = float_or_none(
2964                             representation_ms_info['segment_duration'],
2965                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2966                         for segment_url in representation_ms_info['segment_urls']:
2967                             fragment = {
2968                                 location_key(segment_url): segment_url,
2969                             }
2970                             if segment_duration:
2971                                 fragment['duration'] = segment_duration
2972                             fragments.append(fragment)
2973                         representation_ms_info['fragments'] = fragments
2974                     # If there is a fragments key available then we correctly recognized fragmented media.
2975                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2976                     # assumption is not necessarily correct since we may simply have no support for
2977                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2978                     if 'fragments' in representation_ms_info:
2979                         f.update({
2980                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2981                             'url': mpd_url or base_url,
2982                             'fragment_base_url': base_url,
2983                             'fragments': [],
2984                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2985                         })
2986                         if 'initialization_url' in representation_ms_info:
2987                             initialization_url = representation_ms_info['initialization_url']
2988                             if not f.get('url'):
2989                                 f['url'] = initialization_url
2990                             f['fragments'].append({location_key(initialization_url): initialization_url})
2991                         f['fragments'].extend(representation_ms_info['fragments'])
2992                         if not period_duration:
2993                             period_duration = try_get(
2994                                 representation_ms_info,
2995                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2996                     else:
2997                         # Assuming direct URL to unfragmented media.
2998                         f['url'] = base_url
2999                     if content_type in ('video', 'audio', 'image/jpeg'):
3000                         f['manifest_stream_number'] = stream_numbers[f['url']]
3001                         stream_numbers[f['url']] += 1
3002                         period_entry['formats'].append(f)
3003                     elif content_type == 'text':
3004                         period_entry['subtitles'][lang or 'und'].append(f)
3005             yield period_entry
3006
3007     def _extract_ism_formats(self, *args, **kwargs):
3008         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3009         if subs:
3010             self._report_ignoring_subs('ISM')
3011         return fmts
3012
3013     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3014         if self.get_param('ignore_no_formats_error'):
3015             fatal = False
3016
3017         res = self._download_xml_handle(
3018             ism_url, video_id,
3019             note='Downloading ISM manifest' if note is None else note,
3020             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3021             fatal=fatal, data=data, headers=headers, query=query)
3022         if res is False:
3023             return [], {}
3024         ism_doc, urlh = res
3025         if ism_doc is None:
3026             return [], {}
3027
3028         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
3029
3030     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3031         """
3032         Parse formats from ISM manifest.
3033         References:
3034          1. [MS-SSTR]: Smooth Streaming Protocol,
3035             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3036         """
3037         if ism_doc.get('IsLive') == 'TRUE':
3038             return [], {}
3039
3040         duration = int(ism_doc.attrib['Duration'])
3041         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3042
3043         formats = []
3044         subtitles = {}
3045         for stream in ism_doc.findall('StreamIndex'):
3046             stream_type = stream.get('Type')
3047             if stream_type not in ('video', 'audio', 'text'):
3048                 continue
3049             url_pattern = stream.attrib['Url']
3050             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3051             stream_name = stream.get('Name')
3052             stream_language = stream.get('Language', 'und')
3053             for track in stream.findall('QualityLevel'):
3054                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3055                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3056                 # TODO: add support for WVC1 and WMAP
3057                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3058                     self.report_warning('%s is not a supported codec' % fourcc)
3059                     continue
3060                 tbr = int(track.attrib['Bitrate']) // 1000
3061                 # [1] does not mention Width and Height attributes. However,
3062                 # they're often present while MaxWidth and MaxHeight are
3063                 # missing, so should be used as fallbacks
3064                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3065                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3066                 sampling_rate = int_or_none(track.get('SamplingRate'))
3067
3068                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3069                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3070
3071                 fragments = []
3072                 fragment_ctx = {
3073                     'time': 0,
3074                 }
3075                 stream_fragments = stream.findall('c')
3076                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3077                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3078                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3079                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3080                     if not fragment_ctx['duration']:
3081                         try:
3082                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3083                         except IndexError:
3084                             next_fragment_time = duration
3085                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3086                     for _ in range(fragment_repeat):
3087                         fragments.append({
3088                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3089                             'duration': fragment_ctx['duration'] / stream_timescale,
3090                         })
3091                         fragment_ctx['time'] += fragment_ctx['duration']
3092
3093                 if stream_type == 'text':
3094                     subtitles.setdefault(stream_language, []).append({
3095                         'ext': 'ismt',
3096                         'protocol': 'ism',
3097                         'url': ism_url,
3098                         'manifest_url': ism_url,
3099                         'fragments': fragments,
3100                         '_download_params': {
3101                             'stream_type': stream_type,
3102                             'duration': duration,
3103                             'timescale': stream_timescale,
3104                             'fourcc': fourcc,
3105                             'language': stream_language,
3106                             'codec_private_data': track.get('CodecPrivateData'),
3107                         }
3108                     })
3109                 elif stream_type in ('video', 'audio'):
3110                     formats.append({
3111                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3112                         'url': ism_url,
3113                         'manifest_url': ism_url,
3114                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3115                         'width': width,
3116                         'height': height,
3117                         'tbr': tbr,
3118                         'asr': sampling_rate,
3119                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3120                         'acodec': 'none' if stream_type == 'video' else fourcc,
3121                         'protocol': 'ism',
3122                         'fragments': fragments,
3123                         'has_drm': ism_doc.find('Protection') is not None,
3124                         'language': stream_language,
3125                         'audio_channels': int_or_none(track.get('Channels')),
3126                         '_download_params': {
3127                             'stream_type': stream_type,
3128                             'duration': duration,
3129                             'timescale': stream_timescale,
3130                             'width': width or 0,
3131                             'height': height or 0,
3132                             'fourcc': fourcc,
3133                             'language': stream_language,
3134                             'codec_private_data': track.get('CodecPrivateData'),
3135                             'sampling_rate': sampling_rate,
3136                             'channels': int_or_none(track.get('Channels', 2)),
3137                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3138                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3139                         },
3140                     })
3141         return formats, subtitles
3142
3143     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3144         def absolute_url(item_url):
3145             return urljoin(base_url, item_url)
3146
3147         def parse_content_type(content_type):
3148             if not content_type:
3149                 return {}
3150             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3151             if ctr:
3152                 mimetype, codecs = ctr.groups()
3153                 f = parse_codecs(codecs)
3154                 f['ext'] = mimetype2ext(mimetype)
3155                 return f
3156             return {}
3157
3158         def _media_formats(src, cur_media_type, type_info=None):
3159             type_info = type_info or {}
3160             full_url = absolute_url(src)
3161             ext = type_info.get('ext') or determine_ext(full_url)
3162             if ext == 'm3u8':
3163                 is_plain_url = False
3164                 formats = self._extract_m3u8_formats(
3165                     full_url, video_id, ext='mp4',
3166                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3167                     preference=preference, quality=quality, fatal=False)
3168             elif ext == 'mpd':
3169                 is_plain_url = False
3170                 formats = self._extract_mpd_formats(
3171                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3172             else:
3173                 is_plain_url = True
3174                 formats = [{
3175                     'url': full_url,
3176                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3177                     'ext': ext,
3178                 }]
3179             return is_plain_url, formats
3180
3181         entries = []
3182         # amp-video and amp-audio are very similar to their HTML5 counterparts
3183         # so we will include them right here (see
3184         # https://www.ampproject.org/docs/reference/components/amp-video)
3185         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3186         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3187         media_tags = [(media_tag, media_tag_name, media_type, '')
3188                       for media_tag, media_tag_name, media_type
3189                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3190         media_tags.extend(re.findall(
3191             # We only allow video|audio followed by a whitespace or '>'.
3192             # Allowing more characters may end up in significant slow down (see
3193             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3194             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3195             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3196         for media_tag, _, media_type, media_content in media_tags:
3197             media_info = {
3198                 'formats': [],
3199                 'subtitles': {},
3200             }
3201             media_attributes = extract_attributes(media_tag)
3202             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3203             if src:
3204                 f = parse_content_type(media_attributes.get('type'))
3205                 _, formats = _media_formats(src, media_type, f)
3206                 media_info['formats'].extend(formats)
3207             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3208             if media_content:
3209                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3210                     s_attr = extract_attributes(source_tag)
3211                     # data-video-src and data-src are non standard but seen
3212                     # several times in the wild
3213                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3214                     if not src:
3215                         continue
3216                     f = parse_content_type(s_attr.get('type'))
3217                     is_plain_url, formats = _media_formats(src, media_type, f)
3218                     if is_plain_url:
3219                         # width, height, res, label and title attributes are
3220                         # all not standard but seen several times in the wild
3221                         labels = [
3222                             s_attr.get(lbl)
3223                             for lbl in ('label', 'title')
3224                             if str_or_none(s_attr.get(lbl))
3225                         ]
3226                         width = int_or_none(s_attr.get('width'))
3227                         height = (int_or_none(s_attr.get('height'))
3228                                   or int_or_none(s_attr.get('res')))
3229                         if not width or not height:
3230                             for lbl in labels:
3231                                 resolution = parse_resolution(lbl)
3232                                 if not resolution:
3233                                     continue
3234                                 width = width or resolution.get('width')
3235                                 height = height or resolution.get('height')
3236                         for lbl in labels:
3237                             tbr = parse_bitrate(lbl)
3238                             if tbr:
3239                                 break
3240                         else:
3241                             tbr = None
3242                         f.update({
3243                             'width': width,
3244                             'height': height,
3245                             'tbr': tbr,
3246                             'format_id': s_attr.get('label') or s_attr.get('title'),
3247                         })
3248                         f.update(formats[0])
3249                         media_info['formats'].append(f)
3250                     else:
3251                         media_info['formats'].extend(formats)
3252                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3253                     track_attributes = extract_attributes(track_tag)
3254                     kind = track_attributes.get('kind')
3255                     if not kind or kind in ('subtitles', 'captions'):
3256                         src = strip_or_none(track_attributes.get('src'))
3257                         if not src:
3258                             continue
3259                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3260                         media_info['subtitles'].setdefault(lang, []).append({
3261                             'url': absolute_url(src),
3262                         })
3263             for f in media_info['formats']:
3264                 f.setdefault('http_headers', {})['Referer'] = base_url
3265             if media_info['formats'] or media_info['subtitles']:
3266                 entries.append(media_info)
3267         return entries
3268
3269     def _extract_akamai_formats(self, *args, **kwargs):
3270         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3271         if subs:
3272             self._report_ignoring_subs('akamai')
3273         return fmts
3274
3275     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3276         signed = 'hdnea=' in manifest_url
3277         if not signed:
3278             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3279             manifest_url = re.sub(
3280                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3281                 '', manifest_url).strip('?')
3282
3283         formats = []
3284         subtitles = {}
3285
3286         hdcore_sign = 'hdcore=3.7.0'
3287         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3288         hds_host = hosts.get('hds')
3289         if hds_host:
3290             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3291         if 'hdcore=' not in f4m_url:
3292             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3293         f4m_formats = self._extract_f4m_formats(
3294             f4m_url, video_id, f4m_id='hds', fatal=False)
3295         for entry in f4m_formats:
3296             entry.update({'extra_param_to_segment_url': hdcore_sign})
3297         formats.extend(f4m_formats)
3298
3299         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3300         hls_host = hosts.get('hls')
3301         if hls_host:
3302             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3303         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3304             m3u8_url, video_id, 'mp4', 'm3u8_native',
3305             m3u8_id='hls', fatal=False)
3306         formats.extend(m3u8_formats)
3307         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3308
3309         http_host = hosts.get('http')
3310         if http_host and m3u8_formats and not signed:
3311             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3312             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3313             qualities_length = len(qualities)
3314             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3315                 i = 0
3316                 for f in m3u8_formats:
3317                     if f['vcodec'] != 'none':
3318                         for protocol in ('http', 'https'):
3319                             http_f = f.copy()
3320                             del http_f['manifest_url']
3321                             http_url = re.sub(
3322                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3323                             http_f.update({
3324                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3325                                 'url': http_url,
3326                                 'protocol': protocol,
3327                             })
3328                             formats.append(http_f)
3329                         i += 1
3330
3331         return formats, subtitles
3332
3333     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3334         query = urllib.parse.urlparse(url).query
3335         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3336         mobj = re.search(
3337             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3338         url_base = mobj.group('url')
3339         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3340         formats = []
3341
3342         def manifest_url(manifest):
3343             m_url = f'{http_base_url}/{manifest}'
3344             if query:
3345                 m_url += '?%s' % query
3346             return m_url
3347
3348         if 'm3u8' not in skip_protocols:
3349             formats.extend(self._extract_m3u8_formats(
3350                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3351                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3352         if 'f4m' not in skip_protocols:
3353             formats.extend(self._extract_f4m_formats(
3354                 manifest_url('manifest.f4m'),
3355                 video_id, f4m_id='hds', fatal=False))
3356         if 'dash' not in skip_protocols:
3357             formats.extend(self._extract_mpd_formats(
3358                 manifest_url('manifest.mpd'),
3359                 video_id, mpd_id='dash', fatal=False))
3360         if re.search(r'(?:/smil:|\.smil)', url_base):
3361             if 'smil' not in skip_protocols:
3362                 rtmp_formats = self._extract_smil_formats(
3363                     manifest_url('jwplayer.smil'),
3364                     video_id, fatal=False)
3365                 for rtmp_format in rtmp_formats:
3366                     rtsp_format = rtmp_format.copy()
3367                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3368                     del rtsp_format['play_path']
3369                     del rtsp_format['ext']
3370                     rtsp_format.update({
3371                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3372                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3373                         'protocol': 'rtsp',
3374                     })
3375                     formats.extend([rtmp_format, rtsp_format])
3376         else:
3377             for protocol in ('rtmp', 'rtsp'):
3378                 if protocol not in skip_protocols:
3379                     formats.append({
3380                         'url': f'{protocol}:{url_base}',
3381                         'format_id': protocol,
3382                         'protocol': protocol,
3383                     })
3384         return formats
3385
3386     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3387         mobj = re.search(
3388             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3389             webpage)
3390         if mobj:
3391             try:
3392                 jwplayer_data = self._parse_json(mobj.group('options'),
3393                                                  video_id=video_id,
3394                                                  transform_source=transform_source)
3395             except ExtractorError:
3396                 pass
3397             else:
3398                 if isinstance(jwplayer_data, dict):
3399                     return jwplayer_data
3400
3401     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3402         jwplayer_data = self._find_jwplayer_data(
3403             webpage, video_id, transform_source=js_to_json)
3404         return self._parse_jwplayer_data(
3405             jwplayer_data, video_id, *args, **kwargs)
3406
3407     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3408                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3409         entries = []
3410         if not isinstance(jwplayer_data, dict):
3411             return entries
3412
3413         playlist_items = jwplayer_data.get('playlist')
3414         # JWPlayer backward compatibility: single playlist item/flattened playlists
3415         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3416         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3417         if not isinstance(playlist_items, list):
3418             playlist_items = (playlist_items or jwplayer_data, )
3419
3420         for video_data in playlist_items:
3421             if not isinstance(video_data, dict):
3422                 continue
3423             # JWPlayer backward compatibility: flattened sources
3424             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3425             if 'sources' not in video_data:
3426                 video_data['sources'] = [video_data]
3427
3428             this_video_id = video_id or video_data['mediaid']
3429
3430             formats = self._parse_jwplayer_formats(
3431                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3432                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3433
3434             subtitles = {}
3435             tracks = video_data.get('tracks')
3436             if tracks and isinstance(tracks, list):
3437                 for track in tracks:
3438                     if not isinstance(track, dict):
3439                         continue
3440                     track_kind = track.get('kind')
3441                     if not track_kind or not isinstance(track_kind, str):
3442                         continue
3443                     if track_kind.lower() not in ('captions', 'subtitles'):
3444                         continue
3445                     track_url = urljoin(base_url, track.get('file'))
3446                     if not track_url:
3447                         continue
3448                     subtitles.setdefault(track.get('label') or 'en', []).append({
3449                         'url': self._proto_relative_url(track_url)
3450                     })
3451
3452             entry = {
3453                 'id': this_video_id,
3454                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3455                 'description': clean_html(video_data.get('description')),
3456                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3457                 'timestamp': int_or_none(video_data.get('pubdate')),
3458                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3459                 'subtitles': subtitles,
3460                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3461                 'genre': clean_html(video_data.get('genre')),
3462                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3463                 'season_number': int_or_none(video_data.get('season')),
3464                 'episode_number': int_or_none(video_data.get('episode')),
3465                 'release_year': int_or_none(video_data.get('releasedate')),
3466                 'age_limit': int_or_none(video_data.get('age_restriction')),
3467             }
3468             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3469             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3470                 entry.update({
3471                     '_type': 'url_transparent',
3472                     'url': formats[0]['url'],
3473                 })
3474             else:
3475                 entry['formats'] = formats
3476             entries.append(entry)
3477         if len(entries) == 1:
3478             return entries[0]
3479         else:
3480             return self.playlist_result(entries)
3481
3482     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3483                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3484         urls = set()
3485         formats = []
3486         for source in jwplayer_sources_data:
3487             if not isinstance(source, dict):
3488                 continue
3489             source_url = urljoin(
3490                 base_url, self._proto_relative_url(source.get('file')))
3491             if not source_url or source_url in urls:
3492                 continue
3493             urls.add(source_url)
3494             source_type = source.get('type') or ''
3495             ext = mimetype2ext(source_type) or determine_ext(source_url)
3496             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3497                 formats.extend(self._extract_m3u8_formats(
3498                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3499                     m3u8_id=m3u8_id, fatal=False))
3500             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3501                 formats.extend(self._extract_mpd_formats(
3502                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3503             elif ext == 'smil':
3504                 formats.extend(self._extract_smil_formats(
3505                     source_url, video_id, fatal=False))
3506             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3507             elif source_type.startswith('audio') or ext in (
3508                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3509                 formats.append({
3510                     'url': source_url,
3511                     'vcodec': 'none',
3512                     'ext': ext,
3513                 })
3514             else:
3515                 format_id = str_or_none(source.get('label'))
3516                 height = int_or_none(source.get('height'))
3517                 if height is None and format_id:
3518                     # Often no height is provided but there is a label in
3519                     # format like "1080p", "720p SD", or 1080.
3520                     height = parse_resolution(format_id).get('height')
3521                 a_format = {
3522                     'url': source_url,
3523                     'width': int_or_none(source.get('width')),
3524                     'height': height,
3525                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3526                     'filesize': int_or_none(source.get('filesize')),
3527                     'ext': ext,
3528                     'format_id': format_id
3529                 }
3530                 if source_url.startswith('rtmp'):
3531                     a_format['ext'] = 'flv'
3532                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3533                     # of jwplayer.flash.swf
3534                     rtmp_url_parts = re.split(
3535                         r'((?:mp4|mp3|flv):)', source_url, 1)
3536                     if len(rtmp_url_parts) == 3:
3537                         rtmp_url, prefix, play_path = rtmp_url_parts
3538                         a_format.update({
3539                             'url': rtmp_url,
3540                             'play_path': prefix + play_path,
3541                         })
3542                     if rtmp_params:
3543                         a_format.update(rtmp_params)
3544                 formats.append(a_format)
3545         return formats
3546
3547     def _live_title(self, name):
3548         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3549         return name
3550
3551     def _int(self, v, name, fatal=False, **kwargs):
3552         res = int_or_none(v, **kwargs)
3553         if res is None:
3554             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3555             if fatal:
3556                 raise ExtractorError(msg)
3557             else:
3558                 self.report_warning(msg)
3559         return res
3560
3561     def _float(self, v, name, fatal=False, **kwargs):
3562         res = float_or_none(v, **kwargs)
3563         if res is None:
3564             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3565             if fatal:
3566                 raise ExtractorError(msg)
3567             else:
3568                 self.report_warning(msg)
3569         return res
3570
3571     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3572                     path='/', secure=False, discard=False, rest={}, **kwargs):
3573         cookie = http.cookiejar.Cookie(
3574             0, name, value, port, port is not None, domain, True,
3575             domain.startswith('.'), path, True, secure, expire_time,
3576             discard, None, None, rest)
3577         self.cookiejar.set_cookie(cookie)
3578
3579     def _get_cookies(self, url):
3580         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3581         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3582
3583     def _apply_first_set_cookie_header(self, url_handle, cookie):
3584         """
3585         Apply first Set-Cookie header instead of the last. Experimental.
3586
3587         Some sites (e.g. [1-3]) may serve two cookies under the same name
3588         in Set-Cookie header and expect the first (old) one to be set rather
3589         than second (new). However, as of RFC6265 the newer one cookie
3590         should be set into cookie store what actually happens.
3591         We will workaround this issue by resetting the cookie to
3592         the first one manually.
3593         1. https://new.vk.com/
3594         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3595         3. https://learning.oreilly.com/
3596         """
3597         for header, cookies in url_handle.headers.items():
3598             if header.lower() != 'set-cookie':
3599                 continue
3600             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3601             cookie_value = re.search(
3602                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3603             if cookie_value:
3604                 value, domain = cookie_value.groups()
3605                 self._set_cookie(domain, cookie, value)
3606                 break
3607
3608     @classmethod
3609     def get_testcases(cls, include_onlymatching=False):
3610         # Do not look in super classes
3611         t = vars(cls).get('_TEST')
3612         if t:
3613             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3614             tests = [t]
3615         else:
3616             tests = vars(cls).get('_TESTS', [])
3617         for t in tests:
3618             if not include_onlymatching and t.get('only_matching', False):
3619                 continue
3620             t['name'] = cls.ie_key()
3621             yield t
3622         if getattr(cls, '__wrapped__', None):
3623             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3624
3625     @classmethod
3626     def get_webpage_testcases(cls):
3627         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3628         for t in tests:
3629             t['name'] = cls.ie_key()
3630             yield t
3631         if getattr(cls, '__wrapped__', None):
3632             yield from cls.__wrapped__.get_webpage_testcases()
3633
3634     @classproperty(cache=True)
3635     def age_limit(cls):
3636         """Get age limit from the testcases"""
3637         return max(traverse_obj(
3638             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3639             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3640
3641     @classproperty(cache=True)
3642     def _RETURN_TYPE(cls):
3643         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3644         tests = tuple(cls.get_testcases(include_onlymatching=False))
3645         if not tests:
3646             return None
3647         elif not any(k.startswith('playlist') for test in tests for k in test):
3648             return 'video'
3649         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3650             return 'playlist'
3651         return 'any'
3652
3653     @classmethod
3654     def is_single_video(cls, url):
3655         """Returns whether the URL is of a single video, None if unknown"""
3656         if cls.suitable(url):
3657             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3658
3659     @classmethod
3660     def is_suitable(cls, age_limit):
3661         """Test whether the extractor is generally suitable for the given age limit"""
3662         return not age_restricted(cls.age_limit, age_limit)
3663
3664     @classmethod
3665     def description(cls, *, markdown=True, search_examples=None):
3666         """Description of the extractor"""
3667         desc = ''
3668         if cls._NETRC_MACHINE:
3669             if markdown:
3670                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3671             else:
3672                 desc += f' [{cls._NETRC_MACHINE}]'
3673         if cls.IE_DESC is False:
3674             desc += ' [HIDDEN]'
3675         elif cls.IE_DESC:
3676             desc += f' {cls.IE_DESC}'
3677         if cls.SEARCH_KEY:
3678             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3679             if search_examples:
3680                 _COUNTS = ('', '5', '10', 'all')
3681                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3682         if not cls.working():
3683             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3684
3685         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3686         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3687         return f'{name}:{desc}' if desc else name
3688
3689     def extract_subtitles(self, *args, **kwargs):
3690         if (self.get_param('writesubtitles', False)
3691                 or self.get_param('listsubtitles')):
3692             return self._get_subtitles(*args, **kwargs)
3693         return {}
3694
3695     def _get_subtitles(self, *args, **kwargs):
3696         raise NotImplementedError('This method must be implemented by subclasses')
3697
3698     class CommentsDisabled(Exception):
3699         """Raise in _get_comments if comments are disabled for the video"""
3700
3701     def extract_comments(self, *args, **kwargs):
3702         if not self.get_param('getcomments'):
3703             return None
3704         generator = self._get_comments(*args, **kwargs)
3705
3706         def extractor():
3707             comments = []
3708             interrupted = True
3709             try:
3710                 while True:
3711                     comments.append(next(generator))
3712             except StopIteration:
3713                 interrupted = False
3714             except KeyboardInterrupt:
3715                 self.to_screen('Interrupted by user')
3716             except self.CommentsDisabled:
3717                 return {'comments': None, 'comment_count': None}
3718             except Exception as e:
3719                 if self.get_param('ignoreerrors') is not True:
3720                     raise
3721                 self._downloader.report_error(e)
3722             comment_count = len(comments)
3723             self.to_screen(f'Extracted {comment_count} comments')
3724             return {
3725                 'comments': comments,
3726                 'comment_count': None if interrupted else comment_count
3727             }
3728         return extractor
3729
3730     def _get_comments(self, *args, **kwargs):
3731         raise NotImplementedError('This method must be implemented by subclasses')
3732
3733     @staticmethod
3734     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3735         """ Merge subtitle items for one language. Items with duplicated URLs/data
3736         will be dropped. """
3737         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3738         ret = list(subtitle_list1)
3739         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3740         return ret
3741
3742     @classmethod
3743     def _merge_subtitles(cls, *dicts, target=None):
3744         """ Merge subtitle dictionaries, language by language. """
3745         if target is None:
3746             target = {}
3747         for d in dicts:
3748             for lang, subs in d.items():
3749                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3750         return target
3751
3752     def extract_automatic_captions(self, *args, **kwargs):
3753         if (self.get_param('writeautomaticsub', False)
3754                 or self.get_param('listsubtitles')):
3755             return self._get_automatic_captions(*args, **kwargs)
3756         return {}
3757
3758     def _get_automatic_captions(self, *args, **kwargs):
3759         raise NotImplementedError('This method must be implemented by subclasses')
3760
3761     @functools.cached_property
3762     def _cookies_passed(self):
3763         """Whether cookies have been passed to YoutubeDL"""
3764         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3765
3766     def mark_watched(self, *args, **kwargs):
3767         if not self.get_param('mark_watched', False):
3768             return
3769         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3770             self._mark_watched(*args, **kwargs)
3771
3772     def _mark_watched(self, *args, **kwargs):
3773         raise NotImplementedError('This method must be implemented by subclasses')
3774
3775     def geo_verification_headers(self):
3776         headers = {}
3777         geo_verification_proxy = self.get_param('geo_verification_proxy')
3778         if geo_verification_proxy:
3779             headers['Ytdl-request-proxy'] = geo_verification_proxy
3780         return headers
3781
3782     @staticmethod
3783     def _generic_id(url):
3784         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3785
3786     def _generic_title(self, url='', webpage='', *, default=None):
3787         return (self._og_search_title(webpage, default=None)
3788                 or self._html_extract_title(webpage, default=None)
3789                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3790                 or default)
3791
3792     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3793         if not duration:
3794             return
3795         chapter_list = [{
3796             'start_time': start_function(chapter),
3797             'title': title_function(chapter),
3798         } for chapter in chapter_list or []]
3799         if strict:
3800             warn = self.report_warning
3801         else:
3802             warn = self.write_debug
3803             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3804
3805         chapters = [{'start_time': 0}]
3806         for idx, chapter in enumerate(chapter_list):
3807             if chapter['start_time'] is None:
3808                 warn(f'Incomplete chapter {idx}')
3809             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3810                 chapters.append(chapter)
3811             elif chapter not in chapters:
3812                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3813                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3814                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3815         return chapters[1:]
3816
3817     def _extract_chapters_from_description(self, description, duration):
3818         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3819         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3820         return self._extract_chapters_helper(
3821             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3822             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3823             duration=duration, strict=False) or self._extract_chapters_helper(
3824             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3825             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3826             duration=duration, strict=False)
3827
3828     @staticmethod
3829     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3830         all_known = all(map(
3831             lambda x: x is not None,
3832             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3833         return (
3834             'private' if is_private
3835             else 'premium_only' if needs_premium
3836             else 'subscriber_only' if needs_subscription
3837             else 'needs_auth' if needs_auth
3838             else 'unlisted' if is_unlisted
3839             else 'public' if all_known
3840             else None)
3841
3842     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3843         '''
3844         @returns            A list of values for the extractor argument given by "key"
3845                             or "default" if no such key is present
3846         @param default      The default value to return when the key is not present (default: [])
3847         @param casesense    When false, the values are converted to lower case
3848         '''
3849         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3850         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3851         if val is None:
3852             return [] if default is NO_DEFAULT else default
3853         return list(val) if casesense else [x.lower() for x in val]
3854
3855     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3856         if not playlist_id or not video_id:
3857             return not video_id
3858
3859         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3860         if no_playlist is not None:
3861             return not no_playlist
3862
3863         video_id = '' if video_id is True else f' {video_id}'
3864         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3865         if self.get_param('noplaylist'):
3866             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3867             return False
3868         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3869         return True
3870
3871     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3872         RetryManager.report_retry(
3873             err, _count or int(fatal), _retries,
3874             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3875             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3876
3877     def RetryManager(self, **kwargs):
3878         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3879
3880     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3881         display_id = traverse_obj(info_dict, 'display_id', 'id')
3882         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3883         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3884             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3885
3886     @classmethod
3887     def extract_from_webpage(cls, ydl, url, webpage):
3888         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3889               else ydl.get_info_extractor(cls.ie_key()))
3890         for info in ie._extract_from_webpage(url, webpage) or []:
3891             # url = None since we do not want to set (webpage/original)_url
3892             ydl.add_default_extra_info(info, ie, None)
3893             yield info
3894
3895     @classmethod
3896     def _extract_from_webpage(cls, url, webpage):
3897         for embed_url in orderedSet(
3898                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3899             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3900
3901     @classmethod
3902     def _extract_embed_urls(cls, url, webpage):
3903         """@returns all the embed urls on the webpage"""
3904         if '_EMBED_URL_RE' not in cls.__dict__:
3905             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3906             for idx, regex in enumerate(cls._EMBED_REGEX):
3907                 assert regex.count('(?P<url>') == 1, \
3908                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3909             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3910
3911         for regex in cls._EMBED_URL_RE:
3912             for mobj in regex.finditer(webpage):
3913                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3914                 if cls._VALID_URL is False or cls.suitable(embed_url):
3915                     yield embed_url
3916
3917     class StopExtraction(Exception):
3918         pass
3919
3920     @classmethod
3921     def _extract_url(cls, webpage):  # TODO: Remove
3922         """Only for compatibility with some older extractors"""
3923         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3924
3925     @classmethod
3926     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3927         if plugin_name:
3928             mro = inspect.getmro(cls)
3929             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3930             cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3931             cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3932             while getattr(super_class, '__wrapped__', None):
3933                 super_class = super_class.__wrapped__
3934             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3935             _PLUGIN_OVERRIDES[super_class].append(cls)
3936
3937         return super().__init_subclass__(**kwargs)
3938
3939
3940 class SearchInfoExtractor(InfoExtractor):
3941     """
3942     Base class for paged search queries extractors.
3943     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3944     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3945     """
3946
3947     _MAX_RESULTS = float('inf')
3948     _RETURN_TYPE = 'playlist'
3949
3950     @classproperty
3951     def _VALID_URL(cls):
3952         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3953
3954     def _real_extract(self, query):
3955         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3956         if prefix == '':
3957             return self._get_n_results(query, 1)
3958         elif prefix == 'all':
3959             return self._get_n_results(query, self._MAX_RESULTS)
3960         else:
3961             n = int(prefix)
3962             if n <= 0:
3963                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3964             elif n > self._MAX_RESULTS:
3965                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3966                 n = self._MAX_RESULTS
3967             return self._get_n_results(query, n)
3968
3969     def _get_n_results(self, query, n):
3970         """Get a specified number of results for a query.
3971         Either this function or _search_results must be overridden by subclasses """
3972         return self.playlist_result(
3973             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3974             query, query)
3975
3976     def _search_results(self, query):
3977         """Returns an iterator of search results"""
3978         raise NotImplementedError('This method must be implemented by subclasses')
3979
3980     @classproperty
3981     def SEARCH_KEY(cls):
3982         return cls._SEARCH_KEY
3983
3984
3985 class UnsupportedURLIE(InfoExtractor):
3986     _VALID_URL = '.*'
3987     _ENABLED = False
3988     IE_DESC = False
3989
3990     def _real_extract(self, url):
3991         raise UnsupportedError(url)
3992
3993
3994 _PLUGIN_OVERRIDES = collections.defaultdict(list)