import base64
import collections
import getpass
import hashlib
import http.client
import http.cookiejar
import http.cookies
import inspect
import itertools
import json
import math
import netrc
import os
import random
import re
import subprocess
import sys
import time
import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_os_name,
    urllib_req_to_req,
)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..downloader.hls import HlsFD
from ..networking import HEADRequest, Request
from ..networking.exceptions import (
    HTTPError,
    IncompleteRead,
    network_exceptions,
)
from ..utils import (
    IDENTITY,
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    GeoUtils,
    LenientJSONDecoder,
    Popen,
    RegexNotFoundError,
    RetryManager,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    deprecation_warning,
    determine_ext,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    netrc_from_content,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    smuggle_url,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    truncate_string,
    try_call,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    url_basename,
    url_or_none,
    urlhandle_detect_ext,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * request_data  Data to send in POST request to the URL
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by the client. Otherwise, both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    True if the format has DRM and cannot be downloaded.
                                 'maybe' if the format may have DRM and has to be tested before download.
                    * extra_param_to_segment_url  A query string to append to each
                                 fragment's URL, or to update each existing query string
                                 with. Only applied by the native HLS/DASH downloaders.
                    * hls_aes    A dictionary of HLS AES-128 decryption information
                                 used by the native HLS downloader to override the
                                 values in the media playlist when an '#EXT-X-KEY' tag
                                 is present in the playlist:
                                 * uri  The URI from which the key will be downloaded
                                 * key  The key (as hex) used to decrypt fragments.
                                        If `key` is given, any key URI will be ignored
                                 * iv   The IV (as hex) used to decrypt fragments
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args      Extra arguments for ffmpeg downloader
                    * is_dash_periods  Whether the format is a result of merging
                                 multiple DASH periods.
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creators:       List of creators of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    release_year:   Year (YYYY) as integer when the video or album was released.
                    To be used if no exact release date is known.
                    If not explicitly set, calculated from release_date.
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    channel_is_verified: Whether the channel is verified on the platform.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, except that at least one of text
                    or html must be present):
331 * "author" - human-readable name of the comment author
332 * "author_id" - user ID of the comment author
333 * "author_thumbnail" - The thumbnail of the comment author
334 * "author_url" - The url to the comment author's page
335 * "author_is_verified" - Whether the author is verified
336 on the platform
337 * "author_is_uploader" - Whether the comment is made by
338 the video uploader
339 * "id" - Comment ID
340 * "html" - Comment as HTML
341 * "text" - Plain text of the comment
342 * "timestamp" - UNIX timestamp of comment
343 * "parent" - ID of the comment this one is replying to.
344 Set to "root" to indicate that this is a
345 comment to the original video.
346 * "like_count" - Number of positive ratings of the comment
347 * "dislike_count" - Number of negative ratings of the comment
348 * "is_favorited" - Whether the comment is marked as
349 favorite by the video uploader
350 * "is_pinned" - Whether the comment is pinned to
351 the top of the comments
352 age_limit: Age restriction for the video, as an integer (years)
353 webpage_url: The URL to the video webpage, if given to yt-dlp it
354 should allow to get the same result again. (It will be set
355 by YoutubeDL if it's missing)
356 categories: A list of categories that the video falls in, for example
357 ["Sports", "Berlin"]
358 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
359 cast: A list of the video cast
360 is_live: True, False, or None (=unknown). Whether this video is a
361 live stream that goes on instead of a fixed-length video.
362 was_live: True, False, or None (=unknown). Whether this video was
363 originally a live stream.
364 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
365 or 'post_live' (was live, but VOD is not yet processed)
366 If absent, automatically set from is_live, was_live
367 start_time: Time in seconds where the reproduction should start, as
368 specified in the URL.
369 end_time: Time in seconds where the reproduction should end, as
370 specified in the URL.
371 chapters: A list of dictionaries, with the following entries:
372 * "start_time" - The start time of the chapter in seconds
373 * "end_time" - The end time of the chapter in seconds
374 * "title" (optional, string)
375 heatmap: A list of dictionaries, with the following entries:
376 * "start_time" - The start time of the data point in seconds
377 * "end_time" - The end time of the data point in seconds
378 * "value" - The normalized value of the data point (float between 0 and 1)
379 playable_in_embed: Whether this video is allowed to play in embedded
380 players on other sites. Can be True (=always allowed),
381 False (=never allowed), None (=unknown), or a string
382 specifying the criteria for embedability; e.g. 'whitelist'
383 availability: Under what condition the video is available. One of
384 'private', 'premium_only', 'subscriber_only', 'needs_auth',
385 'unlisted' or 'public'. Use 'InfoExtractor._availability'
386 to set it
387 media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
388 _old_archive_ids: A list of old archive ids needed for backward compatibility
389 _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artists:        List of artists of the track.
    composers:      List of composers of the piece.
    genres:         List of genres of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation").
    album_artists:  List of all artists that appear on the album.
                    E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
                    Useful for splits and compilations.
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    The following fields are deprecated and should not be set by new code:
    composer:       Use "composers" instead.
                    Composer(s) of the piece, comma-separated.
    artist:         Use "artists" instead.
                    Artist(s) of the track, comma-separated.
    genre:          Use "genres" instead.
                    Genre(s) of the track, comma-separated.
    album_artist:   Use "album_artists" instead.
                    All artists that appear on the album, comma-separated.
    creator:        Use "creators" instead.
                    The creator of the video.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
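
    A minimal illustrative example of a video info dict (all values here are
    hypothetical and for illustration only):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video.mp4',
                'ext': 'mp4',
                'format_id': 'http-720p',
                'width': 1280,
                'height': 720,
            }],
            'uploader': 'Some Uploader',
            'duration': 42.0,
        }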


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"
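
    A minimal illustrative playlist result (hypothetical values):

        {
            '_type': 'playlist',
            'id': 'PL123',
            'title': 'Example playlist',
            'entries': [...],  # video info dicts as specified above
        }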


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
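
    A minimal illustrative "url_transparent" result (hypothetical values):

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example/embed/abc123',
            'ie_key': 'VideoHost',  # hypothetical extractor name
            'title': 'Title taken from the embedding page',
        }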


    Subclasses of this should also be added to the list of extractors and
    should define _VALID_URL as a regexp or a Sequence of regexps, and
    re-define the _real_extract() and (optionally) _real_initialize() methods.
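
    A minimal subclass sketch (site and URL scheme are hypothetical):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }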

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances.

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

    The _GEO_BYPASS attribute may be set to False in order to disable
    geo-restriction bypass mechanisms for a particular extractor.
    This will not, however, disable explicit geo-restriction bypass based on
    the country code provided with geo_bypass_country.

    The _GEO_COUNTRIES attribute may contain a list of presumably
    geo-unrestricted countries for this extractor. One of these countries
    will be used by the geo-restriction bypass mechanism right away,
    provided the mechanism is not disabled.

    The _GEO_IP_BLOCKS attribute may contain a list of presumably
    geo-unrestricted IP blocks in CIDR notation for this extractor. One of
    these IP blocks will be used by the geo-restriction bypass mechanism
    in the same way as _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
    _ENABLED = True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    _VALID_URL = None
    _EMBED_REGEX = []

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            None: '',
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method is called for the initial geo bypass setup during
        instance initialization, with _GEO_COUNTRIES and _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

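        A hypothetical manual call from extractor code:

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
            })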
666 """
667 if not self._x_forwarded_for_ip:
668
669 # Geo bypass mechanism is explicitly disabled by user
670 if not self.get_param('geo_bypass', True):
671 return
672
673 if not geo_bypass_context:
674 geo_bypass_context = {}
675
676 # Backward compatibility: previously _initialize_geo_bypass
677 # expected a list of countries, some 3rd party code may still use
678 # it this way
679 if isinstance(geo_bypass_context, (list, tuple)):
680 geo_bypass_context = {
681 'countries': geo_bypass_context,
682 }
683
684 # The whole point of geo bypass mechanism is to fake IP
685 # as X-Forwarded-For HTTP header based on some IP block or
686 # country code.
687
688 # Path 1: bypassing based on IP block in CIDR notation
689
690 # Explicit IP block specified by user, use it right away
691 # regardless of whether extractor is geo bypassable or not
692 ip_block = self.get_param('geo_bypass_ip_block', None)
693
694 # Otherwise use random IP block from geo bypass context but only
695 # if extractor is known as geo bypassable
696 if not ip_block:
697 ip_blocks = geo_bypass_context.get('ip_blocks')
698 if self._GEO_BYPASS and ip_blocks:
699 ip_block = random.choice(ip_blocks)
700
701 if ip_block:
702 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
703 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
704 return
705
706 # Path 2: bypassing based on country code
707
708 # Explicit country code specified by user, use it right away
709 # regardless of whether extractor is geo bypassable or not
710 country = self.get_param('geo_bypass_country', None)
711
712 # Otherwise use random country code from geo bypass context but
713 # only if extractor is known as geo bypassable
714 if not country:
715 countries = geo_bypass_context.get('countries')
716 if self._GEO_BYPASS and countries:
717 country = random.choice(countries)
718
719 if country:
720 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
721 self._downloader.write_debug(
722 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
723
    def extract(self, url):
        """Extracts URL information and returns it as an info dict."""
        try:
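            # Run extraction at most twice: one retry is attempted with a
            # faked X-Forwarded-For IP if the first attempt is geo restricted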
            for _ in range(2):
                try:
                    self.initialize()
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
            raise
        except IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""
        pass

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""
        pass

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @classproperty
    def IE_NAME(cls):
        return cls.__name__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.status) is True
        else:
            return err.status in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            self._downloader.deprecation_warning(
                'Passing a urllib.request.Request to _create_request() is deprecated. '
                'Use yt_dlp.networking.common.Request instead.')
            url_or_request = urllib_req_to_req(url_or_request)
        elif not isinstance(url_or_request, Request):
            url_or_request = Request(url_or_request)

        url_or_request.update(data=data, headers=headers, query=query)
        return url_or_request

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.response

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
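
        A hypothetical call accepting 404 responses in addition to 2xx:

            webpage, urlh = self._download_webpage_handle(
                url, video_id, expected_status=404)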
914 """
915
916 # Strip hashes from the URL (#1038)
917 if isinstance(url_or_request, str):
918 url_or_request = url_or_request.partition('#')[0]
919
920 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
921 if urlh is False:
922 assert not fatal
923 return False
924 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
925 return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            filename = self._request_dump_filename(urlh.url, video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def __print_error(self, errnote, fatal, video_id, err):
        if fatal:
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
        elif errnote:
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
        try:
            return json.loads(
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

    def __create_download_methods(name, parser, note, errnote, return_value):
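        # Factory building paired `_download_<name>_handle`/`_download_<name>`
        # methods: the former returns (parsed content, URL handle), the
        # latter only the parsed content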

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        """

        R''' # NB: These are unused; should they be deprecated?
        if tries != 1:
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            timeout = 5
        else:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
        '''

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=NO_DEFAULT):
        if partial is not NO_DEFAULT:
            self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        msg += format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, report a warning or raise
        a RegexNotFoundError, depending on default and fatal, specifying the
        field name.
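
        A hypothetical call:

            title = self._search_regex(
                r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', default=None)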
1256 """
1257 if string is None:
1258 mobj = None
1259 elif isinstance(pattern, (str, re.Pattern)):
1260 mobj = re.search(pattern, string, flags)
1261 else:
1262 for p in pattern:
1263 mobj = re.search(p, string, flags)
1264 if mobj:
1265 break
1266
1267 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1268
1269 if mobj:
1270 if group is None:
1271 # return the first matching group
1272 return next(g for g in mobj.groups() if g is not None)
1273 elif isinstance(group, (list, tuple)):
1274 return tuple(mobj.group(g) for g in group)
1275 else:
1276 return mobj.group(group)
1277 elif default is not NO_DEFAULT:
1278 return default
1279 elif fatal:
1280 raise RegexNotFoundError('Unable to extract %s' % _name)
1281 else:
1282 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1283 return None
    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        if default is NO_DEFAULT:
            default, has_default = {}, False
        else:
            fatal, has_default = False, True

        json_string = self._search_regex(
            rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
        if not json_string:
            return default

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        try:
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
            if fatal:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
        return default

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if isinstance(res, tuple):
            return tuple(map(clean_html, res))
        return clean_html(res)

    def _get_netrc_login_info(self, netrc_machine=None):
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        cmd = self.get_param('netrc_cmd')
        if cmd:
            cmd = cmd.replace('{}', netrc_machine)
            self.to_screen(f'Executing command: {cmd}')
            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
            if ret != 0:
                raise OSError(f'Command returned error code {ret}')
            info = netrc_from_content(stdout).authenticators(netrc_machine)

        elif self.get_param('usenetrc', False):
            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
            if os.path.isdir(netrc_file):
                netrc_file = os.path.join(netrc_file, '.netrc')
            info = netrc.netrc(netrc_file).authenticators(netrc_machine)

        else:
            return None, None
        if not info:
            self.to_screen(f'No authenticators for {netrc_machine}')
            return None, None

        self.write_debug(f'Using netrc for {netrc_machine} authentication')
        return info[0], info[2]

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available try the netrc_cmd if it is defined or look in the
        netrc file using the netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """

        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            try:
                username, password = self._get_netrc_login_info(netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(f'Failed to parse .netrc: {err}')
                return None, None
        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verification;
        currently this just uses the command-line option.
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return getpass.getpass('Type %s and press [Return]: ' % note)

1383 # Helper functions for extracting OpenGraph info
1384 @staticmethod
1385 def _og_regexes(prop):
1386 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1387 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1388 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1389 template = r'<meta[^>]+?%s[^>]+?%s'
1390 return [
1391 template % (property_re, content_re),
1392 template % (content_re, property_re),
1393 ]
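# e.g. _og_regexes('title') matches both attribute orders:
#   <meta property="og:title" content="Some title">
#   <meta content="Some title" property="og:title">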
1394
1395 @staticmethod
1396 def _meta_regex(prop):
1397 return r'''(?isx)<meta
1398 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1399 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
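# e.g. _meta_regex('description') matches
#   <meta name="description" content="...">
# with the attribute value captured in the 'content' group.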
1400
1401 def _og_search_property(self, prop, html, name=None, **kargs):
1402 prop = variadic(prop)
1403 if name is None:
1404 name = 'OpenGraph %s' % prop[0]
1405 og_regexes = []
1406 for p in prop:
1407 og_regexes.extend(self._og_regexes(p))
1408 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1409 if escaped is None:
1410 return None
1411 return unescapeHTML(escaped)
1412
1413 def _og_search_thumbnail(self, html, **kargs):
1414 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1415
1416 def _og_search_description(self, html, **kargs):
1417 return self._og_search_property('description', html, fatal=False, **kargs)
1418
1419 def _og_search_title(self, html, *, fatal=False, **kargs):
1420 return self._og_search_property('title', html, fatal=fatal, **kargs)
1421
1422 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1423 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1424 if secure:
1425 regexes = self._og_regexes('video:secure_url') + regexes
1426 return self._html_search_regex(regexes, html, name, **kargs)
1427
1428 def _og_search_url(self, html, **kargs):
1429 return self._og_search_property('url', html, **kargs)
1430
1431 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1432 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1433
1434 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1435 name = variadic(name)
1436 if display_name is None:
1437 display_name = name[0]
1438 return self._html_search_regex(
1439 [self._meta_regex(n) for n in name],
1440 html, display_name, fatal=fatal, group='content', **kwargs)
1441
1442 def _dc_search_uploader(self, html):
1443 return self._html_search_meta('dc.creator', html, 'uploader')
1444
1445 @staticmethod
1446 def _rta_search(html):
1447 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1448 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1449 r' content="RTA-5042-1996-1400-1577-RTA"',
1450 html):
1451 return 18
1452
1453 # And then there are the jokers who advertise that they use RTA, but actually don't.
1454 AGE_LIMIT_MARKERS = [
1455 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1456 r'>[^<]*you acknowledge you are at least (\d+) years old',
1457 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1458 ]
1459
1460 age_limit = 0
1461 for marker in AGE_LIMIT_MARKERS:
1462 mobj = re.search(marker, html)
1463 if mobj:
1464 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1465 return age_limit
1466
1467 def _media_rating_search(self, html):
1468 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1469 rating = self._html_search_meta('rating', html)
1470
1471 if not rating:
1472 return None
1473
1474 RATING_TABLE = {
1475 'safe for kids': 0,
1476 'general': 8,
1477 '14 years': 14,
1478 'mature': 17,
1479 'restricted': 19,
1480 }
1481 return RATING_TABLE.get(rating.lower())
1482
1483 def _family_friendly_search(self, html):
1484 # See http://schema.org/VideoObject
1485 family_friendly = self._html_search_meta(
1486 'isFamilyFriendly', html, default=None)
1487
1488 if not family_friendly:
1489 return None
1490
1491 RATING_TABLE = {
1492 '1': 0,
1493 'true': 0,
1494 '0': 18,
1495 'false': 18,
1496 }
1497 return RATING_TABLE.get(family_friendly.lower())
1498
1499 def _twitter_search_player(self, html):
1500 return self._html_search_meta('twitter:player', html,
1501 'twitter card player')
1502
1503 def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1504 """Yield all json ld objects in the html"""
1505 if default is not NO_DEFAULT:
1506 fatal = False
1507 for mobj in re.finditer(JSON_LD_RE, html):
1508 json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1509 for json_ld in variadic(json_ld_item):
1510 if isinstance(json_ld, dict):
1511 yield json_ld
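# e.g. yields the dict embedded in
#   <script type="application/ld+json">{"@context": "https://schema.org",
#   "@type": "VideoObject", ...}</script>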
1512
1513 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1514 """Search for a video in any json ld in the html"""
1515 if default is not NO_DEFAULT:
1516 fatal = False
1517 info = self._json_ld(
1518 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1519 video_id, fatal=fatal, expected_type=expected_type)
1520 if info:
1521 return info
1522 if default is not NO_DEFAULT:
1523 return default
1524 elif fatal:
1525 raise RegexNotFoundError('Unable to extract JSON-LD')
1526 else:
1527 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1528 return {}
1529
1530 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1531 if isinstance(json_ld, str):
1532 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1533 if not json_ld:
1534 return {}
1535 info = {}
1536
1537 INTERACTION_TYPE_MAP = {
1538 'CommentAction': 'comment',
1539 'AgreeAction': 'like',
1540 'DisagreeAction': 'dislike',
1541 'LikeAction': 'like',
1542 'DislikeAction': 'dislike',
1543 'ListenAction': 'view',
1544 'WatchAction': 'view',
1545 'ViewAction': 'view',
1546 }
1547
1548 def is_type(e, *expected_types):
1549 type = variadic(traverse_obj(e, '@type'))
1550 return any(x in type for x in expected_types)
1551
1552 def extract_interaction_type(e):
1553 interaction_type = e.get('interactionType')
1554 if isinstance(interaction_type, dict):
1555 interaction_type = interaction_type.get('@type')
1556 return str_or_none(interaction_type)
1557
1558 def extract_interaction_statistic(e):
1559 interaction_statistic = e.get('interactionStatistic')
1560 if isinstance(interaction_statistic, dict):
1561 interaction_statistic = [interaction_statistic]
1562 if not isinstance(interaction_statistic, list):
1563 return
1564 for is_e in interaction_statistic:
1565 if not is_type(is_e, 'InteractionCounter'):
1566 continue
1567 interaction_type = extract_interaction_type(is_e)
1568 if not interaction_type:
1569 continue
1570 # Some sites provide the interaction count as a string with
1571 # non-digit characters (e.g. ",") instead of an integer as per
1572 # spec, so extract the count with the more relaxed str_to_int
1573 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1574 if interaction_count is None:
1575 continue
1576 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1577 if not count_kind:
1578 continue
1579 count_key = '%s_count' % count_kind
1580 if info.get(count_key) is not None:
1581 continue
1582 info[count_key] = interaction_count
1583
1584 def extract_chapter_information(e):
1585 chapters = [{
1586 'title': part.get('name'),
1587 'start_time': part.get('startOffset'),
1588 'end_time': part.get('endOffset'),
1589 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1590 for idx, (last_c, current_c, next_c) in enumerate(zip(
1591 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1592 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1593 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1594 if None in current_c.values():
1595 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1596 return
1597 if chapters:
1598 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1599 info['chapters'] = chapters
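# e.g. three Clip parts with startOffset 0/60/120 and no endOffset are
# zipped with their neighbours above, so each chapter's end_time is
# filled from the next chapter's start_time and the last chapter's
# from info['duration'].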
1600
1601 def extract_video_object(e):
1602 author = e.get('author')
1603 info.update({
1604 'url': url_or_none(e.get('contentUrl')),
1605 'ext': mimetype2ext(e.get('encodingFormat')),
1606 'title': unescapeHTML(e.get('name')),
1607 'description': unescapeHTML(e.get('description')),
1608 'thumbnails': [{'url': unescapeHTML(url)}
1609 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1610 if url_or_none(url)],
1611 'duration': parse_duration(e.get('duration')),
1612 'timestamp': unified_timestamp(e.get('uploadDate')),
1613 # author can be an instance of the 'Organization' or 'Person' types;
1614 # both can have a 'name' property (inherited from the 'Thing' type). [1]
1615 # However, some websites use the 'Text' type instead.
1616 # 1. https://schema.org/VideoObject
1617 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1618 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1619 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1620 'tbr': int_or_none(e.get('bitrate')),
1621 'width': int_or_none(e.get('width')),
1622 'height': int_or_none(e.get('height')),
1623 'view_count': int_or_none(e.get('interactionCount')),
1624 'tags': try_call(lambda: e.get('keywords').split(',')),
1625 })
1626 if is_type(e, 'AudioObject'):
1627 info.update({
1628 'vcodec': 'none',
1629 'abr': int_or_none(e.get('bitrate')),
1630 })
1631 extract_interaction_statistic(e)
1632 extract_chapter_information(e)
1633
1634 def traverse_json_ld(json_ld, at_top_level=True):
1635 for e in variadic(json_ld):
1636 if not isinstance(e, dict):
1637 continue
1638 if at_top_level and '@context' not in e:
1639 continue
1640 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1641 traverse_json_ld(e['@graph'], at_top_level=False)
1642 continue
1643 if expected_type is not None and not is_type(e, expected_type):
1644 continue
1645 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1646 if rating is not None:
1647 info['average_rating'] = rating
1648 if is_type(e, 'TVEpisode', 'Episode'):
1649 episode_name = unescapeHTML(e.get('name'))
1650 info.update({
1651 'episode': episode_name,
1652 'episode_number': int_or_none(e.get('episodeNumber')),
1653 'description': unescapeHTML(e.get('description')),
1654 })
1655 if not info.get('title') and episode_name:
1656 info['title'] = episode_name
1657 part_of_season = e.get('partOfSeason')
1658 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1659 info.update({
1660 'season': unescapeHTML(part_of_season.get('name')),
1661 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1662 })
1663 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1664 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1665 info['series'] = unescapeHTML(part_of_series.get('name'))
1666 elif is_type(e, 'Movie'):
1667 info.update({
1668 'title': unescapeHTML(e.get('name')),
1669 'description': unescapeHTML(e.get('description')),
1670 'duration': parse_duration(e.get('duration')),
1671 'timestamp': unified_timestamp(e.get('dateCreated')),
1672 })
1673 elif is_type(e, 'Article', 'NewsArticle'):
1674 info.update({
1675 'timestamp': parse_iso8601(e.get('datePublished')),
1676 'title': unescapeHTML(e.get('headline')),
1677 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1678 })
1679 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1680 extract_video_object(e['video'][0])
1681 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1682 extract_video_object(e['subjectOf'][0])
1683 elif is_type(e, 'VideoObject', 'AudioObject'):
1684 extract_video_object(e)
1685 if expected_type is None:
1686 continue
1687 else:
1688 break
1689 video = e.get('video')
1690 if is_type(video, 'VideoObject'):
1691 extract_video_object(video)
1692 if expected_type is None:
1693 continue
1694 else:
1695 break
1696
1697 traverse_json_ld(json_ld)
1698 return filter_dict(info)
1699
1700 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1701 return self._parse_json(
1702 self._search_regex(
1703 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1704 webpage, 'next.js data', fatal=fatal, **kw),
1705 video_id, transform_source=transform_source, fatal=fatal)
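# e.g. extracts the JSON payload from
#   <script id="__NEXT_DATA__" type="application/json">{...}</script>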
1706
1707 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1708 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1709 rectx = re.escape(context_name)
1710 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1711 js, arg_keys, arg_vals = self._search_regex(
1712 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1713 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1714 default=NO_DEFAULT if fatal else (None, None, None))
1715 if js is None:
1716 return {}
1717
1718 args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1719 f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1720
1721 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1722 return traverse_obj(ret, traverse) or {}
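# For illustration, this handles pages serving something like
#   <script>window.__NUXT__=(function(a,b){return {data:[{title:a}]};}("Title",1));</script>
# by JSON-encoding the argument values and substituting them for the
# argument names inside the returned object literal.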
1723
1724 @staticmethod
1725 def _hidden_inputs(html):
1726 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1727 hidden_inputs = {}
1728 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1729 attrs = extract_attributes(input)
1730 if not attrs:
1731 continue
1732 if attrs.get('type') not in ('hidden', 'submit'):
1733 continue
1734 name = attrs.get('name') or attrs.get('id')
1735 value = attrs.get('value')
1736 if name and value is not None:
1737 hidden_inputs[name] = value
1738 return hidden_inputs
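# e.g. (with a hypothetical field name)
#   <input type="hidden" name="csrf_token" value="abc123">
# yields {'csrf_token': 'abc123'}.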
1739
1740 def _form_hidden_inputs(self, form_id, html):
1741 form = self._search_regex(
1742 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1743 html, '%s form' % form_id, group='form')
1744 return self._hidden_inputs(form)
1745
1746 @classproperty(cache=True)
1747 def FormatSort(cls):
1748 class FormatSort(FormatSorter):
1749 def __init__(ie, *args, **kwargs):
1750 super().__init__(ie._downloader, *args, **kwargs)
1751
1752 deprecation_warning(
1753 'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1754 'Use yt_dlp.utils.FormatSorter instead')
1755 return FormatSort
1756
1757 def _sort_formats(self, formats, field_preference=[]):
1758 if not field_preference:
1759 self._downloader.deprecation_warning(
1760 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1761 return
1762 self._downloader.deprecation_warning(
1763 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1764 'Return _format_sort_fields in the info_dict instead')
1765 if formats:
1766 formats[0]['__sort_fields'] = field_preference
1767
1768 def _check_formats(self, formats, video_id):
1769 if formats:
1770 formats[:] = filter(
1771 lambda f: self._is_valid_url(
1772 f['url'], video_id,
1773 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1774 formats)
1775
1776 @staticmethod
1777 def _remove_duplicate_formats(formats):
1778 format_urls = set()
1779 unique_formats = []
1780 for f in formats:
1781 if f['url'] not in format_urls:
1782 format_urls.add(f['url'])
1783 unique_formats.append(f)
1784 formats[:] = unique_formats
1785
1786 def _is_valid_url(self, url, video_id, item='video', headers={}):
1787 url = self._proto_relative_url(url, scheme='http:')
1788 # For now, assume non-HTTP(S) URLs are always valid
1789 if not (url.startswith('http://') or url.startswith('https://')):
1790 return True
1791 try:
1792 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1793 return True
1794 except ExtractorError as e:
1795 self.to_screen(
1796 '%s: %s URL is invalid, skipping: %s'
1797 % (video_id, item, error_to_compat_str(e.cause)))
1798 return False
1799
1800 def http_scheme(self):
1801 """ Either "http:" or "https:", depending on the user's preferences """
1802 return (
1803 'http:'
1804 if self.get_param('prefer_insecure', False)
1805 else 'https:')
1806
1807 def _proto_relative_url(self, url, scheme=None):
1808 scheme = scheme or self.http_scheme()
1809 assert scheme.endswith(':')
1810 return sanitize_url(url, scheme=scheme[:-1])
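# e.g. _proto_relative_url('//cdn.example.com/v.mp4') returns
# 'https://cdn.example.com/v.mp4' (or the http: variant with
# --prefer-insecure); the host here is hypothetical.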
1811
1812 def _sleep(self, timeout, video_id, msg_template=None):
1813 if msg_template is None:
1814 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1815 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1816 self.to_screen(msg)
1817 time.sleep(timeout)
1818
1819 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1820 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1821 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1822 if self.get_param('ignore_no_formats_error'):
1823 fatal = False
1824
1825 res = self._download_xml_handle(
1826 manifest_url, video_id, 'Downloading f4m manifest',
1827 'Unable to download f4m manifest',
1828 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1829 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1830 transform_source=transform_source,
1831 fatal=fatal, data=data, headers=headers, query=query)
1832 if res is False:
1833 return []
1834
1835 manifest, urlh = res
1836 manifest_url = urlh.url
1837
1838 return self._parse_f4m_formats(
1839 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1840 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1841
1842 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1843 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1844 fatal=True, m3u8_id=None):
1845 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1846 return []
1847
1848 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1849 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1850 if akamai_pv is not None and ';' in akamai_pv.text:
1851 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1852 if playerVerificationChallenge.strip() != '':
1853 return []
1854
1855 formats = []
1856 manifest_version = '1.0'
1857 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1858 if not media_nodes:
1859 manifest_version = '2.0'
1860 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1861 # Remove unsupported DRM protected media from final formats
1862 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1863 media_nodes = remove_encrypted_media(media_nodes)
1864 if not media_nodes:
1865 return formats
1866
1867 manifest_base_url = get_base_url(manifest)
1868
1869 bootstrap_info = xpath_element(
1870 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1871 'bootstrap info', default=None)
1872
1873 vcodec = None
1874 mime_type = xpath_text(
1875 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1876 'mime type', default=None)
1877 if mime_type and mime_type.startswith('audio/'):
1878 vcodec = 'none'
1879
1880 for i, media_el in enumerate(media_nodes):
1881 tbr = int_or_none(media_el.attrib.get('bitrate'))
1882 width = int_or_none(media_el.attrib.get('width'))
1883 height = int_or_none(media_el.attrib.get('height'))
1884 format_id = join_nonempty(f4m_id, tbr or i)
1885 # If <bootstrapInfo> is present, the specified f4m is a
1886 # stream-level manifest, and only set-level manifests may refer to
1887 # external resources. See section 11.4 and section 4 of F4M spec
1888 if bootstrap_info is None:
1889 media_url = None
1890 # @href is introduced in 2.0, see section 11.6 of F4M spec
1891 if manifest_version == '2.0':
1892 media_url = media_el.attrib.get('href')
1893 if media_url is None:
1894 media_url = media_el.attrib.get('url')
1895 if not media_url:
1896 continue
1897 manifest_url = (
1898 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1899 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1900 # If media_url is itself an f4m manifest, do the recursive extraction,
1901 # since bitrates in the parent manifest (this one) and the media_url manifest
1902 # may differ, leading to an inability to resolve the format by the requested
1903 # bitrate in the f4m downloader
1904 ext = determine_ext(manifest_url)
1905 if ext == 'f4m':
1906 f4m_formats = self._extract_f4m_formats(
1907 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1908 transform_source=transform_source, fatal=fatal)
1909 # Sometimes stream-level manifest contains single media entry that
1910 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1911 # At the same time parent's media entry in set-level manifest may
1912 # contain it. We will copy it from parent in such cases.
1913 if len(f4m_formats) == 1:
1914 f = f4m_formats[0]
1915 f.update({
1916 'tbr': f.get('tbr') or tbr,
1917 'width': f.get('width') or width,
1918 'height': f.get('height') or height,
1919 'format_id': f.get('format_id') if not tbr else format_id,
1920 'vcodec': vcodec,
1921 })
1922 formats.extend(f4m_formats)
1923 continue
1924 elif ext == 'm3u8':
1925 formats.extend(self._extract_m3u8_formats(
1926 manifest_url, video_id, 'mp4', preference=preference,
1927 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1928 continue
1929 formats.append({
1930 'format_id': format_id,
1931 'url': manifest_url,
1932 'manifest_url': manifest_url,
1933 'ext': 'flv' if bootstrap_info is not None else None,
1934 'protocol': 'f4m',
1935 'tbr': tbr,
1936 'width': width,
1937 'height': height,
1938 'vcodec': vcodec,
1939 'preference': preference,
1940 'quality': quality,
1941 })
1942 return formats
1943
1944 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1945 return {
1946 'format_id': join_nonempty(m3u8_id, 'meta'),
1947 'url': m3u8_url,
1948 'ext': ext,
1949 'protocol': 'm3u8',
1950 'preference': preference - 100 if preference else -100,
1951 'quality': quality,
1952 'resolution': 'multiple',
1953 'format_note': 'Quality selection URL',
1954 }
1955
1956 def _report_ignoring_subs(self, name):
1957 self.report_warning(bug_reports_message(
1958 f'Ignoring subtitle tracks found in the {name} manifest; '
1959 'if any subtitle tracks are missing,'
1960 ), only_once=True)
1961
1962 def _extract_m3u8_formats(self, *args, **kwargs):
1963 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1964 if subs:
1965 self._report_ignoring_subs('HLS')
1966 return fmts
1967
1968 def _extract_m3u8_formats_and_subtitles(
1969 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1970 preference=None, quality=None, m3u8_id=None, note=None,
1971 errnote=None, fatal=True, live=False, data=None, headers={},
1972 query={}):
1973
1974 if self.get_param('ignore_no_formats_error'):
1975 fatal = False
1976
1977 if not m3u8_url:
1978 if errnote is not False:
1979 errnote = errnote or 'Failed to obtain m3u8 URL'
1980 if fatal:
1981 raise ExtractorError(errnote, video_id=video_id)
1982 self.report_warning(f'{errnote}{bug_reports_message()}')
1983 return [], {}
1984
1985 res = self._download_webpage_handle(
1986 m3u8_url, video_id,
1987 note='Downloading m3u8 information' if note is None else note,
1988 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1989 fatal=fatal, data=data, headers=headers, query=query)
1990
1991 if res is False:
1992 return [], {}
1993
1994 m3u8_doc, urlh = res
1995 m3u8_url = urlh.url
1996
1997 return self._parse_m3u8_formats_and_subtitles(
1998 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1999 preference=preference, quality=quality, m3u8_id=m3u8_id,
2000 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2001 headers=headers, query=query, video_id=video_id)
2002
2003 def _parse_m3u8_formats_and_subtitles(
2004 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2005 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2006 errnote=None, fatal=True, data=None, headers={}, query={},
2007 video_id=None):
2008 formats, subtitles = [], {}
2009 has_drm = HlsFD._has_drm(m3u8_doc)
2010
2011 def format_url(url):
2012 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2013
2014 if self.get_param('hls_split_discontinuity', False):
2015 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2016 if not m3u8_doc:
2017 if not manifest_url:
2018 return []
2019 m3u8_doc = self._download_webpage(
2020 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2021 note=False, errnote='Failed to download m3u8 playlist information')
2022 if m3u8_doc is False:
2023 return []
2024 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2025
2026 else:
2027 def _extract_m3u8_playlist_indices(*args, **kwargs):
2028 return [None]
2029
2030 # References:
2031 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2032 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2033 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2034
2035 # We should try extracting formats only from master playlists [1, 4.3.4],
2036 # i.e. playlists that describe the available qualities. On the other hand,
2037 # media playlists [1, 4.3.3] should be returned as is since they contain
2038 # just the media without quality renditions.
2039 # Fortunately, a master playlist can easily be distinguished from a media
2040 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2041 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2042 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2043 # media playlist and MUST NOT appear in a master playlist, thus we can
2044 # reliably detect a media playlist with this criterion.
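# For illustration, a minimal master playlist looks like
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8
# whereas a media playlist lists the segments themselves:
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   seg0.ts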
2045
2046 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2047 formats = [{
2048 'format_id': join_nonempty(m3u8_id, idx),
2049 'format_index': idx,
2050 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2051 'ext': ext,
2052 'protocol': entry_protocol,
2053 'preference': preference,
2054 'quality': quality,
2055 'has_drm': has_drm,
2056 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2057
2058 return formats, subtitles
2059
2060 groups = {}
2061 last_stream_inf = {}
2062
2063 def extract_media(x_media_line):
2064 media = parse_m3u8_attributes(x_media_line)
2065 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2066 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2067 if not (media_type and group_id and name):
2068 return
2069 groups.setdefault(group_id, []).append(media)
2070 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2071 if media_type == 'SUBTITLES':
2072 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2073 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2074 # However, lack of URI has been spotted in the wild.
2075 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2076 if not media.get('URI'):
2077 return
2078 url = format_url(media['URI'])
2079 sub_info = {
2080 'url': url,
2081 'ext': determine_ext(url),
2082 }
2083 if sub_info['ext'] == 'm3u8':
2084 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2085 # files may contain is WebVTT:
2086 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2087 sub_info['ext'] = 'vtt'
2088 sub_info['protocol'] = 'm3u8_native'
2089 lang = media.get('LANGUAGE') or 'und'
2090 subtitles.setdefault(lang, []).append(sub_info)
2091 if media_type not in ('VIDEO', 'AUDIO'):
2092 return
2093 media_url = media.get('URI')
2094 if media_url:
2095 manifest_url = format_url(media_url)
2096 formats.extend({
2097 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2098 'format_note': name,
2099 'format_index': idx,
2100 'url': manifest_url,
2101 'manifest_url': m3u8_url,
2102 'language': media.get('LANGUAGE'),
2103 'ext': ext,
2104 'protocol': entry_protocol,
2105 'preference': preference,
2106 'quality': quality,
2107 'has_drm': has_drm,
2108 'vcodec': 'none' if media_type == 'AUDIO' else None,
2109 } for idx in _extract_m3u8_playlist_indices(manifest_url))
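# e.g. extract_media() handles lines such as
#   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",LANGUAGE="en",URI="audio/en.m3u8"
# registering the rendition under groups['aud'] and emitting audio-only
# formats for its URI.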
2110
2111 def build_stream_name():
2112 # Although the specification does not mention a NAME attribute for
2113 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2114 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2115 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2116 stream_name = last_stream_inf.get('NAME')
2117 if stream_name:
2118 return stream_name
2119 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2120 # from corresponding rendition group
2121 stream_group_id = last_stream_inf.get('VIDEO')
2122 if not stream_group_id:
2123 return
2124 stream_group = groups.get(stream_group_id)
2125 if not stream_group:
2126 return stream_group_id
2127 rendition = stream_group[0]
2128 return rendition.get('NAME') or stream_group_id
2129
2130 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2131 # chance to detect video only formats when EXT-X-STREAM-INF tags
2132 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2133 for line in m3u8_doc.splitlines():
2134 if line.startswith('#EXT-X-MEDIA:'):
2135 extract_media(line)
2136
2137 for line in m3u8_doc.splitlines():
2138 if line.startswith('#EXT-X-STREAM-INF:'):
2139 last_stream_inf = parse_m3u8_attributes(line)
2140 elif line.startswith('#') or not line.strip():
2141 continue
2142 else:
2143 tbr = float_or_none(
2144 last_stream_inf.get('AVERAGE-BANDWIDTH')
2145 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2146 manifest_url = format_url(line.strip())
2147
2148 for idx in _extract_m3u8_playlist_indices(manifest_url):
2149 format_id = [m3u8_id, None, idx]
2150 # Bandwidth of live streams may differ over time thus making
2151 # format_id unpredictable. So it's better to keep provided
2152 # format_id intact.
2153 if not live:
2154 stream_name = build_stream_name()
2155 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2156 f = {
2157 'format_id': join_nonempty(*format_id),
2158 'format_index': idx,
2159 'url': manifest_url,
2160 'manifest_url': m3u8_url,
2161 'tbr': tbr,
2162 'ext': ext,
2163 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2164 'protocol': entry_protocol,
2165 'preference': preference,
2166 'quality': quality,
2167 'has_drm': has_drm,
2168 }
2169 resolution = last_stream_inf.get('RESOLUTION')
2170 if resolution:
2171 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2172 if mobj:
2173 f['width'] = int(mobj.group('width'))
2174 f['height'] = int(mobj.group('height'))
2175 # Unified Streaming Platform
2176 mobj = re.search(
2177 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2178 if mobj:
2179 abr, vbr = mobj.groups()
2180 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2181 f.update({
2182 'vbr': vbr,
2183 'abr': abr,
2184 })
2185 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2186 f.update(codecs)
2187 audio_group_id = last_stream_inf.get('AUDIO')
2188 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2189 # references a rendition group MUST have a CODECS attribute.
2190 # However, this is not always respected. E.g. [2]
2191 # contains an EXT-X-STREAM-INF tag which references an AUDIO
2192 # rendition group but does not have CODECS; despite
2193 # referencing an audio group, it represents a complete
2194 # format (with audio and video). So, for such cases we will
2195 # ignore references to rendition groups and treat them
2196 # as complete formats.
2197 if audio_group_id and codecs and f.get('vcodec') != 'none':
2198 audio_group = groups.get(audio_group_id)
2199 if audio_group and audio_group[0].get('URI'):
2200 # TODO: update acodec for audio only formats with
2201 # the same GROUP-ID
2202 f['acodec'] = 'none'
2203 if not f.get('ext'):
2204 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2205 formats.append(f)
2206
2207 # for DailyMotion
2208 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2209 if progressive_uri:
2210 http_f = f.copy()
2211 del http_f['manifest_url']
2212 http_f.update({
2213 'format_id': f['format_id'].replace('hls-', 'http-'),
2214 'protocol': 'http',
2215 'url': progressive_uri,
2216 })
2217 formats.append(http_f)
2218
2219 last_stream_inf = {}
2220 return formats, subtitles
2221
2222 def _extract_m3u8_vod_duration(
2223 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2224
2225 m3u8_vod = self._download_webpage(
2226 m3u8_vod_url, video_id,
2227 note='Downloading m3u8 VOD manifest' if note is None else note,
2228 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2229 fatal=False, data=data, headers=headers, query=query)
2230
2231 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2232
2233 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2234 if '#EXT-X-ENDLIST' not in m3u8_vod:
2235 return None
2236
2237 return int(sum(
2238 float(line[len('#EXTINF:'):].split(',')[0])
2239 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
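# e.g. a VOD playlist containing '#EXTINF:9.009,' and '#EXTINF:3.003,'
# (plus #EXT-X-ENDLIST) yields int(9.009 + 3.003) == 12.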
2240
2241 def _extract_mpd_vod_duration(
2242 self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2243
2244 mpd_doc = self._download_xml(
2245 mpd_url, video_id,
2246 note='Downloading MPD VOD manifest' if note is None else note,
2247 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2248 fatal=False, data=data, headers=headers, query=query)
2249 if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2250 return None
2251 return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2252
2253 @staticmethod
2254 def _xpath_ns(path, namespace=None):
2255 if not namespace:
2256 return path
2257 out = []
2258 for c in path.split('/'):
2259 if not c or c == '.':
2260 out.append(c)
2261 else:
2262 out.append('{%s}%s' % (namespace, c))
2263 return '/'.join(out)
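# e.g. _xpath_ns('./head/meta', 'urn:example') returns
# './{urn:example}head/{urn:example}meta' (namespace URI illustrative).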
2264
2265 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2266 if self.get_param('ignore_no_formats_error'):
2267 fatal = False
2268
2269 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2270 if res is False:
2271 assert not fatal
2272 return [], {}
2273 smil, urlh = res
2274
2275 return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2276 namespace=self._parse_smil_namespace(smil))
2277
2278 def _extract_smil_formats(self, *args, **kwargs):
2279 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2280 if subs:
2281 self._report_ignoring_subs('SMIL')
2282 return fmts
2283
2284 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2285 res = self._download_smil(smil_url, video_id, fatal=fatal)
2286 if res is False:
2287 return {}
2288
2289 smil, urlh = res
2290 smil_url = urlh.url
2291
2292 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2293
2294 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2295 return self._download_xml_handle(
2296 smil_url, video_id, 'Downloading SMIL file',
2297 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2298
2299 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2300 namespace = self._parse_smil_namespace(smil)
2301
2302 formats, subtitles = self._parse_smil_formats_and_subtitles(
2303 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2304
2305 video_id = os.path.splitext(url_basename(smil_url))[0]
2306 title = None
2307 description = None
2308 upload_date = None
2309 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2310 name = meta.attrib.get('name')
2311 content = meta.attrib.get('content')
2312 if not name or not content:
2313 continue
2314 if not title and name == 'title':
2315 title = content
2316 elif not description and name in ('description', 'abstract'):
2317 description = content
2318 elif not upload_date and name == 'date':
2319 upload_date = unified_strdate(content)
2320
2321 thumbnails = [{
2322 'id': image.get('type'),
2323 'url': image.get('src'),
2324 'width': int_or_none(image.get('width')),
2325 'height': int_or_none(image.get('height')),
2326 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2327
2328 return {
2329 'id': video_id,
2330 'title': title or video_id,
2331 'description': description,
2332 'upload_date': upload_date,
2333 'thumbnails': thumbnails,
2334 'formats': formats,
2335 'subtitles': subtitles,
2336 }
2337
2338 def _parse_smil_namespace(self, smil):
2339 return self._search_regex(
2340 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2341
2342 def _parse_smil_formats(self, *args, **kwargs):
2343 fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2344 if subs:
2345 self._report_ignoring_subs('SMIL')
2346 return fmts
2347
2348 def _parse_smil_formats_and_subtitles(
2349 self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2350 base = smil_url
2351 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2352 b = meta.get('base') or meta.get('httpBase')
2353 if b:
2354 base = b
2355 break
2356
2357 formats, subtitles = [], {}
2358 rtmp_count = 0
2359 http_count = 0
2360 m3u8_count = 0
2361 imgs_count = 0
2362
2363 srcs = set()
2364 media = itertools.chain.from_iterable(
2365 smil.findall(self._xpath_ns(arg, namespace))
2366 for arg in ['.//video', './/audio', './/media'])
2367 for medium in media:
2368 src = medium.get('src')
2369 if not src or src in srcs:
2370 continue
2371 srcs.add(src)
2372
2373 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2374 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2375 width = int_or_none(medium.get('width'))
2376 height = int_or_none(medium.get('height'))
2377 proto = medium.get('proto')
2378 ext = medium.get('ext')
2379 src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2380 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2381 streamer = medium.get('streamer') or base
2382
2383 if proto == 'rtmp' or streamer.startswith('rtmp'):
2384 rtmp_count += 1
2385 formats.append({
2386 'url': streamer,
2387 'play_path': src,
2388 'ext': 'flv',
2389 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2390 'tbr': bitrate,
2391 'filesize': filesize,
2392 'width': width,
2393 'height': height,
2394 })
2395 if transform_rtmp_url:
2396 streamer, src = transform_rtmp_url(streamer, src)
2397 formats[-1].update({
2398 'url': streamer,
2399 'play_path': src,
2400 })
2401 continue
2402
2403 src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2404 src_url = src_url.strip()
2405
2406 if proto == 'm3u8' or src_ext == 'm3u8':
2407 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2408 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2409 self._merge_subtitles(m3u8_subs, target=subtitles)
2410 if len(m3u8_formats) == 1:
2411 m3u8_count += 1
2412 m3u8_formats[0].update({
2413 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2414 'tbr': bitrate,
2415 'width': width,
2416 'height': height,
2417 })
2418 formats.extend(m3u8_formats)
2419 elif src_ext == 'f4m':
2420 f4m_url = src_url
2421 if not f4m_params:
2422 f4m_params = {
2423 'hdcore': '3.2.0',
2424 'plugin': 'flowplayer-3.2.0.1',
2425 }
2426 f4m_url += '&' if '?' in f4m_url else '?'
2427 f4m_url += urllib.parse.urlencode(f4m_params)
2428 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2429 elif src_ext == 'mpd':
2430 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2431 src_url, video_id, mpd_id='dash', fatal=False)
2432 formats.extend(mpd_formats)
2433 self._merge_subtitles(mpd_subs, target=subtitles)
2434 elif re.search(r'\.ism/[Mm]anifest', src_url):
2435 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2436 src_url, video_id, ism_id='mss', fatal=False)
2437 formats.extend(ism_formats)
2438 self._merge_subtitles(ism_subs, target=subtitles)
2439 elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
2440 http_count += 1
2441 formats.append({
2442 'url': src_url,
2443 'ext': ext or src_ext or 'flv',
2444 'format_id': 'http-%d' % (bitrate or http_count),
2445 'tbr': bitrate,
2446 'filesize': filesize,
2447 'width': width,
2448 'height': height,
2449 })
2450
2451 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2452 src = medium.get('src')
2453 if not src or src in srcs:
2454 continue
2455 srcs.add(src)
2456
2457 imgs_count += 1
2458 formats.append({
2459 'format_id': 'imagestream-%d' % (imgs_count),
2460 'url': src,
2461 'ext': mimetype2ext(medium.get('type')),
2462 'acodec': 'none',
2463 'vcodec': 'none',
2464 'width': int_or_none(medium.get('width')),
2465 'height': int_or_none(medium.get('height')),
2466 'format_note': 'SMIL storyboards',
2467 })
2468
2469 smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2470 self._merge_subtitles(smil_subs, target=subtitles)
2471
2472 return formats, subtitles
2473
2474 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2475 urls = []
2476 subtitles = {}
2477 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2478 src = textstream.get('src')
2479 if not src or src in urls:
2480 continue
2481 urls.append(src)
2482 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2483 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2484 subtitles.setdefault(lang, []).append({
2485 'url': src,
2486 'ext': ext,
2487 })
2488 return subtitles
2489
2490 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2491 res = self._download_xml_handle(
2492 xspf_url, playlist_id, 'Downloading xspf playlist',
2493 'Unable to download xspf manifest', fatal=fatal)
2494 if res is False:
2495 return []
2496
2497 xspf, urlh = res
2498 xspf_url = urlh.url
2499
2500 return self._parse_xspf(
2501 xspf, playlist_id, xspf_url=xspf_url,
2502 xspf_base_url=base_url(xspf_url))
2503
2504 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2505 NS_MAP = {
2506 'xspf': 'http://xspf.org/ns/0/',
2507 's1': 'http://static.streamone.nl/player/ns/0',
2508 }
2509
2510 entries = []
2511 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2512 title = xpath_text(
2513 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2514 description = xpath_text(
2515 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2516 thumbnail = xpath_text(
2517 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2518 duration = float_or_none(
2519 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2520
2521 formats = []
2522 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2523 format_url = urljoin(xspf_base_url, location.text)
2524 if not format_url:
2525 continue
2526 formats.append({
2527 'url': format_url,
2528 'manifest_url': xspf_url,
2529 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2530 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2531 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2532 })
2533
2534 entries.append({
2535 'id': playlist_id,
2536 'title': title,
2537 'description': description,
2538 'thumbnail': thumbnail,
2539 'duration': duration,
2540 'formats': formats,
2541 })
2542 return entries
2543
2544 def _extract_mpd_formats(self, *args, **kwargs):
2545 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2546 if subs:
2547 self._report_ignoring_subs('DASH')
2548 return fmts
2549
2550 def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2551 periods = self._extract_mpd_periods(*args, **kwargs)
2552 return self._merge_mpd_periods(periods)
2553
2554 def _extract_mpd_periods(
2555 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2556 fatal=True, data=None, headers={}, query={}):
2557
2558 if self.get_param('ignore_no_formats_error'):
2559 fatal = False
2560
2561 res = self._download_xml_handle(
2562 mpd_url, video_id,
2563 note='Downloading MPD manifest' if note is None else note,
2564 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2565 fatal=fatal, data=data, headers=headers, query=query)
2566 if res is False:
2567 return []
2568 mpd_doc, urlh = res
2569 if mpd_doc is None:
2570 return []
2571
2572 # We could have been redirected to a new url when we retrieved our mpd file.
2573 mpd_url = urlh.url
2574 mpd_base_url = base_url(mpd_url)
2575
2576 return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2577
2578 def _parse_mpd_formats(self, *args, **kwargs):
2579 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2580 if subs:
2581 self._report_ignoring_subs('DASH')
2582 return fmts
2583
2584 def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2585 periods = self._parse_mpd_periods(*args, **kwargs)
2586 return self._merge_mpd_periods(periods)
2587
2588 def _merge_mpd_periods(self, periods):
2589 """
2590 Combine all formats and subtitles from an MPD manifest into a single list,
2591 by concatenating streams with similar formats.
2592 """
2593 formats, subtitles = {}, {}
2594 for period in periods:
2595 for f in period['formats']:
2596 assert 'is_dash_periods' not in f, 'format already processed'
2597 f['is_dash_periods'] = True
2598 format_key = tuple(v for k, v in f.items() if k not in (
2599 'format_id', 'fragments', 'manifest_stream_number'))
2600 if format_key not in formats:
2601 formats[format_key] = f
2602 elif 'fragments' in f:
2603 formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2604
2605 if subtitles and period['subtitles']:
2606 self.report_warning(bug_reports_message(
2607 'Found subtitles in multiple periods in the DASH manifest; '
2608 'if part of the subtitles are missing,'
2609 ), only_once=True)
2610
2611 for sub_lang, sub_info in period['subtitles'].items():
2612 subtitles.setdefault(sub_lang, []).extend(sub_info)
2613
2614 return list(formats.values()), subtitles
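# e.g. if two periods each carry a 1080p stream whose attributes (other
# than format_id, fragments and manifest_stream_number) are identical,
# they are merged into a single format whose 'fragments' list is the
# concatenation of both periods' fragments.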
2615
2616 def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2617 """
2618 Parse formats from MPD manifest.
2619 References:
2620 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2621 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2622 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2623 """
2624 if not self.get_param('dynamic_mpd', True):
2625 if mpd_doc.get('type') == 'dynamic':
2626 return [], {}
2627
2628 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2629
2630 def _add_ns(path):
2631 return self._xpath_ns(path, namespace)
2632
2633 def is_drm_protected(element):
2634 return element.find(_add_ns('ContentProtection')) is not None
2635
2636 def extract_multisegment_info(element, ms_parent_info):
2637 ms_info = ms_parent_info.copy()
2638
2639 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2640 # common attributes and elements. We will only extract those
2641 # relevant for us.
2642 def extract_common(source):
2643 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2644 if segment_timeline is not None:
2645 s_e = segment_timeline.findall(_add_ns('S'))
2646 if s_e:
2647 ms_info['total_number'] = 0
2648 ms_info['s'] = []
2649 for s in s_e:
2650 r = int(s.get('r', 0))
2651 ms_info['total_number'] += 1 + r
2652 ms_info['s'].append({
2653 't': int(s.get('t', 0)),
2654 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2655 'd': int(s.attrib['d']),
2656 'r': r,
2657 })
2658 start_number = source.get('startNumber')
2659 if start_number:
2660 ms_info['start_number'] = int(start_number)
2661 timescale = source.get('timescale')
2662 if timescale:
2663 ms_info['timescale'] = int(timescale)
2664 segment_duration = source.get('duration')
2665 if segment_duration:
2666 ms_info['segment_duration'] = float(segment_duration)
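# e.g. <S t="0" d="90000" r="2"/> with timescale=90000 describes three
# consecutive one-second segments: the segment itself plus r=2 repeats.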
2667
2668 def extract_Initialization(source):
2669 initialization = source.find(_add_ns('Initialization'))
2670 if initialization is not None:
2671 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2672
2673 segment_list = element.find(_add_ns('SegmentList'))
2674 if segment_list is not None:
2675 extract_common(segment_list)
2676 extract_Initialization(segment_list)
2677 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2678 if segment_urls_e:
2679 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2680 else:
2681 segment_template = element.find(_add_ns('SegmentTemplate'))
2682 if segment_template is not None:
2683 extract_common(segment_template)
2684 media = segment_template.get('media')
2685 if media:
2686 ms_info['media'] = media
2687 initialization = segment_template.get('initialization')
2688 if initialization:
2689 ms_info['initialization'] = initialization
2690 else:
2691 extract_Initialization(segment_template)
2692 return ms_info
2693
2694 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2695 stream_numbers = collections.defaultdict(int)
2696 for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2697 period_entry = {
2698 'id': period.get('id', f'period-{period_idx}'),
2699 'formats': [],
2700 'subtitles': collections.defaultdict(list),
2701 }
2702 period_duration = parse_duration(period.get('duration')) or mpd_duration
2703 period_ms_info = extract_multisegment_info(period, {
2704 'start_number': 1,
2705 'timescale': 1,
2706 })
2707 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2708 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2709 for representation in adaptation_set.findall(_add_ns('Representation')):
2710 representation_attrib = adaptation_set.attrib.copy()
2711 representation_attrib.update(representation.attrib)
2712 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2713 mime_type = representation_attrib['mimeType']
2714 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2715
2716 codec_str = representation_attrib.get('codecs', '')
2717 # Some kind of binary subtitle found in some youtube livestreams
2718 if mime_type == 'application/x-rawcc':
2719 codecs = {'scodec': codec_str}
2720 else:
2721 codecs = parse_codecs(codec_str)
2722 if content_type not in ('video', 'audio', 'text'):
2723 if mime_type == 'image/jpeg':
2724 content_type = mime_type
2725 elif codecs.get('vcodec', 'none') != 'none':
2726 content_type = 'video'
2727 elif codecs.get('acodec', 'none') != 'none':
2728 content_type = 'audio'
2729 elif codecs.get('scodec', 'none') != 'none':
2730 content_type = 'text'
2731 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2732 content_type = 'text'
2733 else:
2734 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2735 continue
2736
2737 base_url = ''
2738 for element in (representation, adaptation_set, period, mpd_doc):
2739 base_url_e = element.find(_add_ns('BaseURL'))
2740 if try_call(lambda: base_url_e.text) is not None:
2741 base_url = base_url_e.text + base_url
2742 if re.match(r'^https?://', base_url):
2743 break
2744 if mpd_base_url and base_url.startswith('/'):
2745 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2746 elif mpd_base_url and not re.match(r'^https?://', base_url):
2747 if not mpd_base_url.endswith('/'):
2748 mpd_base_url += '/'
2749 base_url = mpd_base_url + base_url
2750 representation_id = representation_attrib.get('id')
2751 lang = representation_attrib.get('lang')
2752 url_el = representation.find(_add_ns('BaseURL'))
2753 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2754 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2755 if representation_id is not None:
2756 format_id = representation_id
2757 else:
2758 format_id = content_type
2759 if mpd_id:
2760 format_id = mpd_id + '-' + format_id
2761 if content_type in ('video', 'audio'):
2762 f = {
2763 'format_id': format_id,
2764 'manifest_url': mpd_url,
2765 'ext': mimetype2ext(mime_type),
2766 'width': int_or_none(representation_attrib.get('width')),
2767 'height': int_or_none(representation_attrib.get('height')),
2768 'tbr': float_or_none(bandwidth, 1000),
2769 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2770 'fps': int_or_none(representation_attrib.get('frameRate')),
2771 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2772 'format_note': 'DASH %s' % content_type,
2773 'filesize': filesize,
2774 'container': mimetype2ext(mime_type) + '_dash',
2775 **codecs
2776 }
2777 elif content_type == 'text':
2778 f = {
2779 'ext': mimetype2ext(mime_type),
2780 'manifest_url': mpd_url,
2781 'filesize': filesize,
2782 }
2783 elif content_type == 'image/jpeg':
2784 # See test case in VikiIE
2785 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2786 f = {
2787 'format_id': format_id,
2788 'ext': 'mhtml',
2789 'manifest_url': mpd_url,
2790 'format_note': 'DASH storyboards (jpeg)',
2791 'acodec': 'none',
2792 'vcodec': 'none',
2793 }
2794 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2795 f['has_drm'] = True
2796 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2797
2798 def prepare_template(template_name, identifiers):
2799 tmpl = representation_ms_info[template_name]
2800 if representation_id is not None:
2801 tmpl = tmpl.replace('$RepresentationID$', representation_id)
2802 # First off, % characters outside $...$ templates
2803 # must be escaped by doubling for proper processing
2804 # by % operator string formatting used further (see
2805 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2806 t = ''
2807 in_template = False
2808 for c in tmpl:
2809 t += c
2810 if c == '$':
2811 in_template = not in_template
2812 elif c == '%' and not in_template:
2813 t += c
2814 # Next, $...$ templates are translated to their
2815 # %(...) counterparts to be used with % operator
2816 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2817 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2818 t = t.replace('$$', '$')
2819 return t
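# e.g. a media template 'seg-$Number%05d$.m4s' becomes
# 'seg-%(Number)05d.m4s', so that
#   'seg-%(Number)05d.m4s' % {'Number': 7}
# evaluates to 'seg-00007.m4s' (template string illustrative).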
2820
2821 # @initialization is a regular template like the @media one,
2822 # so it should be handled in just the same way (see
2823 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2824 if 'initialization' in representation_ms_info:
2825 initialization_template = prepare_template(
2826 'initialization',
2827 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2828 # $Time$ shall not be included for @initialization thus
2829 # only $Bandwidth$ remains
2830 ('Bandwidth', ))
2831 representation_ms_info['initialization_url'] = initialization_template % {
2832 'Bandwidth': bandwidth,
2833 }
2834
2835 def location_key(location):
2836 return 'url' if re.match(r'^https?://', location) else 'path'
2837
2838 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2839
2840 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2841 media_location_key = location_key(media_template)
2842
2843 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2844 # can't be used at the same time
2845 if '%(Number' in media_template and 's' not in representation_ms_info:
2846 segment_duration = None
2847 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2848 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2849 representation_ms_info['total_number'] = int(math.ceil(
2850 float_or_none(period_duration, segment_duration, default=0)))
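# e.g. (hypothetical numbers) a 60s period with a segment_duration
# of 360000 at timescale 60000 gives 6s segments, hence
# ceil(60 / 6) = 10 fragments in total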
2851 representation_ms_info['fragments'] = [{
2852 media_location_key: media_template % {
2853 'Number': segment_number,
2854 'Bandwidth': bandwidth,
2855 },
2856 'duration': segment_duration,
2857 } for segment_number in range(
2858 representation_ms_info['start_number'],
2859 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2860 else:
2861 # $Number*$ or $Time$ in media template with S list available
2862 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2863 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
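# For instance, a hypothetical SegmentTimeline entry
# <S t="0" d="90000" r="2"/> at timescale 90000 expands below into
# three 1-second fragments with $Time$ values 0, 90000 and 180000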
2864 representation_ms_info['fragments'] = []
2865 segment_time = 0
2866 segment_d = None
2867 segment_number = representation_ms_info['start_number']
2868
2869 def add_segment_url():
2870 segment_url = media_template % {
2871 'Time': segment_time,
2872 'Bandwidth': bandwidth,
2873 'Number': segment_number,
2874 }
2875 representation_ms_info['fragments'].append({
2876 media_location_key: segment_url,
2877 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2878 })
2879
2880 for num, s in enumerate(representation_ms_info['s']):
2881 segment_time = s.get('t') or segment_time
2882 segment_d = s['d']
2883 add_segment_url()
2884 segment_number += 1
2885 for r in range(s.get('r', 0)):
2886 segment_time += segment_d
2887 add_segment_url()
2888 segment_number += 1
2889 segment_time += segment_d
2890 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2891 # No media template,
2892 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2893 # or any YouTube dashsegments video
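# Each <S> entry is consumed s['r'] + 1 times here, pairing every
# repetition with the next URL from segment_urls in order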
2894 fragments = []
2895 segment_index = 0
2896 timescale = representation_ms_info['timescale']
2897 for s in representation_ms_info['s']:
2898 duration = float_or_none(s['d'], timescale)
2899 for r in range(s.get('r', 0) + 1):
2900 segment_uri = representation_ms_info['segment_urls'][segment_index]
2901 fragments.append({
2902 location_key(segment_uri): segment_uri,
2903 'duration': duration,
2904 })
2905 segment_index += 1
2906 representation_ms_info['fragments'] = fragments
2907 elif 'segment_urls' in representation_ms_info:
2908 # Segment URLs with no SegmentTimeline
2909 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2910 # https://github.com/ytdl-org/youtube-dl/pull/14844
2911 fragments = []
2912 segment_duration = float_or_none(
2913 representation_ms_info['segment_duration'],
2914 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2915 for segment_url in representation_ms_info['segment_urls']:
2916 fragment = {
2917 location_key(segment_url): segment_url,
2918 }
2919 if segment_duration:
2920 fragment['duration'] = segment_duration
2921 fragments.append(fragment)
2922 representation_ms_info['fragments'] = fragments
2923 # If a 'fragments' key is present, we have correctly recognized fragmented media.
2924 # Otherwise we assume unfragmented media with direct access. Technically, this
2925 # assumption is not necessarily correct, since we may simply not support some
2926 # forms of fragmented media renditions yet, but for now we use this fallback.
2927 if 'fragments' in representation_ms_info:
2928 f.update({
2929 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2930 'url': mpd_url or base_url,
2931 'fragment_base_url': base_url,
2932 'fragments': [],
2933 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2934 })
2935 if 'initialization_url' in representation_ms_info:
2936 initialization_url = representation_ms_info['initialization_url']
2937 if not f.get('url'):
2938 f['url'] = initialization_url
2939 f['fragments'].append({location_key(initialization_url): initialization_url})
2940 f['fragments'].extend(representation_ms_info['fragments'])
2941 if not period_duration:
2942 period_duration = try_get(
2943 representation_ms_info,
2944 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2945 else:
2946 # Assuming direct URL to unfragmented media.
2947 f['url'] = base_url
2948 if content_type in ('video', 'audio', 'image/jpeg'):
2949 f['manifest_stream_number'] = stream_numbers[f['url']]
2950 stream_numbers[f['url']] += 1
2951 period_entry['formats'].append(f)
2952 elif content_type == 'text':
2953 period_entry['subtitles'][lang or 'und'].append(f)
2954 yield period_entry
2955
2956 def _extract_ism_formats(self, *args, **kwargs):
2957 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2958 if subs:
2959 self._report_ignoring_subs('ISM')
2960 return fmts
2961
2962 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2963 if self.get_param('ignore_no_formats_error'):
2964 fatal = False
2965
2966 res = self._download_xml_handle(
2967 ism_url, video_id,
2968 note='Downloading ISM manifest' if note is None else note,
2969 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2970 fatal=fatal, data=data, headers=headers, query=query)
2971 if res is False:
2972 return [], {}
2973 ism_doc, urlh = res
2974 if ism_doc is None:
2975 return [], {}
2976
2977 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2978
2979 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2980 """
2981 Parse formats from ISM manifest.
2982 References:
2983 1. [MS-SSTR]: Smooth Streaming Protocol,
2984 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2985 """
2986 if ism_doc.get('IsLive') == 'TRUE':
2987 return [], {}
2988
2989 duration = int(ism_doc.attrib['Duration'])
2990 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2991
2992 formats = []
2993 subtitles = {}
2994 for stream in ism_doc.findall('StreamIndex'):
2995 stream_type = stream.get('Type')
2996 if stream_type not in ('video', 'audio', 'text'):
2997 continue
2998 url_pattern = stream.attrib['Url']
2999 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3000 stream_name = stream.get('Name')
3001 stream_language = stream.get('Language', 'und')
3002 for track in stream.findall('QualityLevel'):
3003 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3004 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3005 # TODO: add support for WVC1 and WMAP
3006 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3007 self.report_warning('%s is not a supported codec' % fourcc)
3008 continue
3009 tbr = int(track.attrib['Bitrate']) // 1000
3010 # [1] does not mention Width and Height attributes. However,
3011 # they're often present while MaxWidth and MaxHeight are
3012 # missing, so should be used as fallbacks
3013 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3014 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3015 sampling_rate = int_or_none(track.get('SamplingRate'))
3016
3017 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3018 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
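# e.g. the common [MS-SSTR] pattern
# 'QualityLevels({bitrate})/Fragments(video={start time})' becomes
# 'QualityLevels(1500000)/Fragments(video={start time})' for a
# 1500000 bps track; {start time} is then filled in per fragment below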
3019
3020 fragments = []
3021 fragment_ctx = {
3022 'time': 0,
3023 }
3024 stream_fragments = stream.findall('c')
3025 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3026 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3027 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3028 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3029 if not fragment_ctx['duration']:
3030 try:
3031 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
3032 except IndexError:
3033 next_fragment_time = duration
3034 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3035 for _ in range(fragment_repeat):
3036 fragments.append({
3037 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3038 'duration': fragment_ctx['duration'] / stream_timescale,
3039 })
3040 fragment_ctx['time'] += fragment_ctx['duration']
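# e.g. a hypothetical <c t="0" d="20000000"/> at the default
# timescale of 10000000 yields a single 2.0s fragment; when @d is
# missing, the duration is inferred from the next fragment's @t
# (or from the total duration for the last fragment)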
3041
3042 if stream_type == 'text':
3043 subtitles.setdefault(stream_language, []).append({
3044 'ext': 'ismt',
3045 'protocol': 'ism',
3046 'url': ism_url,
3047 'manifest_url': ism_url,
3048 'fragments': fragments,
3049 '_download_params': {
3050 'stream_type': stream_type,
3051 'duration': duration,
3052 'timescale': stream_timescale,
3053 'fourcc': fourcc,
3054 'language': stream_language,
3055 'codec_private_data': track.get('CodecPrivateData'),
3056 }
3057 })
3058 elif stream_type in ('video', 'audio'):
3059 formats.append({
3060 'format_id': join_nonempty(ism_id, stream_name, tbr),
3061 'url': ism_url,
3062 'manifest_url': ism_url,
3063 'ext': 'ismv' if stream_type == 'video' else 'isma',
3064 'width': width,
3065 'height': height,
3066 'tbr': tbr,
3067 'asr': sampling_rate,
3068 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3069 'acodec': 'none' if stream_type == 'video' else fourcc,
3070 'protocol': 'ism',
3071 'fragments': fragments,
3072 'has_drm': ism_doc.find('Protection') is not None,
3073 'language': stream_language,
3074 'audio_channels': int_or_none(track.get('Channels')),
3075 '_download_params': {
3076 'stream_type': stream_type,
3077 'duration': duration,
3078 'timescale': stream_timescale,
3079 'width': width or 0,
3080 'height': height or 0,
3081 'fourcc': fourcc,
3082 'language': stream_language,
3083 'codec_private_data': track.get('CodecPrivateData'),
3084 'sampling_rate': sampling_rate,
3085 'channels': int_or_none(track.get('Channels', 2)),
3086 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3087 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3088 },
3089 })
3090 return formats, subtitles
3091
3092 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3093 def absolute_url(item_url):
3094 return urljoin(base_url, item_url)
3095
3096 def parse_content_type(content_type):
3097 if not content_type:
3098 return {}
3099 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3100 if ctr:
3101 mimetype, codecs = ctr.groups()
3102 f = parse_codecs(codecs)
3103 f['ext'] = mimetype2ext(mimetype)
3104 return f
3105 return {}
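# e.g. parse_content_type('video/mp4; codecs="avc1.64001F, mp4a.40.2"')
# returns roughly {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2',
# 'ext': 'mp4'} (parse_codecs may add further keys)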
3106
3107 def _media_formats(src, cur_media_type, type_info=None):
3108 type_info = type_info or {}
3109 full_url = absolute_url(src)
3110 ext = type_info.get('ext') or determine_ext(full_url)
3111 if ext == 'm3u8':
3112 is_plain_url = False
3113 formats = self._extract_m3u8_formats(
3114 full_url, video_id, ext='mp4',
3115 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3116 preference=preference, quality=quality, fatal=False)
3117 elif ext == 'mpd':
3118 is_plain_url = False
3119 formats = self._extract_mpd_formats(
3120 full_url, video_id, mpd_id=mpd_id, fatal=False)
3121 else:
3122 is_plain_url = True
3123 formats = [{
3124 'url': full_url,
3125 'vcodec': 'none' if cur_media_type == 'audio' else None,
3126 'ext': ext,
3127 }]
3128 return is_plain_url, formats
3129
3130 entries = []
3131 # amp-video and amp-audio are very similar to their HTML5 counterparts
3132 # so we will include them right here (see
3133 # https://www.ampproject.org/docs/reference/components/amp-video)
3134 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3135 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3136 media_tags = [(media_tag, media_tag_name, media_type, '')
3137 for media_tag, media_tag_name, media_type
3138 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3139 media_tags.extend(re.findall(
3140 # We only allow video|audio followed by a whitespace or '>'.
3141 # Allowing more characters may result in a significant slowdown (see
3142 # https://github.com/ytdl-org/youtube-dl/issues/11979,
3143 # e.g. http://www.porntrex.com/maps/videositemap.xml).
3144 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
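# e.g. this picks up both self-closing tags like <amp-video src="..."/>
# and paired ones like <video controls><source src="..."></video>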
3145 for media_tag, _, media_type, media_content in media_tags:
3146 media_info = {
3147 'formats': [],
3148 'subtitles': {},
3149 }
3150 media_attributes = extract_attributes(media_tag)
3151 src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3152 if src:
3153 f = parse_content_type(media_attributes.get('type'))
3154 _, formats = _media_formats(src, media_type, f)
3155 media_info['formats'].extend(formats)
3156 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3157 if media_content:
3158 for source_tag in re.findall(r'<source[^>]+>', media_content):
3159 s_attr = extract_attributes(source_tag)
3160 # data-video-src and data-src are non-standard but seen
3161 # several times in the wild
3162 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3163 if not src:
3164 continue
3165 f = parse_content_type(s_attr.get('type'))
3166 is_plain_url, formats = _media_formats(src, media_type, f)
3167 if is_plain_url:
3168 # width, height, res, label and title attributes are
3169 # all non-standard but seen several times in the wild
3170 labels = [
3171 s_attr.get(lbl)
3172 for lbl in ('label', 'title')
3173 if str_or_none(s_attr.get(lbl))
3174 ]
3175 width = int_or_none(s_attr.get('width'))
3176 height = (int_or_none(s_attr.get('height'))
3177 or int_or_none(s_attr.get('res')))
3178 if not width or not height:
3179 for lbl in labels:
3180 resolution = parse_resolution(lbl)
3181 if not resolution:
3182 continue
3183 width = width or resolution.get('width')
3184 height = height or resolution.get('height')
3185 for lbl in labels:
3186 tbr = parse_bitrate(lbl)
3187 if tbr:
3188 break
3189 else:
3190 tbr = None
3191 f.update({
3192 'width': width,
3193 'height': height,
3194 'tbr': tbr,
3195 'format_id': s_attr.get('label') or s_attr.get('title'),
3196 })
3197 f.update(formats[0])
3198 media_info['formats'].append(f)
3199 else:
3200 media_info['formats'].extend(formats)
3201 for track_tag in re.findall(r'<track[^>]+>', media_content):
3202 track_attributes = extract_attributes(track_tag)
3203 kind = track_attributes.get('kind')
3204 if not kind or kind in ('subtitles', 'captions'):
3205 src = strip_or_none(track_attributes.get('src'))
3206 if not src:
3207 continue
3208 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3209 media_info['subtitles'].setdefault(lang, []).append({
3210 'url': absolute_url(src),
3211 })
3212 for f in media_info['formats']:
3213 f.setdefault('http_headers', {})['Referer'] = base_url
3214 if media_info['formats'] or media_info['subtitles']:
3215 entries.append(media_info)
3216 return entries
3217
3218 def _extract_akamai_formats(self, *args, **kwargs):
3219 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3220 if subs:
3221 self._report_ignoring_subs('akamai')
3222 return fmts
3223
3224 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3225 signed = 'hdnea=' in manifest_url
3226 if not signed:
3227 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3228 manifest_url = re.sub(
3229 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3230 '', manifest_url).strip('?')
3231
3232 formats = []
3233 subtitles = {}
3234
3235 hdcore_sign = 'hdcore=3.7.0'
3236 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3237 hds_host = hosts.get('hds')
3238 if hds_host:
3239 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3240 if 'hdcore=' not in f4m_url:
3241 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3242 f4m_formats = self._extract_f4m_formats(
3243 f4m_url, video_id, f4m_id='hds', fatal=False)
3244 for entry in f4m_formats:
3245 entry.update({'extra_param_to_segment_url': hdcore_sign})
3246 formats.extend(f4m_formats)
3247
3248 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3249 hls_host = hosts.get('hls')
3250 if hls_host:
3251 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3252 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3253 m3u8_url, video_id, 'mp4', 'm3u8_native',
3254 m3u8_id='hls', fatal=False)
3255 formats.extend(m3u8_formats)
3256 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3257
3258 http_host = hosts.get('http')
3259 if http_host and m3u8_formats and not signed:
3260 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3261 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
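# e.g. for a hypothetical m3u8_url of
# 'https://host/i/dir/video_,300,600,.mp4.csmil/master.m3u8' this
# yields qualities ['300', '600'], and each HLS format below is
# remapped to a progressive URL like
# 'https://<http_host>/dir/video_300.mp4'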
3262 qualities_length = len(qualities)
3263 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3264 i = 0
3265 for f in m3u8_formats:
3266 if f['vcodec'] != 'none':
3267 for protocol in ('http', 'https'):
3268 http_f = f.copy()
3269 del http_f['manifest_url']
3270 http_url = re.sub(
3271 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3272 http_f.update({
3273 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3274 'url': http_url,
3275 'protocol': protocol,
3276 })
3277 formats.append(http_f)
3278 i += 1
3279
3280 return formats, subtitles
3281
3282 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3283 query = urllib.parse.urlparse(url).query
3284 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3285 mobj = re.search(
3286 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3287 url_base = mobj.group('url')
3288 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
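# e.g. a hypothetical 'https://host/vod/mp4:video.mp4/playlist.m3u8'
# is reduced to url_base '//host/vod/mp4:video.mp4' and
# http_base_url 'https://host/vod/mp4:video.mp4', from which the
# per-protocol manifest URLs below are rebuilt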
3289 formats = []
3290
3291 def manifest_url(manifest):
3292 m_url = f'{http_base_url}/{manifest}'
3293 if query:
3294 m_url += '?%s' % query
3295 return m_url
3296
3297 if 'm3u8' not in skip_protocols:
3298 formats.extend(self._extract_m3u8_formats(
3299 manifest_url('playlist.m3u8'), video_id, 'mp4',
3300 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3301 if 'f4m' not in skip_protocols:
3302 formats.extend(self._extract_f4m_formats(
3303 manifest_url('manifest.f4m'),
3304 video_id, f4m_id='hds', fatal=False))
3305 if 'dash' not in skip_protocols:
3306 formats.extend(self._extract_mpd_formats(
3307 manifest_url('manifest.mpd'),
3308 video_id, mpd_id='dash', fatal=False))
3309 if re.search(r'(?:/smil:|\.smil)', url_base):
3310 if 'smil' not in skip_protocols:
3311 rtmp_formats = self._extract_smil_formats(
3312 manifest_url('jwplayer.smil'),
3313 video_id, fatal=False)
3314 for rtmp_format in rtmp_formats:
3315 rtsp_format = rtmp_format.copy()
3316 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3317 del rtsp_format['play_path']
3318 del rtsp_format['ext']
3319 rtsp_format.update({
3320 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3321 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3322 'protocol': 'rtsp',
3323 })
3324 formats.extend([rtmp_format, rtsp_format])
3325 else:
3326 for protocol in ('rtmp', 'rtsp'):
3327 if protocol not in skip_protocols:
3328 formats.append({
3329 'url': f'{protocol}:{url_base}',
3330 'format_id': protocol,
3331 'protocol': protocol,
3332 })
3333 return formats
3334
3335 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3336 mobj = re.search(
3337 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3338 webpage)
3339 if mobj:
3340 try:
3341 jwplayer_data = self._parse_json(mobj.group('options'),
3342 video_id=video_id,
3343 transform_source=transform_source)
3344 except ExtractorError:
3345 pass
3346 else:
3347 if isinstance(jwplayer_data, dict):
3348 return jwplayer_data
3349
3350 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3351 jwplayer_data = self._find_jwplayer_data(
3352 webpage, video_id, transform_source=js_to_json)
3353 return self._parse_jwplayer_data(
3354 jwplayer_data, video_id, *args, **kwargs)
3355
3356 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3357 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3358 entries = []
3359 if not isinstance(jwplayer_data, dict):
3360 return entries
3361
3362 playlist_items = jwplayer_data.get('playlist')
3363 # JWPlayer backward compatibility: single playlist item/flattened playlists
3364 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3365 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3366 if not isinstance(playlist_items, list):
3367 playlist_items = (playlist_items or jwplayer_data, )
3368
3369 for video_data in playlist_items:
3370 if not isinstance(video_data, dict):
3371 continue
3372 # JWPlayer backward compatibility: flattened sources
3373 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3374 if 'sources' not in video_data:
3375 video_data['sources'] = [video_data]
3376
3377 this_video_id = video_id or video_data['mediaid']
3378
3379 formats = self._parse_jwplayer_formats(
3380 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3381 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3382
3383 subtitles = {}
3384 tracks = video_data.get('tracks')
3385 if tracks and isinstance(tracks, list):
3386 for track in tracks:
3387 if not isinstance(track, dict):
3388 continue
3389 track_kind = track.get('kind')
3390 if not track_kind or not isinstance(track_kind, str):
3391 continue
3392 if track_kind.lower() not in ('captions', 'subtitles'):
3393 continue
3394 track_url = urljoin(base_url, track.get('file'))
3395 if not track_url:
3396 continue
3397 subtitles.setdefault(track.get('label') or 'en', []).append({
3398 'url': self._proto_relative_url(track_url)
3399 })
3400
3401 entry = {
3402 'id': this_video_id,
3403 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3404 'description': clean_html(video_data.get('description')),
3405 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3406 'timestamp': int_or_none(video_data.get('pubdate')),
3407 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3408 'subtitles': subtitles,
3409 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
3410 'genre': clean_html(video_data.get('genre')),
3411 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3412 'season_number': int_or_none(video_data.get('season')),
3413 'episode_number': int_or_none(video_data.get('episode')),
3414 'release_year': int_or_none(video_data.get('releasedate')),
3415 'age_limit': int_or_none(video_data.get('age_restriction')),
3416 }
3417 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3418 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3419 entry.update({
3420 '_type': 'url_transparent',
3421 'url': formats[0]['url'],
3422 })
3423 else:
3424 entry['formats'] = formats
3425 entries.append(entry)
3426 if len(entries) == 1:
3427 return entries[0]
3428 else:
3429 return self.playlist_result(entries)
3430
3431 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3432 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3433 urls = set()
3434 formats = []
3435 for source in jwplayer_sources_data:
3436 if not isinstance(source, dict):
3437 continue
3438 source_url = urljoin(
3439 base_url, self._proto_relative_url(source.get('file')))
3440 if not source_url or source_url in urls:
3441 continue
3442 urls.add(source_url)
3443 source_type = source.get('type') or ''
3444 ext = mimetype2ext(source_type) or determine_ext(source_url)
3445 if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3446 formats.extend(self._extract_m3u8_formats(
3447 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3448 m3u8_id=m3u8_id, fatal=False))
3449 elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3450 formats.extend(self._extract_mpd_formats(
3451 source_url, video_id, mpd_id=mpd_id, fatal=False))
3452 elif ext == 'smil':
3453 formats.extend(self._extract_smil_formats(
3454 source_url, video_id, fatal=False))
3455 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3456 elif source_type.startswith('audio') or ext in (
3457 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3458 formats.append({
3459 'url': source_url,
3460 'vcodec': 'none',
3461 'ext': ext,
3462 })
3463 else:
3464 format_id = str_or_none(source.get('label'))
3465 height = int_or_none(source.get('height'))
3466 if height is None and format_id:
3467 # Often no height is provided but there is a label in
3468 # a format like "1080p", "720p SD", or 1080.
3469 height = parse_resolution(format_id).get('height')
3470 a_format = {
3471 'url': source_url,
3472 'width': int_or_none(source.get('width')),
3473 'height': height,
3474 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3475 'filesize': int_or_none(source.get('filesize')),
3476 'ext': ext,
3477 'format_id': format_id
3478 }
3479 if source_url.startswith('rtmp'):
3480 a_format['ext'] = 'flv'
3481 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3482 # of jwplayer.flash.swf
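# e.g. a hypothetical 'rtmp://host/app/mp4:videos/clip.mp4' is split
# into url 'rtmp://host/app/' and play_path 'mp4:videos/clip.mp4'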
3483 rtmp_url_parts = re.split(
3484 r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3485 if len(rtmp_url_parts) == 3:
3486 rtmp_url, prefix, play_path = rtmp_url_parts
3487 a_format.update({
3488 'url': rtmp_url,
3489 'play_path': prefix + play_path,
3490 })
3491 if rtmp_params:
3492 a_format.update(rtmp_params)
3493 formats.append(a_format)
3494 return formats
3495
3496 def _live_title(self, name):
3497 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3498 return name
3499
3500 def _int(self, v, name, fatal=False, **kwargs):
3501 res = int_or_none(v, **kwargs)
3502 if res is None:
3503 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3504 if fatal:
3505 raise ExtractorError(msg)
3506 else:
3507 self.report_warning(msg)
3508 return res
3509
3510 def _float(self, v, name, fatal=False, **kwargs):
3511 res = float_or_none(v, **kwargs)
3512 if res is None:
3513 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3514 if fatal:
3515 raise ExtractorError(msg)
3516 else:
3517 self.report_warning(msg)
3518 return res
3519
3520 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3521 path='/', secure=False, discard=False, rest={}, **kwargs):
3522 cookie = http.cookiejar.Cookie(
3523 0, name, value, port, port is not None, domain, True,
3524 domain.startswith('.'), path, True, secure, expire_time,
3525 discard, None, None, rest)
3526 self.cookiejar.set_cookie(cookie)
3527
3528 def _get_cookies(self, url):
3529 """ Return a http.cookies.SimpleCookie with the cookies for the url """
3530 return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3531
3532 def _apply_first_set_cookie_header(self, url_handle, cookie):
3533 """
3534 Apply first Set-Cookie header instead of the last. Experimental.
3535
3536 Some sites (e.g. [1-3]) may serve two cookies under the same name
3537 in the Set-Cookie header and expect the first (old) one to be set
3538 rather than the second (new) one. However, per RFC 6265, the newer
3539 cookie should be set into the cookie store, which is what actually
3540 happens. We work around this issue by manually resetting the cookie
3541 to the first one.
3542 1. https://new.vk.com/
3543 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3544 3. https://learning.oreilly.com/
3545 """
3546 for header, cookies in url_handle.headers.items():
3547 if header.lower() != 'set-cookie':
3548 continue
3549 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3550 cookie_value = re.search(
3551 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3552 if cookie_value:
3553 value, domain = cookie_value.groups()
3554 self._set_cookie(domain, cookie, value)
3555 break
3556
3557 @classmethod
3558 def get_testcases(cls, include_onlymatching=False):
3559 # Do not look in super classes
3560 t = vars(cls).get('_TEST')
3561 if t:
3562 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3563 tests = [t]
3564 else:
3565 tests = vars(cls).get('_TESTS', [])
3566 for t in tests:
3567 if not include_onlymatching and t.get('only_matching', False):
3568 continue
3569 t['name'] = cls.ie_key()
3570 yield t
3571 if getattr(cls, '__wrapped__', None):
3572 yield from cls.__wrapped__.get_testcases(include_onlymatching)
3573
3574 @classmethod
3575 def get_webpage_testcases(cls):
3576 tests = vars(cls).get('_WEBPAGE_TESTS', [])
3577 for t in tests:
3578 t['name'] = cls.ie_key()
3579 yield t
3580 if getattr(cls, '__wrapped__', None):
3581 yield from cls.__wrapped__.get_webpage_testcases()
3582
3583 @classproperty(cache=True)
3584 def age_limit(cls):
3585 """Get age limit from the testcases"""
3586 return max(traverse_obj(
3587 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3588 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3589
3590 @classproperty(cache=True)
3591 def _RETURN_TYPE(cls):
3592 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3593 tests = tuple(cls.get_testcases(include_onlymatching=False))
3594 if not tests:
3595 return None
3596 elif not any(k.startswith('playlist') for test in tests for k in test):
3597 return 'video'
3598 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3599 return 'playlist'
3600 return 'any'
3601
3602 @classmethod
3603 def is_single_video(cls, url):
3604 """Returns whether the URL is of a single video, None if unknown"""
3605 if cls.suitable(url):
3606 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3607
3608 @classmethod
3609 def is_suitable(cls, age_limit):
3610 """Test whether the extractor is generally suitable for the given age limit"""
3611 return not age_restricted(cls.age_limit, age_limit)
3612
3613 @classmethod
3614 def description(cls, *, markdown=True, search_examples=None):
3615 """Description of the extractor"""
3616 desc = ''
3617 if cls._NETRC_MACHINE:
3618 if markdown:
3619 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3620 else:
3621 desc += f' [{cls._NETRC_MACHINE}]'
3622 if cls.IE_DESC is False:
3623 desc += ' [HIDDEN]'
3624 elif cls.IE_DESC:
3625 desc += f' {cls.IE_DESC}'
3626 if cls.SEARCH_KEY:
3627 desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3628 if search_examples:
3629 _COUNTS = ('', '5', '10', 'all')
3630 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3631 if not cls.working():
3632 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3633
3634 # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3635 name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3636 return f'{name}:{desc}' if desc else name
3637
3638 def extract_subtitles(self, *args, **kwargs):
3639 if (self.get_param('writesubtitles', False)
3640 or self.get_param('listsubtitles')):
3641 return self._get_subtitles(*args, **kwargs)
3642 return {}
3643
3644 def _get_subtitles(self, *args, **kwargs):
3645 raise NotImplementedError('This method must be implemented by subclasses')
3646
3647 class CommentsDisabled(Exception):
3648 """Raise in _get_comments if comments are disabled for the video"""
3649
3650 def extract_comments(self, *args, **kwargs):
3651 if not self.get_param('getcomments'):
3652 return None
3653 generator = self._get_comments(*args, **kwargs)
3654
3655 def extractor():
3656 comments = []
3657 interrupted = True
3658 try:
3659 while True:
3660 comments.append(next(generator))
3661 except StopIteration:
3662 interrupted = False
3663 except KeyboardInterrupt:
3664 self.to_screen('Interrupted by user')
3665 except self.CommentsDisabled:
3666 return {'comments': None, 'comment_count': None}
3667 except Exception as e:
3668 if self.get_param('ignoreerrors') is not True:
3669 raise
3670 self._downloader.report_error(e)
3671 comment_count = len(comments)
3672 self.to_screen(f'Extracted {comment_count} comments')
3673 return {
3674 'comments': comments,
3675 'comment_count': None if interrupted else comment_count
3676 }
3677 return extractor
3678
3679 def _get_comments(self, *args, **kwargs):
3680 raise NotImplementedError('This method must be implemented by subclasses')
3681
3682 @staticmethod
3683 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3684 """ Merge subtitle items for one language. Items with duplicated URLs/data
3685 will be dropped. """
3686 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3687 ret = list(subtitle_list1)
3688 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3689 return ret
3690
3691 @classmethod
3692 def _merge_subtitles(cls, *dicts, target=None):
3693 """ Merge subtitle dictionaries, language by language. """
3694 if target is None:
3695 target = {}
3696 for d in dicts:
3697 for lang, subs in d.items():
3698 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3699 return target
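# Schematically, _merge_subtitles({'en': [a]}, {'en': [b], 'fr': [c]})
# returns {'en': [a, b], 'fr': [c]}, with b dropped if it carries the
# same URL/data as a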
3700
3701 def extract_automatic_captions(self, *args, **kwargs):
3702 if (self.get_param('writeautomaticsub', False)
3703 or self.get_param('listsubtitles')):
3704 return self._get_automatic_captions(*args, **kwargs)
3705 return {}
3706
3707 def _get_automatic_captions(self, *args, **kwargs):
3708 raise NotImplementedError('This method must be implemented by subclasses')
3709
3710 @functools.cached_property
3711 def _cookies_passed(self):
3712 """Whether cookies have been passed to YoutubeDL"""
3713 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3714
3715 def mark_watched(self, *args, **kwargs):
3716 if not self.get_param('mark_watched', False):
3717 return
3718 if (self.supports_login() and self._get_login_info()[0] is not None) or self._cookies_passed:
3719 self._mark_watched(*args, **kwargs)
3720
3721 def _mark_watched(self, *args, **kwargs):
3722 raise NotImplementedError('This method must be implemented by subclasses')
3723
3724 def geo_verification_headers(self):
3725 headers = {}
3726 geo_verification_proxy = self.get_param('geo_verification_proxy')
3727 if geo_verification_proxy:
3728 headers['Ytdl-request-proxy'] = geo_verification_proxy
3729 return headers
3730
3731 @staticmethod
3732 def _generic_id(url):
3733 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3734
3735 def _generic_title(self, url='', webpage='', *, default=None):
3736 return (self._og_search_title(webpage, default=None)
3737 or self._html_extract_title(webpage, default=None)
3738 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3739 or default)
3740
3741 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3742 if not duration:
3743 return
3744 chapter_list = [{
3745 'start_time': start_function(chapter),
3746 'title': title_function(chapter),
3747 } for chapter in chapter_list or []]
3748 if strict:
3749 warn = self.report_warning
3750 else:
3751 warn = self.write_debug
3752 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3753
3754 chapters = [{'start_time': 0}]
3755 for idx, chapter in enumerate(chapter_list):
3756 if chapter['start_time'] is None:
3757 warn(f'Incomplete chapter {idx}')
3758 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3759 chapters.append(chapter)
3760 elif chapter not in chapters:
3761 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3762 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3763 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3764 return chapters[1:]
3765
3766 def _extract_chapters_from_description(self, description, duration):
3767 duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3768 sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3769 return self._extract_chapters_helper(
3770 re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3771 start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3772 duration=duration, strict=False) or self._extract_chapters_helper(
3773 re.findall(sep_re % (r'.+?', duration_re), description or ''),
3774 start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3775 duration=duration, strict=False)
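# e.g. a description containing the lines '00:00 Intro' and
# '01:23 Main topic' produces (given a sufficient total duration)
# chapters starting at 0s and 83s titled 'Intro' and 'Main topic'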
3776
3777 @staticmethod
3778 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3779 all_known = all(map(
3780 lambda x: x is not None,
3781 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3782 return (
3783 'private' if is_private
3784 else 'premium_only' if needs_premium
3785 else 'subscriber_only' if needs_subscription
3786 else 'needs_auth' if needs_auth
3787 else 'unlisted' if is_unlisted
3788 else 'public' if all_known
3789 else None)
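# e.g. _availability(is_unlisted=True) -> 'unlisted', while
# _availability() -> None (nothing known) and
# _availability(False, False, False, False, False) -> 'public'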
3790
3791 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3792 '''
3793 @returns A list of values for the extractor argument given by "key"
3794 or "default" if no such key is present
3795 @param default The default value to return when the key is not present (default: [])
3796 @param casesense When false, the values are converted to lower case
3797 '''
3798 ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3799 val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3800 if val is None:
3801 return [] if default is NO_DEFAULT else default
3802 return list(val) if casesense else [x.lower() for x in val]
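# e.g. with '--extractor-args "youtube:player_client=android"' passed
# on the command line, self._configuration_arg('player_client') called
# from a youtube extractor returns ['android']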
3803
3804 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3805 if not playlist_id or not video_id:
3806 return not video_id
3807
3808 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3809 if no_playlist is not None:
3810 return not no_playlist
3811
3812 video_id = '' if video_id is True else f' {video_id}'
3813 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3814 if self.get_param('noplaylist'):
3815 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3816 return False
3817 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3818 return True
3819
3820 def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3821 RetryManager.report_retry(
3822 err, _count or int(fatal), _retries,
3823 info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3824 sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3825
3826 def RetryManager(self, **kwargs):
3827 return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3828
3829 def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3830 display_id = traverse_obj(info_dict, 'display_id', 'id')
3831 self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3832 return self._downloader.get_info_extractor('Generic')._extract_embeds(
3833 smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3834
3835 @classmethod
3836 def extract_from_webpage(cls, ydl, url, webpage):
3837 ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3838 else ydl.get_info_extractor(cls.ie_key()))
3839 for info in ie._extract_from_webpage(url, webpage) or []:
3840 # url = None since we do not want to set (webpage/original)_url
3841 ydl.add_default_extra_info(info, ie, None)
3842 yield info
3843
3844 @classmethod
3845 def _extract_from_webpage(cls, url, webpage):
3846 for embed_url in orderedSet(
3847 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3848 yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3849
3850 @classmethod
3851 def _extract_embed_urls(cls, url, webpage):
3852 """@returns all the embed urls on the webpage"""
3853 if '_EMBED_URL_RE' not in cls.__dict__:
3854 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3855 for idx, regex in enumerate(cls._EMBED_REGEX):
3856 assert regex.count('(?P<url>') == 1, \
3857 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3858 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3859
3860 for regex in cls._EMBED_URL_RE:
3861 for mobj in regex.finditer(webpage):
3862 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3863 if cls._VALID_URL is False or cls.suitable(embed_url):
3864 yield embed_url
3865
3866 class StopExtraction(Exception):
3867 pass
3868
3869 @classmethod
3870 def _extract_url(cls, webpage): # TODO: Remove
3871 """Only for compatibility with some older extractors"""
3872 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3873
3874 @classmethod
3875 def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3876 if plugin_name:
3877 mro = inspect.getmro(cls)
3878 super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3879 cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3880 cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3881 while getattr(super_class, '__wrapped__', None):
3882 super_class = super_class.__wrapped__
3883 setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3884 _PLUGIN_OVERRIDES[super_class].append(cls)
3885
3886 return super().__init_subclass__(**kwargs)
3887
3888
3889 class SearchInfoExtractor(InfoExtractor):
3890 """
3891 Base class for paged search queries extractors.
3892 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3893 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3894 """
3895
3896 _MAX_RESULTS = float('inf')
3897 _RETURN_TYPE = 'playlist'
3898
3899 @classproperty
3900 def _VALID_URL(cls):
3901 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
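# e.g. with a hypothetical _SEARCH_KEY of 'examplesearch', this matches
# 'examplesearch:cats' (1 result), 'examplesearch5:cats' (5 results)
# and 'examplesearchall:cats' (all results)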
3902
3903 def _real_extract(self, query):
3904 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3905 if prefix == '':
3906 return self._get_n_results(query, 1)
3907 elif prefix == 'all':
3908 return self._get_n_results(query, self._MAX_RESULTS)
3909 else:
3910 n = int(prefix)
3911 if n <= 0:
3912 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3913 elif n > self._MAX_RESULTS:
3914 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3915 n = self._MAX_RESULTS
3916 return self._get_n_results(query, n)
3917
3918 def _get_n_results(self, query, n):
3919 """Get a specified number of results for a query.
3920 Either this function or _search_results must be overridden by subclasses """
3921 return self.playlist_result(
3922 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3923 query, query)
3924
3925 def _search_results(self, query):
3926 """Returns an iterator of search results"""
3927 raise NotImplementedError('This method must be implemented by subclasses')
3928
3929 @classproperty
3930 def SEARCH_KEY(cls):
3931 return cls._SEARCH_KEY
3932
3933
3934 class UnsupportedURLIE(InfoExtractor):
3935 _VALID_URL = '.*'
3936 _ENABLED = False
3937 IE_DESC = False
3938
3939 def _real_extract(self, url):
3940 raise UnsupportedError(url)
3941
3942
3943 _PLUGIN_OVERRIDES = collections.defaultdict(list)