yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.parse
  21 import urllib.request
  22 import xml.etree.ElementTree
  23
  24 from ..compat import functools  # isort: split
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..utils import (
  41     IDENTITY,
  42     JSON_LD_RE,
  43     NO_DEFAULT,
  44     ExtractorError,
  45     FormatSorter,
  46     GeoRestrictedError,
  47     GeoUtils,
  48     LenientJSONDecoder,
  49     Popen,
  50     RegexNotFoundError,
  51     RetryManager,
  52     UnsupportedError,
  53     age_restricted,
  54     base_url,
  55     bug_reports_message,
  56     classproperty,
  57     clean_html,
  58     deprecation_warning,
  59     determine_ext,
  60     dict_get,
  61     encode_data_uri,
  62     error_to_compat_str,
  63     extract_attributes,
  64     filter_dict,
  65     fix_xml_ampersands,
  66     float_or_none,
  67     format_field,
  68     int_or_none,
  69     join_nonempty,
  70     js_to_json,
  71     mimetype2ext,
  72     netrc_from_content,
  73     orderedSet,
  74     parse_bitrate,
  75     parse_codecs,
  76     parse_duration,
  77     parse_iso8601,
  78     parse_m3u8_attributes,
  79     parse_resolution,
  80     sanitize_filename,
  81     sanitize_url,
  82     smuggle_url,
  83     str_or_none,
  84     str_to_int,
  85     strip_or_none,
  86     traverse_obj,
  87     truncate_string,
  88     try_call,
  89     try_get,
  90     unescapeHTML,
  91     unified_strdate,
  92     unified_timestamp,
  93     url_basename,
  94     url_or_none,
  95     urlhandle_detect_ext,
  96     urljoin,
  97     variadic,
  98     xpath_element,
  99     xpath_text,
 100     xpath_with_ns,
 101 )
 102
 103
 104 class InfoExtractor:
 105     """Information Extractor class.
 106
 107     Information extractors are the classes that, given a URL, extract
 108     information about the video (or videos) the URL refers to. This
 109     information includes the real video URL, the video title, author and
 110     others. The information is stored in a dictionary which is then
 111     passed to the YoutubeDL. The YoutubeDL processes this
 112     information possibly downloading the video to the file system, among
 113     other possible outcomes.
 114
 115     The type field determines the type of the result.
 116     By far the most common value (and the default if _type is missing) is
 117     "video", which indicates a single video.
 118
 119     For a video, the dictionaries must include the following fields:
 120
 121     id:             Video identifier.
 122     title:          Video title, unescaped. Set to an empty string if video has
 123                     no title as opposed to "None" which signifies that the
 124                     extractor failed to obtain a title
 125
 126     Additionally, it must contain either a formats entry or a url one:
 127
 128     formats:        A list of dictionaries for each format available, ordered
 129                     from worst to best quality.
 130
 131                     Potential fields:
 132                     * url        The mandatory URL representing the media:
 133                                    for plain file media - HTTP URL of this file,
 134                                    for RTMP - RTMP URL,
 135                                    for HLS - URL of the M3U8 media playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH
 138                                      - HTTP URL to plain file media (in case of
 139                                        unfragmented media)
 140                                      - URL of the MPD manifest or base URL
 141                                        representing the media if MPD manifest
 142                                        is parsed from a string (in case of
 143                                        fragmented media)
 144                                    for MSS - URL of the ISM manifest.
 145                     * request_data  Data to send in POST request to the URL
 146                     * manifest_url
 147                                  The URL of the manifest file in case of
 148                                  fragmented media:
 149                                    for HLS - URL of the M3U8 master playlist,
 150                                    for HDS - URL of the F4M manifest,
 151                                    for DASH - URL of the MPD manifest,
 152                                    for MSS - URL of the ISM manifest.
 153                     * manifest_stream_number  (For internal use only)
 154                                  The index of the stream in the manifest file
 155                     * ext        Will be calculated from URL if missing
 156                     * format     A human-readable description of the format
 157                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
 159                                  and format_note fields if missing.
 160                     * format_id  A short description of the format
 161                                  ("mp4_h264_opus" or "19").
 162                                 Technically optional, but strongly recommended.
 163                     * format_note Additional info about the format
 164                                  ("3D" or "DASH video")
 165                     * width      Width of the video, if known
 166                     * height     Height of the video, if known
 167                     * aspect_ratio  Aspect ratio of the video, if known
 168                                  Automatically calculated from width and height
 169                     * resolution Textual description of width and height
 170                                  Automatically calculated from width and height
 171                     * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 173                     * tbr        Average bitrate of audio and video in KBit/s
 174                     * abr        Average audio bitrate in KBit/s
 175                     * acodec     Name of the audio codec in use
 176                     * asr        Audio sampling rate in Hertz
 177                     * audio_channels  Number of audio channels
 178                     * vbr        Average video bitrate in KBit/s
 179                     * fps        Frame rate
 180                     * vcodec     Name of the video codec in use
 181                     * container  Name of the container format
 182                     * filesize   The number of bytes, if known in advance
 183                     * filesize_approx  An estimate for the number of bytes
 184                     * player_url SWF Player URL (used for rtmpdump).
 185                     * protocol   The protocol that will be used for the actual
 186                                  download, lower-case. One of "http", "https" or
 187                                  one of the protocols defined in downloader.PROTOCOL_MAP
 188                     * fragment_base_url
 189                                  Base URL for fragments. Each fragment's path
 190                                  value (if present) will be relative to
 191                                  this URL.
 192                     * fragments  A list of fragments of a fragmented media.
 193                                  Each fragment entry must contain either an url
 194                                  or a path. If an url is present it should be
 195                                  considered by a client. Otherwise both path and
 196                                  fragment_base_url must be present. Here is
 197                                  the list of all potential fields:
 198                                  * "url" - fragment's URL
 199                                  * "path" - fragment's path relative to
 200                                             fragment_base_url
 201                                  * "duration" (optional, int or float)
 202                                  * "filesize" (optional, int)
 203                     * is_from_start  Is a live format that can be downloaded
 204                                 from the start. Boolean
 205                     * preference Order number of this format. If this field is
 206                                  present and not None, the formats get sorted
 207                                  by this field, regardless of all other values.
 208                                  -1 for default (order by other properties),
 209                                  -2 or smaller for less than default.
 210                                  < -1000 to hide the format (if there is
 211                                     another one which is strictly better)
 212                     * language   Language code, e.g. "de" or "en-US".
 213                     * language_preference  Is this in the language mentioned in
 214                                  the URL?
 215                                  10 if it's what the URL is about,
 216                                  -1 for default (don't know),
 217                                  -10 otherwise, other values reserved for now.
 218                     * quality    Order number of the video quality of this
 219                                  format, irrespective of the file format.
 220                                  -1 for default (order by other properties),
 221                                  -2 or smaller for less than default.
 222                     * source_preference  Order number for this video source
 223                                   (quality takes higher priority)
 224                                  -1 for default (order by other properties),
 225                                  -2 or smaller for less than default.
 226                     * http_headers  A dictionary of additional HTTP headers
 227                                  to add to the request.
 228                     * stretched_ratio  If given and not 1, indicates that the
 229                                  video's pixels are not square.
 230                                  width : height ratio as float.
 231                     * no_resume  The server does not support resuming the
 232                                  (HTTP or RTMP) download. Boolean.
 233                     * has_drm    True if the format has DRM and cannot be downloaded.
 234                                  'maybe' if the format may have DRM and has to be tested before download.
 235                     * extra_param_to_segment_url  A query string to append to each
 236                                  fragment's URL, or to update each existing query string
 237                                  with. Only applied by the native HLS/DASH downloaders.
 238                     * hls_aes    A dictionary of HLS AES-128 decryption information
 239                                  used by the native HLS downloader to override the
 240                                  values in the media playlist when an '#EXT-X-KEY' tag
 241                                  is present in the playlist:
 242                                  * uri  The URI from which the key will be downloaded
 243                                  * key  The key (as hex) used to decrypt fragments.
 244                                         If `key` is given, any key URI will be ignored
 245                                  * iv   The IV (as hex) used to decrypt fragments
 246                     * downloader_options  A dictionary of downloader options
 247                                  (For internal use only)
 248                                  * http_chunk_size Chunk size for HTTP downloads
 249                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 250                     * is_dash_periods  Whether the format is a result of merging
 251                                  multiple DASH periods.
 252                     RTMP formats can also have the additional fields: page_url,
 253                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 254                     rtmp_protocol, rtmp_real_time
 255
 256     url:            Final video URL.
 257     ext:            Video filename extension.
 258     format:         The video format, defaults to ext (used for --get-format)
 259     player_url:     SWF Player URL (used for rtmpdump).
 260
 261     The following fields are optional:
 262
 263     direct:         True if a direct video file was given (must only be set by GenericIE)
 264     alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
 266                     unique, but available before title. Typically, id is
 267                     something like "4234987", title "Dancing naked mole rats",
 268                     and display_id "dancing-naked-mole-rats"
 269     thumbnails:     A list of dictionaries, with the following entries:
 270                         * "id" (optional, string) - Thumbnail format ID
 271                         * "url"
 272                         * "preference" (optional, int) - quality of the image
 273                         * "width" (optional, int)
 274                         * "height" (optional, int)
 275                         * "resolution" (optional, string "{width}x{height}",
 276                                         deprecated)
 277                         * "filesize" (optional, int)
 278                         * "http_headers" (dict) - HTTP headers for the request
 279     thumbnail:      Full URL to a video thumbnail image.
 280     description:    Full video description.
 281     uploader:       Full name of the video uploader.
 282     license:        License name the video is licensed under.
 283     creators:       List of creators of the video.
 284     timestamp:      UNIX timestamp of the moment the video was uploaded
 285     upload_date:    Video upload date in UTC (YYYYMMDD).
 286                     If not explicitly set, calculated from timestamp
 287     release_timestamp: UNIX timestamp of the moment the video was released.
 288                     If it is not clear whether to use timestamp or this, use the former
 289     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 290                     If not explicitly set, calculated from release_timestamp
 291     release_year:   Year (YYYY) as integer when the video or album was released.
 292                     To be used if no exact release date is known.
 293                     If not explicitly set, calculated from release_date.
 294     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 295     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 296                     If not explicitly set, calculated from modified_timestamp
 297     uploader_id:    Nickname or id of the video uploader.
 298     uploader_url:   Full URL to a personal webpage of the video uploader.
 299     channel:        Full name of the channel the video is uploaded on.
 300                     Note that channel fields may or may not repeat uploader
 301                     fields. This depends on a particular extractor.
 302     channel_id:     Id of the channel.
 303     channel_url:    Full URL to a channel webpage.
 304     channel_follower_count: Number of followers of the channel.
 305     channel_is_verified: Whether the channel is verified on the platform.
 306     location:       Physical location where the video was filmed.
 307     subtitles:      The available subtitles as a dictionary in the format
 308                     {tag: subformats}. "tag" is usually a language code, and
 309                     "subformats" is a list sorted from lower to higher
 310                     preference, each element is a dictionary with the "ext"
 311                     entry and one of:
 312                         * "data": The subtitles file contents
 313                         * "url": A URL pointing to the subtitles file
 314                     It can optionally also have:
 315                         * "name": Name or description of the subtitles
 316                         * "http_headers": A dictionary of additional HTTP headers
 317                                   to add to the request.
 318                     "ext" will be calculated from URL if missing
 319     automatic_captions: Like 'subtitles'; contains automatically generated
 320                     captions instead of normal subtitles
 321     duration:       Length of the video in seconds, as an integer or float.
 322     view_count:     How many users have watched the video on the platform.
 323     concurrent_view_count: How many users are currently watching the video on the platform.
 324     like_count:     Number of positive ratings of the video
 325     dislike_count:  Number of negative ratings of the video
 326     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 328     comment_count:  Number of comments on the video
 329     comments:       A list of comments, each with one or more of the following
 330                     properties (all but one of text or html optional):
 331                         * "author" - human-readable name of the comment author
 332                         * "author_id" - user ID of the comment author
 333                         * "author_thumbnail" - The thumbnail of the comment author
 334                         * "author_url" - The url to the comment author's page
 335                         * "author_is_verified" - Whether the author is verified
 336                                                  on the platform
 337                         * "author_is_uploader" - Whether the comment is made by
 338                                                  the video uploader
 339                         * "id" - Comment ID
 340                         * "html" - Comment as HTML
 341                         * "text" - Plain text of the comment
 342                         * "timestamp" - UNIX timestamp of comment
 343                         * "parent" - ID of the comment this one is replying to.
 344                                      Set to "root" to indicate that this is a
 345                                      comment to the original video.
 346                         * "like_count" - Number of positive ratings of the comment
 347                         * "dislike_count" - Number of negative ratings of the comment
 348                         * "is_favorited" - Whether the comment is marked as
 349                                            favorite by the video uploader
 350                         * "is_pinned" - Whether the comment is pinned to
 351                                         the top of the comments
 352     age_limit:      Age restriction for the video, as an integer (years)
 353     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 354                     should allow to get the same result again. (It will be set
 355                     by YoutubeDL if it's missing)
 356     categories:     A list of categories that the video falls in, for example
 357                     ["Sports", "Berlin"]
 358     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 359     cast:           A list of the video cast
 360     is_live:        True, False, or None (=unknown). Whether this video is a
 361                     live stream that goes on instead of a fixed-length video.
 362     was_live:       True, False, or None (=unknown). Whether this video was
 363                     originally a live stream.
 364     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 365                     or 'post_live' (was live, but VOD is not yet processed)
 366                     If absent, automatically set from is_live, was_live
 367     start_time:     Time in seconds where the reproduction should start, as
 368                     specified in the URL.
 369     end_time:       Time in seconds where the reproduction should end, as
 370                     specified in the URL.
 371     chapters:       A list of dictionaries, with the following entries:
 372                         * "start_time" - The start time of the chapter in seconds
 373                         * "end_time" - The end time of the chapter in seconds
 374                         * "title" (optional, string)
 375     heatmap:        A list of dictionaries, with the following entries:
 376                         * "start_time" - The start time of the data point in seconds
 377                         * "end_time" - The end time of the data point in seconds
 378                         * "value" - The normalized value of the data point (float between 0 and 1)
 379     playable_in_embed: Whether this video is allowed to play in embedded
 380                     players on other sites. Can be True (=always allowed),
 381                     False (=never allowed), None (=unknown), or a string
 382                     specifying the criteria for embedability; e.g. 'whitelist'
 383     availability:   Under what condition the video is available. One of
 384                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 385                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 386                     to set it
 387     media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
 388     _old_archive_ids: A list of old archive ids needed for backward compatibility
 389     _format_sort_fields: A list of fields to use for sorting formats
 390     __post_extractor: A function to be called just before the metadata is
 391                     written to either disk, logger or console. The function
 392                     must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
 394                     time-consuming to extract. Note that the fields thus
 395                     extracted will not be available to output template and
 396                     match_filter. So, only "comments" and "comment_count" are
 397                     currently allowed to be extracted via this method.
 398
 399     The following fields should only be used when the video belongs to some logical
 400     chapter or section:
 401
 402     chapter:        Name or title of the chapter the video belongs to.
 403     chapter_number: Number of the chapter the video belongs to, as an integer.
 404     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 405
 406     The following fields should only be used when the video is an episode of some
 407     series, programme or podcast:
 408
 409     series:         Title of the series or programme the video episode belongs to.
 410     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 411     season:         Title of the season the video episode belongs to.
 412     season_number:  Number of the season the video episode belongs to, as an integer.
 413     season_id:      Id of the season the video episode belongs to, as a unicode string.
 414     episode:        Title of the video episode. Unlike mandatory video title field,
 415                     this field should denote the exact title of the video episode
 416                     without any kind of decoration.
 417     episode_number: Number of the video episode within a season, as an integer.
 418     episode_id:     Id of the video episode, as a unicode string.
 419
 420     The following fields should only be used when the media is a track or a part of
 421     a music album:
 422
 423     track:          Title of the track.
 424     track_number:   Number of the track within an album or a disc, as an integer.
 425     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 426                     as a unicode string.
 427     artists:        List of artists of the track.
 428     composers:      List of composers of the piece.
 429     genres:         List of genres of the track.
 430     album:          Title of the album the track belongs to.
 431     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 432     album_artists:  List of all artists appeared on the album.
 433                     E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
 434                     Useful for splits and compilations.
 435     disc_number:    Number of the disc or other physical medium the track belongs to,
 436                     as an integer.
 437
 438     The following fields should only be set for clips that should be cut from the original video:
 439
 440     section_start:  Start time of the section in seconds
 441     section_end:    End time of the section in seconds
 442
 443     The following fields should only be set for storyboards:
 444     rows:           Number of rows in each storyboard fragment, as an integer
 445     columns:        Number of columns in each storyboard fragment, as an integer
 446
 447     The following fields are deprecated and should not be set by new code:
 448     composer:       Use "composers" instead.
 449                     Composer(s) of the piece, comma-separated.
 450     artist:         Use "artists" instead.
 451                     Artist(s) of the track, comma-separated.
 452     genre:          Use "genres" instead.
 453                     Genre(s) of the track, comma-separated.
 454     album_artist:   Use "album_artists" instead.
 455                     All artists appeared on the album, comma-separated.
 456     creator:        Use "creators" instead.
 457                     The creator of the video.
 458
 459     Unless mentioned otherwise, the fields should be Unicode strings.
 460
 461     Unless mentioned otherwise, None is equivalent to absence of information.
 462
 463
 464     _type "playlist" indicates multiple videos.
 465     There must be a key "entries", which is a list, an iterable, or a PagedList
 466     object, each element of which is a valid dictionary by this specification.
 467
 468     Additionally, playlists can have "id", "title", and any other relevant
 469     attributes with the same semantics as videos (see above).
 470
 471     It can also have the following optional fields:
 472
 473     playlist_count: The total number of videos in a playlist. If not given,
 474                     YoutubeDL tries to calculate it from "entries"
 475
 476
 477     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 479     It must have an entries key like a playlist and contain all the keys
 480     required for a video at the same time.
 481
 482
 483     _type "url" indicates that the video must be extracted from another
 484     location, possibly by a different extractor. Its only required key is:
 485     "url" - the next URL to extract.
 486     The key "ie_key" can be set to the class name (minus the trailing "IE",
 487     e.g. "Youtube") if the extractor class is known in advance.
 488     Additionally, the dictionary may have any properties of the resolved entity
 489     known in advance, for example "title" if the title of the referred video is
 490     known ahead of time.
 491
 492
 493     _type "url_transparent" entities have the same specification as "url", but
 494     indicate that the given additional information is more precise than the one
 495     associated with the resolved URL.
 496     This is useful when a site employs a video service that hosts the video and
 497     its technical metadata, but that video service does not embed a useful
 498     title, description etc.
 499
 500
 501     Subclasses of this should also be added to the list of extractors and
 502     should define _VALID_URL as a regexp or a Sequence of regexps, and
 503     re-define the _real_extract() and (optionally) _real_initialize() methods.
 504
 505     Subclasses may also override suitable() if necessary, but ensure the function
 506     signature is preserved and that this function imports everything it needs
 507     (except other extractors), so that lazy_extractors works correctly.
 508
 509     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 510     the HTML of Generic webpages. It may also override _extract_embed_urls
 511     or _extract_from_webpage as necessary. While these are normally classmethods,
 512     _extract_from_webpage is allowed to be an instance method.
 513
 514     _extract_from_webpage may raise self.StopExtraction() to stop further
 515     processing of the webpage and obtain exclusive rights to it. This is useful
 516     when the extractor cannot reliably be matched using just the URL,
 517     e.g. invidious/peertube instances
 518
 519     Embed-only extractors can be defined by setting _VALID_URL = False.
 520
 521     To support username + password (or netrc) login, the extractor must define a
 522     _NETRC_MACHINE and re-define _perform_login(username, password) and
 523     (optionally) _initialize_pre_login() methods. The _perform_login method will
 524     be called between _initialize_pre_login and _real_initialize if credentials
 525     are passed by the user. In cases where it is necessary to have the login
 526     process as part of the extraction rather than initialization, _perform_login
 527     can be left undefined.
 528
 529     _GEO_BYPASS attribute may be set to False in order to disable
 530     geo restriction bypass mechanisms for a particular extractor.
 531     Though it won't disable explicit geo restriction bypass based on
 532     country code provided with geo_bypass_country.
 533
 534     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 535     countries for this extractor. One of these countries will be used by
 536     geo restriction bypass mechanism right away in order to bypass
 537     geo restriction, of course, if the mechanism is not disabled.
 538
 539     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 540     IP blocks in CIDR notation for this extractor. One of these IP blocks
 541     will be used by geo restriction bypass mechanism similarly
 542     to _GEO_COUNTRIES.
 543
 544     The _ENABLED attribute should be set to False for IEs that
 545     are disabled by default and must be explicitly enabled.
 546
 547     The _WORKING attribute should be set to False for broken IEs
 548     in order to warn the users and skip the tests.
 549     """
 550
    # Class-level defaults; instances and subclasses override them as needed.
    # Set to True by initialize() once the one-time setup has completed
    _ready = False
    # Downloader object assigned through set_downloader()
    _downloader = None
    # Fake IP sent as the X-Forwarded-For header once geo bypass has picked one
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
    _ENABLED = True
    # supports_login() is the truthiness of this value
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    # Pattern(s) compiled lazily by _match_valid_url(); False disables URL matching
    _VALID_URL = None
    _EMBED_REGEX = []
 564
 565     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 566         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 567         return {
 568             None: '',
 569             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 570             'password': f'Use {password_hint}',
 571             'cookies': (
 572                 'Use --cookies-from-browser or --cookies for the authentication. '
 573                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 574         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 575
 576     def __init__(self, downloader=None):
 577         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 578         If a downloader is not passed during initialization,
 579         it must be set using "set_downloader()" before "extract()" is called"""
 580         self._ready = False
 581         self._x_forwarded_for_ip = None
 582         self._printed_messages = set()
 583         self.set_downloader(downloader)
 584
 585     @classmethod
 586     def _match_valid_url(cls, url):
 587         if cls._VALID_URL is False:
 588             return None
 589         # This does not use has/getattr intentionally - we want to know whether
 590         # we have cached the regexp for *this* class, whereas getattr would also
 591         # match the superclass
 592         if '_VALID_URL_RE' not in cls.__dict__:
 593             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 594         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 595
 596     @classmethod
 597     def suitable(cls, url):
 598         """Receives a URL and returns True if suitable for this IE."""
 599         # This function must import everything it needs (except other extractors),
 600         # so that lazy_extractors works correctly
 601         return cls._match_valid_url(url) is not None
 602
 603     @classmethod
 604     def _match_id(cls, url):
 605         return cls._match_valid_url(url).group('id')
 606
 607     @classmethod
 608     def get_temp_id(cls, url):
 609         try:
 610             return cls._match_id(url)
 611         except (IndexError, AttributeError):
 612             return None
 613
 614     @classmethod
 615     def working(cls):
 616         """Getter method for _WORKING."""
 617         return cls._WORKING
 618
 619     @classmethod
 620     def supports_login(cls):
 621         return bool(cls._NETRC_MACHINE)
 622
 623     def initialize(self):
 624         """Initializes an instance (authentication, etc)."""
 625         self._printed_messages = set()
 626         self._initialize_geo_bypass({
 627             'countries': self._GEO_COUNTRIES,
 628             'ip_blocks': self._GEO_IP_BLOCKS,
 629         })
 630         if not self._ready:
 631             self._initialize_pre_login()
 632             if self.supports_login():
 633                 username, password = self._get_login_info()
 634                 if username:
 635                     self._perform_login(username, password)
 636             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 637                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 638             self._real_initialize()
 639             self._ready = True
 640
 641     def _initialize_geo_bypass(self, geo_bypass_context):
 642         """
 643         Initialize geo restriction bypass mechanism.
 644
 645         This method is used to initialize geo bypass mechanism based on faking
 646         X-Forwarded-For HTTP header. A random country from provided country list
 647         is selected and a random IP belonging to this country is generated. This
 648         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 649         HTTP requests.
 650
 651         This method will be used for initial geo bypass mechanism initialization
 652         during the instance initialization with _GEO_COUNTRIES and
 653         _GEO_IP_BLOCKS.
 654
 655         You may also manually call it from extractor's code if geo bypass
 656         information is not available beforehand (e.g. obtained during
 657         extraction) or due to some other reason. In this case you should pass
 658         this information in geo bypass context passed as first argument. It may
 659         contain following fields:
 660
 661         countries:  List of geo unrestricted countries (similar
 662                     to _GEO_COUNTRIES)
 663         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 664                     (similar to _GEO_IP_BLOCKS)
 665
 666         """
 667         if not self._x_forwarded_for_ip:
 668
 669             # Geo bypass mechanism is explicitly disabled by user
 670             if not self.get_param('geo_bypass', True):
 671                 return
 672
 673             if not geo_bypass_context:
 674                 geo_bypass_context = {}
 675
 676             # Backward compatibility: previously _initialize_geo_bypass
 677             # expected a list of countries, some 3rd party code may still use
 678             # it this way
 679             if isinstance(geo_bypass_context, (list, tuple)):
 680                 geo_bypass_context = {
 681                     'countries': geo_bypass_context,
 682                 }
 683
 684             # The whole point of geo bypass mechanism is to fake IP
 685             # as X-Forwarded-For HTTP header based on some IP block or
 686             # country code.
 687
 688             # Path 1: bypassing based on IP block in CIDR notation
 689
 690             # Explicit IP block specified by user, use it right away
 691             # regardless of whether extractor is geo bypassable or not
 692             ip_block = self.get_param('geo_bypass_ip_block', None)
 693
 694             # Otherwise use random IP block from geo bypass context but only
 695             # if extractor is known as geo bypassable
 696             if not ip_block:
 697                 ip_blocks = geo_bypass_context.get('ip_blocks')
 698                 if self._GEO_BYPASS and ip_blocks:
 699                     ip_block = random.choice(ip_blocks)
 700
 701             if ip_block:
 702                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 703                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 704                 return
 705
 706             # Path 2: bypassing based on country code
 707
 708             # Explicit country code specified by user, use it right away
 709             # regardless of whether extractor is geo bypassable or not
 710             country = self.get_param('geo_bypass_country', None)
 711
 712             # Otherwise use random country code from geo bypass context but
 713             # only if extractor is known as geo bypassable
 714             if not country:
 715                 countries = geo_bypass_context.get('countries')
 716                 if self._GEO_BYPASS and countries:
 717                     country = random.choice(countries)
 718
 719             if country:
 720                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 721                 self._downloader.write_debug(
 722                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 723
 724     def extract(self, url):
 725         """Extracts URL information and returns it in list of dicts."""
 726         try:
 727             for _ in range(2):
 728                 try:
 729                     self.initialize()
 730                     self.to_screen('Extracting URL: %s' % (
 731                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 732                     ie_result = self._real_extract(url)
 733                     if ie_result is None:
 734                         return None
 735                     if self._x_forwarded_for_ip:
 736                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 737                     subtitles = ie_result.get('subtitles') or {}
 738                     if 'no-live-chat' in self.get_param('compat_opts'):
 739                         for lang in ('live_chat', 'comments', 'danmaku'):
 740                             subtitles.pop(lang, None)
 741                     return ie_result
 742                 except GeoRestrictedError as e:
 743                     if self.__maybe_fake_ip_and_retry(e.countries):
 744                         continue
 745                     raise
 746         except UnsupportedError:
 747             raise
 748         except ExtractorError as e:
 749             e.video_id = e.video_id or self.get_temp_id(url)
 750             e.ie = e.ie or self.IE_NAME,
 751             e.traceback = e.traceback or sys.exc_info()[2]
 752             raise
 753         except IncompleteRead as e:
 754             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 755         except (KeyError, StopIteration) as e:
 756             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 757
 758     def __maybe_fake_ip_and_retry(self, countries):
 759         if (not self.get_param('geo_bypass_country', None)
 760                 and self._GEO_BYPASS
 761                 and self.get_param('geo_bypass', True)
 762                 and not self._x_forwarded_for_ip
 763                 and countries):
 764             country_code = random.choice(countries)
 765             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 766             if self._x_forwarded_for_ip:
 767                 self.report_warning(
 768                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 769                     % (self._x_forwarded_for_ip, country_code.upper()))
 770                 return True
 771         return False
 772
 773     def set_downloader(self, downloader):
 774         """Sets a YoutubeDL instance as the downloader for this IE."""
 775         self._downloader = downloader
 776
 777     @property
 778     def cache(self):
 779         return self._downloader.cache
 780
 781     @property
 782     def cookiejar(self):
 783         return self._downloader.cookiejar
 784
 785     def _initialize_pre_login(self):
 786         """ Initialization before login. Redefine in subclasses."""
 787         pass
 788
 789     def _perform_login(self, username, password):
 790         """ Login with username and password. Redefine in subclasses."""
 791         pass
 792
 793     def _real_initialize(self):
 794         """Real initialization process. Redefine in subclasses."""
 795         pass
 796
 797     def _real_extract(self, url):
 798         """Real extraction process. Redefine in subclasses."""
 799         raise NotImplementedError('This method must be implemented by subclasses')
 800
 801     @classmethod
 802     def ie_key(cls):
 803         """A string for getting the InfoExtractor with get_info_extractor"""
 804         return cls.__name__[:-2]
 805
 806     @classproperty
 807     def IE_NAME(cls):
 808         return cls.__name__[:-2]
 809
 810     @staticmethod
 811     def __can_accept_status_code(err, expected_status):
 812         assert isinstance(err, HTTPError)
 813         if expected_status is None:
 814             return False
 815         elif callable(expected_status):
 816             return expected_status(err.status) is True
 817         else:
 818             return err.status in variadic(expected_status)
 819
 820     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 821         if isinstance(url_or_request, urllib.request.Request):
 822             self._downloader.deprecation_warning(
 823                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 824                 'Use yt_dlp.networking.common.Request instead.')
 825             url_or_request = urllib_req_to_req(url_or_request)
 826         elif not isinstance(url_or_request, Request):
 827             url_or_request = Request(url_or_request)
 828
 829         url_or_request.update(data=data, headers=headers, query=query)
 830         return url_or_request
 831
 832     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 833         """
 834         Return the response handle.
 835
 836         See _download_webpage docstring for arguments specification.
 837         """
 838         if not self._downloader._first_webpage_request:
 839             sleep_interval = self.get_param('sleep_interval_requests') or 0
 840             if sleep_interval > 0:
 841                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 842                 time.sleep(sleep_interval)
 843         else:
 844             self._downloader._first_webpage_request = False
 845
 846         if note is None:
 847             self.report_download_webpage(video_id)
 848         elif note is not False:
 849             if video_id is None:
 850                 self.to_screen(str(note))
 851             else:
 852                 self.to_screen(f'{video_id}: {note}')
 853
 854         # Some sites check X-Forwarded-For HTTP header in order to figure out
 855         # the origin of the client behind proxy. This allows bypassing geo
 856         # restriction by faking this header's value to IP that belongs to some
 857         # geo unrestricted country. We will do so once we encounter any
 858         # geo restriction error.
 859         if self._x_forwarded_for_ip:
 860             headers = (headers or {}).copy()
 861             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 862
 863         try:
 864             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 865         except network_exceptions as err:
 866             if isinstance(err, HTTPError):
 867                 if self.__can_accept_status_code(err, expected_status):
 868                     return err.response
 869
 870             if errnote is False:
 871                 return False
 872             if errnote is None:
 873                 errnote = 'Unable to download webpage'
 874
 875             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 876             if fatal:
 877                 raise ExtractorError(errmsg, cause=err)
 878             else:
 879                 self.report_warning(errmsg)
 880                 return False
 881
 882     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 883                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 884         """
 885         Return a tuple (page content as string, URL handle).
 886
 887         Arguments:
 888         url_or_request -- plain text URL as a string or
 889             a urllib.request.Request object
 890         video_id -- Video/playlist/item identifier (string)
 891
 892         Keyword arguments:
 893         note -- note printed before downloading (string)
 894         errnote -- note printed in case of an error (string)
 895         fatal -- flag denoting whether error should be considered fatal,
 896             i.e. whether it should cause ExtractionError to be raised,
 897             otherwise a warning will be reported and extraction continued
 898         encoding -- encoding for a page content decoding, guessed automatically
 899             when not explicitly specified
 900         data -- POST data (bytes)
 901         headers -- HTTP headers (dict)
 902         query -- URL query (dict)
 903         expected_status -- allows to accept failed HTTP requests (non 2xx
 904             status code) by explicitly specifying a set of accepted status
 905             codes. Can be any of the following entities:
 906                 - an integer type specifying an exact failed status code to
 907                   accept
 908                 - a list or a tuple of integer types specifying a list of
 909                   failed status codes to accept
 910                 - a callable accepting an actual failed status code and
 911                   returning True if it should be accepted
 912             Note that this argument does not affect success status codes (2xx)
 913             which are always accepted.
 914         """
 915
 916         # Strip hashes from the URL (#1038)
 917         if isinstance(url_or_request, str):
 918             url_or_request = url_or_request.partition('#')[0]
 919
 920         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 921         if urlh is False:
 922             assert not fatal
 923             return False
 924         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 925         return (content, urlh)
 926
 927     @staticmethod
 928     def _guess_encoding_from_content(content_type, webpage_bytes):
 929         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 930         if m:
 931             encoding = m.group(1)
 932         else:
 933             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 934                           webpage_bytes[:1024])
 935             if m:
 936                 encoding = m.group(1).decode('ascii')
 937             elif webpage_bytes.startswith(b'\xff\xfe'):
 938                 encoding = 'utf-16'
 939             else:
 940                 encoding = 'utf-8'
 941
 942         return encoding
 943
 944     def __check_blocked(self, content):
 945         first_block = content[:512]
 946         if ('<title>Access to this site is blocked</title>' in content
 947                 and 'Websense' in first_block):
 948             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 949             blocked_iframe = self._html_search_regex(
 950                 r'<iframe src="([^"]+)"', content,
 951                 'Websense information URL', default=None)
 952             if blocked_iframe:
 953                 msg += ' Visit %s for more details' % blocked_iframe
 954             raise ExtractorError(msg, expected=True)
 955         if '<title>The URL you requested has been blocked</title>' in first_block:
 956             msg = (
 957                 'Access to this webpage has been blocked by Indian censorship. '
 958                 'Use a VPN or proxy server (with --proxy) to route around it.')
 959             block_msg = self._html_search_regex(
 960                 r'</h1><p>(.*?)</p>',
 961                 content, 'block message', default=None)
 962             if block_msg:
 963                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 964             raise ExtractorError(msg, expected=True)
 965         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 966                 and 'blocklist.rkn.gov.ru' in content):
 967             raise ExtractorError(
 968                 'Access to this webpage has been blocked by decision of the Russian government. '
 969                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 970                 expected=True)
 971
 972     def _request_dump_filename(self, url, video_id):
 973         basen = f'{video_id}_{url}'
 974         trim_length = self.get_param('trim_file_name') or 240
 975         if len(basen) > trim_length:
 976             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 977             basen = basen[:trim_length - len(h)] + h
 978         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 979         # Working around MAX_PATH limitation on Windows (see
 980         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 981         if compat_os_name == 'nt':
 982             absfilepath = os.path.abspath(filename)
 983             if len(absfilepath) > 259:
 984                 filename = fR'\\?\{absfilepath}'
 985         return filename
 986
 987     def __decode_webpage(self, webpage_bytes, encoding, headers):
 988         if not encoding:
 989             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 990         try:
 991             return webpage_bytes.decode(encoding, 'replace')
 992         except LookupError:
 993             return webpage_bytes.decode('utf-8', 'replace')
 994
 995     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 996         webpage_bytes = urlh.read()
 997         if prefix is not None:
 998             webpage_bytes = prefix + webpage_bytes
 999         if self.get_param('dump_intermediate_pages', False):
1000             self.to_screen('Dumping request to ' + urlh.url)
1001             dump = base64.b64encode(webpage_bytes).decode('ascii')
1002             self._downloader.to_screen(dump)
1003         if self.get_param('write_pages'):
1004             filename = self._request_dump_filename(urlh.url, video_id)
1005             self.to_screen(f'Saving request to {filename}')
1006             with open(filename, 'wb') as outf:
1007                 outf.write(webpage_bytes)
1008
1009         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
1010         self.__check_blocked(content)
1011
1012         return content
1013
1014     def __print_error(self, errnote, fatal, video_id, err):
1015         if fatal:
1016             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1017         elif errnote:
1018             self.report_warning(f'{video_id}: {errnote}: {err}')
1019
1020     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1021         if transform_source:
1022             xml_string = transform_source(xml_string)
1023         try:
1024             return compat_etree_fromstring(xml_string.encode('utf-8'))
1025         except xml.etree.ElementTree.ParseError as ve:
1026             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1027
1028     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1029         try:
1030             return json.loads(
1031                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1032         except ValueError as ve:
1033             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1034
1035     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1036         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1037
    def __create_download_methods(name, parser, note, errnote, return_value):
        """Build the pair of methods (_download_<name>_handle, _download_<name>).

        This is a plain helper called while the class body is executed; it is
        not a method and takes no self.

        name -- middle part of the generated method names
        parser -- name of the method used to parse the downloaded content,
            or None to return the content unparsed
        note, errnote -- default note/errnote of the generated methods
        return_value -- description of the result, used in the generated
            docstrings
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            # Without a parser the content is returned as is
            if parser is None:
                return content
            # errnote is only passed on to the parser when it is False;
            # otherwise the parser is left with its own default
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Returns (parsed content, URL handle), or False when the download failed
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Returns the parsed content only, or False when the download failed
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' option, try to read a previously dumped
            # response from disk before falling back to a real download
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # In the parser-less variant the handle method looked up below is
            # _download_webpage_handle, which takes no transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Gives a generated function its public name, qualified name and docstring
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        # Must run before the functions are returned: download_content looks
        # the handle method up through download_handle.__name__
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1101
    # Generated download methods: each call yields the "_handle" variant
    # (parsed content plus URL handle) and the content-only variant
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # Only the content-only variant is kept here (no parser is involved);
    # it is wrapped by the _download_webpage method
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1109
1110     def _download_webpage(
1111             self, url_or_request, video_id, note=None, errnote=None,
1112             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1113         """
1114         Return the data of the page as a string.
1115
1116         Keyword arguments:
1117         tries -- number of tries
1118         timeout -- sleep interval between tries
1119
1120         See _download_webpage_handle docstring for other arguments specification.
1121         """
1122
1123         R''' # NB: These are unused; should they be deprecated?
1124         if tries != 1:
1125             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1126         if timeout is NO_DEFAULT:
1127             timeout = 5
1128         else:
1129             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1130         '''
1131
1132         try_count = 0
1133         while True:
1134             try:
1135                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1136             except IncompleteRead as e:
1137                 try_count += 1
1138                 if try_count >= tries:
1139                     raise e
1140                 self._sleep(timeout, video_id)
1141
1142     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1143         idstr = format_field(video_id, None, '%s: ')
1144         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1145         if only_once:
1146             if f'WARNING: {msg}' in self._printed_messages:
1147                 return
1148             self._printed_messages.add(f'WARNING: {msg}')
1149         self._downloader.report_warning(msg, *args, **kwargs)
1150
1151     def to_screen(self, msg, *args, **kwargs):
1152         """Print msg to screen, prefixing it with '[ie_name]'"""
1153         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1154
1155     def write_debug(self, msg, *args, **kwargs):
1156         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1157
1158     def get_param(self, name, default=None, *args, **kwargs):
1159         if self._downloader:
1160             return self._downloader.params.get(name, default, *args, **kwargs)
1161         return default
1162
1163     def report_drm(self, video_id, partial=NO_DEFAULT):
1164         if partial is not NO_DEFAULT:
1165             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1166         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1167
1168     def report_extraction(self, id_or_name):
1169         """Report information extraction."""
1170         self.to_screen('%s: Extracting information' % id_or_name)
1171
1172     def report_download_webpage(self, video_id):
1173         """Report webpage download."""
1174         self.to_screen('%s: Downloading webpage' % video_id)
1175
1176     def report_age_confirmation(self):
1177         """Report attempt to confirm age."""
1178         self.to_screen('Confirming age')
1179
1180     def report_login(self):
1181         """Report attempt to log in."""
1182         self.to_screen('Logging in')
1183
1184     def raise_login_required(
1185             self, msg='This video is only available for registered users',
1186             metadata_available=False, method=NO_DEFAULT):
1187         if metadata_available and (
1188                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1189             self.report_warning(msg)
1190             return
1191         msg += format_field(self._login_hint(method), None, '. %s')
1192         raise ExtractorError(msg, expected=True)
1193
1194     def raise_geo_restricted(
1195             self, msg='This video is not available from your location due to geo restriction',
1196             countries=None, metadata_available=False):
1197         if metadata_available and (
1198                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1199             self.report_warning(msg)
1200         else:
1201             raise GeoRestrictedError(msg, countries=countries)
1202
1203     def raise_no_formats(self, msg, expected=False, video_id=None):
1204         if expected and (
1205                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1206             self.report_warning(msg, video_id)
1207         elif isinstance(msg, ExtractorError):
1208             raise msg
1209         else:
1210             raise ExtractorError(msg, expected=expected, video_id=video_id)
1211
1212     # Methods for following #608
1213     @staticmethod
1214     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1215         """Returns a URL that points to a page that should be processed"""
1216         if ie is not None:
1217             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1218         if video_id is not None:
1219             kwargs['id'] = video_id
1220         if video_title is not None:
1221             kwargs['title'] = video_title
1222         return {
1223             **kwargs,
1224             '_type': 'url_transparent' if url_transparent else 'url',
1225             'url': url,
1226         }
1227
1228     @classmethod
1229     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1230                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1231         return cls.playlist_result(
1232             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1233             playlist_id, playlist_title, **kwargs)
1234
1235     @staticmethod
1236     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1237         """Returns a playlist"""
1238         if playlist_id:
1239             kwargs['id'] = playlist_id
1240         if playlist_title:
1241             kwargs['title'] = playlist_title
1242         if playlist_description is not None:
1243             kwargs['description'] = playlist_description
1244         return {
1245             **kwargs,
1246             '_type': 'multi_video' if multi_video else 'playlist',
1247             'entries': entries,
1248         }
1249
1250     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1251         """
1252         Perform a regex search on the given string, using a single or a list of
1253         patterns returning the first matching group.
1254         In case of failure return a default value or raise a WARNING or a
1255         RegexNotFoundError, depending on fatal, specifying the field name.
1256         """
1257         if string is None:
1258             mobj = None
1259         elif isinstance(pattern, (str, re.Pattern)):
1260             mobj = re.search(pattern, string, flags)
1261         else:
1262             for p in pattern:
1263                 mobj = re.search(p, string, flags)
1264                 if mobj:
1265                     break
1266
1267         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1268
1269         if mobj:
1270             if group is None:
1271                 # return the first matching group
1272                 return next(g for g in mobj.groups() if g is not None)
1273             elif isinstance(group, (list, tuple)):
1274                 return tuple(mobj.group(g) for g in group)
1275             else:
1276                 return mobj.group(group)
1277         elif default is not NO_DEFAULT:
1278             return default
1279         elif fatal:
1280             raise RegexNotFoundError('Unable to extract %s' % _name)
1281         else:
1282             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1283             return None
1284
1285     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1286                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1287         """Searches string for the JSON object specified by start_pattern"""
1288         # NB: end_pattern is only used to reduce the size of the initial match
1289         if default is NO_DEFAULT:
1290             default, has_default = {}, False
1291         else:
1292             fatal, has_default = False, True
1293
1294         json_string = self._search_regex(
1295             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1296             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1297         if not json_string:
1298             return default
1299
1300         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1301         try:
1302             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1303         except ExtractorError as e:
1304             if fatal:
1305                 raise ExtractorError(
1306                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1307             elif not has_default:
1308                 self.report_warning(
1309                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1310         return default
1311
1312     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1313         """
1314         Like _search_regex, but strips HTML tags and unescapes entities.
1315         """
1316         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1317         if isinstance(res, tuple):
1318             return tuple(map(clean_html, res))
1319         return clean_html(res)
1320
1321     def _get_netrc_login_info(self, netrc_machine=None):
1322         netrc_machine = netrc_machine or self._NETRC_MACHINE
1323
1324         cmd = self.get_param('netrc_cmd')
1325         if cmd:
1326             cmd = cmd.replace('{}', netrc_machine)
1327             self.to_screen(f'Executing command: {cmd}')
1328             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1329             if ret != 0:
1330                 raise OSError(f'Command returned error code {ret}')
1331             info = netrc_from_content(stdout).authenticators(netrc_machine)
1332
1333         elif self.get_param('usenetrc', False):
1334             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1335             if os.path.isdir(netrc_file):
1336                 netrc_file = os.path.join(netrc_file, '.netrc')
1337             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1338
1339         else:
1340             return None, None
1341         if not info:
1342             raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
1343         return info[0], info[2]
1344
1345     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1346         """
1347         Get the login info as (username, password)
1348         First look for the manually specified credentials using username_option
1349         and password_option as keys in params dictionary. If no such credentials
1350         are available try the netrc_cmd if it is defined or look in the
1351         netrc file using the netrc_machine or _NETRC_MACHINE value.
1352         If there's no info available, return (None, None)
1353         """
1354
1355         username = self.get_param(username_option)
1356         if username is not None:
1357             password = self.get_param(password_option)
1358         else:
1359             try:
1360                 username, password = self._get_netrc_login_info(netrc_machine)
1361             except (OSError, netrc.NetrcParseError) as err:
1362                 self.report_warning(f'Failed to parse .netrc: {err}')
1363                 return None, None
1364         return username, password
1365
1366     def _get_tfa_info(self, note='two-factor verification code'):
1367         """
1368         Get the two-factor authentication info
1369         TODO - asking the user will be required for sms/phone verify
1370         currently just uses the command line option
1371         If there's no info available, return None
1372         """
1373
1374         tfa = self.get_param('twofactor')
1375         if tfa is not None:
1376             return tfa
1377
1378         return getpass.getpass('Type %s and press [Return]: ' % note)
1379
1380     # Helper functions for extracting OpenGraph info
1381     @staticmethod
1382     def _og_regexes(prop):
1383         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1384         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1385                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1386         template = r'<meta[^>]+?%s[^>]+?%s'
1387         return [
1388             template % (property_re, content_re),
1389             template % (content_re, property_re),
1390         ]
1391
1392     @staticmethod
1393     def _meta_regex(prop):
1394         return r'''(?isx)<meta
1395                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1396                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1397
1398     def _og_search_property(self, prop, html, name=None, **kargs):
1399         prop = variadic(prop)
1400         if name is None:
1401             name = 'OpenGraph %s' % prop[0]
1402         og_regexes = []
1403         for p in prop:
1404             og_regexes.extend(self._og_regexes(p))
1405         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1406         if escaped is None:
1407             return None
1408         return unescapeHTML(escaped)
1409
1410     def _og_search_thumbnail(self, html, **kargs):
1411         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1412
1413     def _og_search_description(self, html, **kargs):
1414         return self._og_search_property('description', html, fatal=False, **kargs)
1415
1416     def _og_search_title(self, html, *, fatal=False, **kargs):
1417         return self._og_search_property('title', html, fatal=fatal, **kargs)
1418
1419     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1420         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1421         if secure:
1422             regexes = self._og_regexes('video:secure_url') + regexes
1423         return self._html_search_regex(regexes, html, name, **kargs)
1424
1425     def _og_search_url(self, html, **kargs):
1426         return self._og_search_property('url', html, **kargs)
1427
1428     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1429         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1430
1431     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1432         name = variadic(name)
1433         if display_name is None:
1434             display_name = name[0]
1435         return self._html_search_regex(
1436             [self._meta_regex(n) for n in name],
1437             html, display_name, fatal=fatal, group='content', **kwargs)
1438
1439     def _dc_search_uploader(self, html):
1440         return self._html_search_meta('dc.creator', html, 'uploader')
1441
1442     @staticmethod
1443     def _rta_search(html):
1444         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1445         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1446                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1447                      html):
1448             return 18
1449
1450         # And then there are the jokers who advertise that they use RTA, but actually don't.
1451         AGE_LIMIT_MARKERS = [
1452             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1453             r'>[^<]*you acknowledge you are at least (\d+) years old',
1454             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1455         ]
1456
1457         age_limit = 0
1458         for marker in AGE_LIMIT_MARKERS:
1459             mobj = re.search(marker, html)
1460             if mobj:
1461                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1462         return age_limit
1463
1464     def _media_rating_search(self, html):
1465         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1466         rating = self._html_search_meta('rating', html)
1467
1468         if not rating:
1469             return None
1470
1471         RATING_TABLE = {
1472             'safe for kids': 0,
1473             'general': 8,
1474             '14 years': 14,
1475             'mature': 17,
1476             'restricted': 19,
1477         }
1478         return RATING_TABLE.get(rating.lower())
1479
1480     def _family_friendly_search(self, html):
1481         # See http://schema.org/VideoObject
1482         family_friendly = self._html_search_meta(
1483             'isFamilyFriendly', html, default=None)
1484
1485         if not family_friendly:
1486             return None
1487
1488         RATING_TABLE = {
1489             '1': 0,
1490             'true': 0,
1491             '0': 18,
1492             'false': 18,
1493         }
1494         return RATING_TABLE.get(family_friendly.lower())
1495
1496     def _twitter_search_player(self, html):
1497         return self._html_search_meta('twitter:player', html,
1498                                       'twitter card player')
1499
1500     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1501         """Yield all json ld objects in the html"""
1502         if default is not NO_DEFAULT:
1503             fatal = False
1504         for mobj in re.finditer(JSON_LD_RE, html):
1505             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1506             for json_ld in variadic(json_ld_item):
1507                 if isinstance(json_ld, dict):
1508                     yield json_ld
1509
1510     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1511         """Search for a video in any json ld in the html"""
1512         if default is not NO_DEFAULT:
1513             fatal = False
1514         info = self._json_ld(
1515             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1516             video_id, fatal=fatal, expected_type=expected_type)
1517         if info:
1518             return info
1519         if default is not NO_DEFAULT:
1520             return default
1521         elif fatal:
1522             raise RegexNotFoundError('Unable to extract JSON-LD')
1523         else:
1524             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1525             return {}
1526
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Collect video metadata from JSON-LD data into an info dict

        json_ld may be a JSON string (parsed with _parse_json, honouring
        fatal) or already decoded data: a single object or a list of them.
        Entries that are not dicts, and top-level entries without '@context',
        are skipped. When expected_type is given, only entries whose '@type'
        contains it are processed and traversal of the current list stops
        after the first such entry.

        Returns the collected fields passed through filter_dict, or {} when
        json_ld is empty.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator that the nested helpers below write into
        info = {}

        # Last path component of an interactionType -> prefix of the '<kind>_count' field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # '@type' may hold a single value or several; match if any expected type is among them
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # interactionType is either the type itself or an object carrying it under '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fills the '<kind>_count' fields of info from InteractionCounter entries;
            # a count that is already set is never overwritten
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Builds info['chapters'] from the 'Clip' parts listed under 'hasPart'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Fill missing boundaries from the neighbouring chapters. zip() stops at
            # the shortest input, so the last chapter is handled after the loop
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter falls back to the video duration for its end
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copies the media fields of a VideoObject/AudioObject into info,
            # then its interaction statistics and chapters
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                if at_top_level and '@context' not in e:
                    continue
                # A pure {'@context', '@graph'} wrapper: descend into the graph entries
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first listed video of an article is used
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                # A video nested directly under 'video' of any other entry type
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                # Without an expected type every entry is merged into info;
                # with one, stop after the first matching entry
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1696
1697     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1698         return self._parse_json(
1699             self._search_regex(
1700                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1701                 webpage, 'next.js data', fatal=fatal, **kw),
1702             video_id, transform_source=transform_source, fatal=fatal)
1703
1704     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1705         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1706         rectx = re.escape(context_name)
1707         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1708         js, arg_keys, arg_vals = self._search_regex(
1709             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1710             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1711             default=NO_DEFAULT if fatal else (None, None, None))
1712         if js is None:
1713             return {}
1714
1715         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1716             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1717
1718         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1719         return traverse_obj(ret, traverse) or {}
1720
1721     @staticmethod
1722     def _hidden_inputs(html):
1723         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1724         hidden_inputs = {}
1725         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1726             attrs = extract_attributes(input)
1727             if not input:
1728                 continue
1729             if attrs.get('type') not in ('hidden', 'submit'):
1730                 continue
1731             name = attrs.get('name') or attrs.get('id')
1732             value = attrs.get('value')
1733             if name and value is not None:
1734                 hidden_inputs[name] = value
1735         return hidden_inputs
1736
1737     def _form_hidden_inputs(self, form_id, html):
1738         form = self._search_regex(
1739             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1740             html, '%s form' % form_id, group='form')
1741         return self._hidden_inputs(form)
1742
1743     @classproperty(cache=True)
1744     def FormatSort(cls):
1745         class FormatSort(FormatSorter):
1746             def __init__(ie, *args, **kwargs):
1747                 super().__init__(ie._downloader, *args, **kwargs)
1748
1749         deprecation_warning(
1750             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1751             'Use yt_dlp.utils.FormatSorter instead')
1752         return FormatSort
1753
1754     def _sort_formats(self, formats, field_preference=[]):
1755         if not field_preference:
1756             self._downloader.deprecation_warning(
1757                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1758             return
1759         self._downloader.deprecation_warning(
1760             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1761             'Return _format_sort_fields in the info_dict instead')
1762         if formats:
1763             formats[0]['__sort_fields'] = field_preference
1764
1765     def _check_formats(self, formats, video_id):
1766         if formats:
1767             formats[:] = filter(
1768                 lambda f: self._is_valid_url(
1769                     f['url'], video_id,
1770                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1771                 formats)
1772
1773     @staticmethod
1774     def _remove_duplicate_formats(formats):
1775         format_urls = set()
1776         unique_formats = []
1777         for f in formats:
1778             if f['url'] not in format_urls:
1779                 format_urls.add(f['url'])
1780                 unique_formats.append(f)
1781         formats[:] = unique_formats
1782
1783     def _is_valid_url(self, url, video_id, item='video', headers={}):
1784         url = self._proto_relative_url(url, scheme='http:')
1785         # For now assume non HTTP(S) URLs always valid
1786         if not (url.startswith('http://') or url.startswith('https://')):
1787             return True
1788         try:
1789             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1790             return True
1791         except ExtractorError as e:
1792             self.to_screen(
1793                 '%s: %s URL is invalid, skipping: %s'
1794                 % (video_id, item, error_to_compat_str(e.cause)))
1795             return False
1796
1797     def http_scheme(self):
1798         """ Either "http:" or "https:", depending on the user's preferences """
1799         return (
1800             'http:'
1801             if self.get_param('prefer_insecure', False)
1802             else 'https:')
1803
1804     def _proto_relative_url(self, url, scheme=None):
1805         scheme = scheme or self.http_scheme()
1806         assert scheme.endswith(':')
1807         return sanitize_url(url, scheme=scheme[:-1])
1808
1809     def _sleep(self, timeout, video_id, msg_template=None):
1810         if msg_template is None:
1811             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1812         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1813         self.to_screen(msg)
1814         time.sleep(timeout)
1815
1816     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1817                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1818                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1819         if self.get_param('ignore_no_formats_error'):
1820             fatal = False
1821
1822         res = self._download_xml_handle(
1823             manifest_url, video_id, 'Downloading f4m manifest',
1824             'Unable to download f4m manifest',
1825             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1826             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1827             transform_source=transform_source,
1828             fatal=fatal, data=data, headers=headers, query=query)
1829         if res is False:
1830             return []
1831
1832         manifest, urlh = res
1833         manifest_url = urlh.url
1834
1835         return self._parse_f4m_formats(
1836             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1837             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1838
1839     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1840                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1841                            fatal=True, m3u8_id=None):
1842         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1843             return []
1844
1845         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1846         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1847         if akamai_pv is not None and ';' in akamai_pv.text:
1848             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1849             if playerVerificationChallenge.strip() != '':
1850                 return []
1851
1852         formats = []
1853         manifest_version = '1.0'
1854         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1855         if not media_nodes:
1856             manifest_version = '2.0'
1857             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1858         # Remove unsupported DRM protected media from final formats
1859         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1860         media_nodes = remove_encrypted_media(media_nodes)
1861         if not media_nodes:
1862             return formats
1863
1864         manifest_base_url = get_base_url(manifest)
1865
1866         bootstrap_info = xpath_element(
1867             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1868             'bootstrap info', default=None)
1869
1870         vcodec = None
1871         mime_type = xpath_text(
1872             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1873             'base URL', default=None)
1874         if mime_type and mime_type.startswith('audio/'):
1875             vcodec = 'none'
1876
1877         for i, media_el in enumerate(media_nodes):
1878             tbr = int_or_none(media_el.attrib.get('bitrate'))
1879             width = int_or_none(media_el.attrib.get('width'))
1880             height = int_or_none(media_el.attrib.get('height'))
1881             format_id = join_nonempty(f4m_id, tbr or i)
1882             # If <bootstrapInfo> is present, the specified f4m is a
1883             # stream-level manifest, and only set-level manifests may refer to
1884             # external resources.  See section 11.4 and section 4 of F4M spec
1885             if bootstrap_info is None:
1886                 media_url = None
1887                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1888                 if manifest_version == '2.0':
1889                     media_url = media_el.attrib.get('href')
1890                 if media_url is None:
1891                     media_url = media_el.attrib.get('url')
1892                 if not media_url:
1893                     continue
1894                 manifest_url = (
1895                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1896                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1897                 # If media_url is itself a f4m manifest do the recursive extraction
1898                 # since bitrates in parent manifest (this one) and media_url manifest
1899                 # may differ leading to inability to resolve the format by requested
1900                 # bitrate in f4m downloader
1901                 ext = determine_ext(manifest_url)
1902                 if ext == 'f4m':
1903                     f4m_formats = self._extract_f4m_formats(
1904                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1905                         transform_source=transform_source, fatal=fatal)
1906                     # Sometimes stream-level manifest contains single media entry that
1907                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1908                     # At the same time parent's media entry in set-level manifest may
1909                     # contain it. We will copy it from parent in such cases.
1910                     if len(f4m_formats) == 1:
1911                         f = f4m_formats[0]
1912                         f.update({
1913                             'tbr': f.get('tbr') or tbr,
1914                             'width': f.get('width') or width,
1915                             'height': f.get('height') or height,
1916                             'format_id': f.get('format_id') if not tbr else format_id,
1917                             'vcodec': vcodec,
1918                         })
1919                     formats.extend(f4m_formats)
1920                     continue
1921                 elif ext == 'm3u8':
1922                     formats.extend(self._extract_m3u8_formats(
1923                         manifest_url, video_id, 'mp4', preference=preference,
1924                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1925                     continue
1926             formats.append({
1927                 'format_id': format_id,
1928                 'url': manifest_url,
1929                 'manifest_url': manifest_url,
1930                 'ext': 'flv' if bootstrap_info is not None else None,
1931                 'protocol': 'f4m',
1932                 'tbr': tbr,
1933                 'width': width,
1934                 'height': height,
1935                 'vcodec': vcodec,
1936                 'preference': preference,
1937                 'quality': quality,
1938             })
1939         return formats
1940
1941     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1942         return {
1943             'format_id': join_nonempty(m3u8_id, 'meta'),
1944             'url': m3u8_url,
1945             'ext': ext,
1946             'protocol': 'm3u8',
1947             'preference': preference - 100 if preference else -100,
1948             'quality': quality,
1949             'resolution': 'multiple',
1950             'format_note': 'Quality selection URL',
1951         }
1952
1953     def _report_ignoring_subs(self, name):
1954         self.report_warning(bug_reports_message(
1955             f'Ignoring subtitle tracks found in the {name} manifest; '
1956             'if any subtitle tracks are missing,'
1957         ), only_once=True)
1958
1959     def _extract_m3u8_formats(self, *args, **kwargs):
1960         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1961         if subs:
1962             self._report_ignoring_subs('HLS')
1963         return fmts
1964
1965     def _extract_m3u8_formats_and_subtitles(
1966             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1967             preference=None, quality=None, m3u8_id=None, note=None,
1968             errnote=None, fatal=True, live=False, data=None, headers={},
1969             query={}):
1970
1971         if self.get_param('ignore_no_formats_error'):
1972             fatal = False
1973
1974         if not m3u8_url:
1975             if errnote is not False:
1976                 errnote = errnote or 'Failed to obtain m3u8 URL'
1977                 if fatal:
1978                     raise ExtractorError(errnote, video_id=video_id)
1979                 self.report_warning(f'{errnote}{bug_reports_message()}')
1980             return [], {}
1981
1982         res = self._download_webpage_handle(
1983             m3u8_url, video_id,
1984             note='Downloading m3u8 information' if note is None else note,
1985             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1986             fatal=fatal, data=data, headers=headers, query=query)
1987
1988         if res is False:
1989             return [], {}
1990
1991         m3u8_doc, urlh = res
1992         m3u8_url = urlh.url
1993
1994         return self._parse_m3u8_formats_and_subtitles(
1995             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1996             preference=preference, quality=quality, m3u8_id=m3u8_id,
1997             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1998             headers=headers, query=query, video_id=video_id)
1999
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """
        Parse the text of an m3u8 playlist into a (formats, subtitles) tuple.

        A media playlist (one containing #EXT-X-TARGETDURATION) is returned as
        is, as format(s) pointing at m3u8_url or, without a URL, at a data URI
        of the document. Otherwise formats are built from the EXT-X-MEDIA and
        EXT-X-STREAM-INF tags and subtitles from EXT-X-MEDIA tags of type
        SUBTITLES.

        m3u8_url is used to resolve relative URIs and is stored as
        'manifest_url' of the formats. ext, entry_protocol, preference and
        quality are copied into the format dicts; m3u8_id is the format_id
        prefix. When live is set, the stream name/bitrate is left out of the
        format_id. With the 'hls_split_discontinuity' param, each referenced
        playlist is downloaded (using video_id, fatal, data and headers) and
        yields one format per discontinuity-delimited part.

        note, errnote and query are accepted but not referenced in this body.
        """
        formats, subtitles = [], {}
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            """Return url unchanged if it is absolute http(s), else join it with m3u8_url."""
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                """Return one index per part of the playlist: 1 + the number of
                lines starting with #EXT-X-DISCONTINUITY. The playlist is
                downloaded from manifest_url when no document is given."""
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                """Without splitting there is a single format with no index."""
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset after its URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            """Register one EXT-X-MEDIA line in `groups` and add the subtitle
            or audio/video format it describes."""
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            # Only VIDEO and AUDIO renditions with a URI become formats
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            """Return the name for the current EXT-X-STREAM-INF entry, or None
            when neither a NAME nor a VIDEO group is available."""
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is treated as a variant stream URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Add an http twin of the HLS format pointing at the progressive URI
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2218
2219     def _extract_m3u8_vod_duration(
2220             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2221
2222         m3u8_vod = self._download_webpage(
2223             m3u8_vod_url, video_id,
2224             note='Downloading m3u8 VOD manifest' if note is None else note,
2225             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2226             fatal=False, data=data, headers=headers, query=query)
2227
2228         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2229
2230     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2231         if '#EXT-X-ENDLIST' not in m3u8_vod:
2232             return None
2233
2234         return int(sum(
2235             float(line[len('#EXTINF:'):].split(',')[0])
2236             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2237
2238     def _extract_mpd_vod_duration(
2239             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2240
2241         mpd_doc = self._download_xml(
2242             mpd_url, video_id,
2243             note='Downloading MPD VOD manifest' if note is None else note,
2244             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2245             fatal=False, data=data, headers=headers, query=query)
2246         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2247             return None
2248         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2249
2250     @staticmethod
2251     def _xpath_ns(path, namespace=None):
2252         if not namespace:
2253             return path
2254         out = []
2255         for c in path.split('/'):
2256             if not c or c == '.':
2257                 out.append(c)
2258             else:
2259                 out.append('{%s}%s' % (namespace, c))
2260         return '/'.join(out)
2261
2262     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2263         if self.get_param('ignore_no_formats_error'):
2264             fatal = False
2265
2266         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2267         if res is False:
2268             assert not fatal
2269             return [], {}
2270         smil, urlh = res
2271
2272         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2273                                                       namespace=self._parse_smil_namespace(smil))
2274
2275     def _extract_smil_formats(self, *args, **kwargs):
2276         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2277         if subs:
2278             self._report_ignoring_subs('SMIL')
2279         return fmts
2280
2281     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2282         res = self._download_smil(smil_url, video_id, fatal=fatal)
2283         if res is False:
2284             return {}
2285
2286         smil, urlh = res
2287         smil_url = urlh.url
2288
2289         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2290
2291     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2292         return self._download_xml_handle(
2293             smil_url, video_id, 'Downloading SMIL file',
2294             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2295
2296     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2297         namespace = self._parse_smil_namespace(smil)
2298
2299         formats, subtitles = self._parse_smil_formats_and_subtitles(
2300             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2301
2302         video_id = os.path.splitext(url_basename(smil_url))[0]
2303         title = None
2304         description = None
2305         upload_date = None
2306         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2307             name = meta.attrib.get('name')
2308             content = meta.attrib.get('content')
2309             if not name or not content:
2310                 continue
2311             if not title and name == 'title':
2312                 title = content
2313             elif not description and name in ('description', 'abstract'):
2314                 description = content
2315             elif not upload_date and name == 'date':
2316                 upload_date = unified_strdate(content)
2317
2318         thumbnails = [{
2319             'id': image.get('type'),
2320             'url': image.get('src'),
2321             'width': int_or_none(image.get('width')),
2322             'height': int_or_none(image.get('height')),
2323         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2324
2325         return {
2326             'id': video_id,
2327             'title': title or video_id,
2328             'description': description,
2329             'upload_date': upload_date,
2330             'thumbnails': thumbnails,
2331             'formats': formats,
2332             'subtitles': subtitles,
2333         }
2334
2335     def _parse_smil_namespace(self, smil):
2336         return self._search_regex(
2337             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2338
2339     def _parse_smil_formats(self, *args, **kwargs):
2340         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2341         if subs:
2342             self._report_ignoring_subs('SMIL')
2343         return fmts
2344
2345     def _parse_smil_formats_and_subtitles(
2346             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2347         base = smil_url
2348         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2349             b = meta.get('base') or meta.get('httpBase')
2350             if b:
2351                 base = b
2352                 break
2353
2354         formats, subtitles = [], {}
2355         rtmp_count = 0
2356         http_count = 0
2357         m3u8_count = 0
2358         imgs_count = 0
2359
2360         srcs = set()
2361         media = itertools.chain.from_iterable(
2362             smil.findall(self._xpath_ns(arg, namespace))
2363             for arg in ['.//video', './/audio', './/media'])
2364         for medium in media:
2365             src = medium.get('src')
2366             if not src or src in srcs:
2367                 continue
2368             srcs.add(src)
2369
2370             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2371             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2372             width = int_or_none(medium.get('width'))
2373             height = int_or_none(medium.get('height'))
2374             proto = medium.get('proto')
2375             ext = medium.get('ext')
2376             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2377                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2378             streamer = medium.get('streamer') or base
2379
2380             if proto == 'rtmp' or streamer.startswith('rtmp'):
2381                 rtmp_count += 1
2382                 formats.append({
2383                     'url': streamer,
2384                     'play_path': src,
2385                     'ext': 'flv',
2386                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2387                     'tbr': bitrate,
2388                     'filesize': filesize,
2389                     'width': width,
2390                     'height': height,
2391                 })
2392                 if transform_rtmp_url:
2393                     streamer, src = transform_rtmp_url(streamer, src)
2394                     formats[-1].update({
2395                         'url': streamer,
2396                         'play_path': src,
2397                     })
2398                 continue
2399
2400             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2401             src_url = src_url.strip()
2402
2403             if proto == 'm3u8' or src_ext == 'm3u8':
2404                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2405                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2406                 self._merge_subtitles(m3u8_subs, target=subtitles)
2407                 if len(m3u8_formats) == 1:
2408                     m3u8_count += 1
2409                     m3u8_formats[0].update({
2410                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2411                         'tbr': bitrate,
2412                         'width': width,
2413                         'height': height,
2414                     })
2415                 formats.extend(m3u8_formats)
2416             elif src_ext == 'f4m':
2417                 f4m_url = src_url
2418                 if not f4m_params:
2419                     f4m_params = {
2420                         'hdcore': '3.2.0',
2421                         'plugin': 'flowplayer-3.2.0.1',
2422                     }
2423                 f4m_url += '&' if '?' in f4m_url else '?'
2424                 f4m_url += urllib.parse.urlencode(f4m_params)
2425                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2426             elif src_ext == 'mpd':
2427                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2428                     src_url, video_id, mpd_id='dash', fatal=False)
2429                 formats.extend(mpd_formats)
2430                 self._merge_subtitles(mpd_subs, target=subtitles)
2431             elif re.search(r'\.ism/[Mm]anifest', src_url):
2432                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2433                     src_url, video_id, ism_id='mss', fatal=False)
2434                 formats.extend(ism_formats)
2435                 self._merge_subtitles(ism_subs, target=subtitles)
2436             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2437                 http_count += 1
2438                 formats.append({
2439                     'url': src_url,
2440                     'ext': ext or src_ext or 'flv',
2441                     'format_id': 'http-%d' % (bitrate or http_count),
2442                     'tbr': bitrate,
2443                     'filesize': filesize,
2444                     'width': width,
2445                     'height': height,
2446                 })
2447
2448         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2449             src = medium.get('src')
2450             if not src or src in srcs:
2451                 continue
2452             srcs.add(src)
2453
2454             imgs_count += 1
2455             formats.append({
2456                 'format_id': 'imagestream-%d' % (imgs_count),
2457                 'url': src,
2458                 'ext': mimetype2ext(medium.get('type')),
2459                 'acodec': 'none',
2460                 'vcodec': 'none',
2461                 'width': int_or_none(medium.get('width')),
2462                 'height': int_or_none(medium.get('height')),
2463                 'format_note': 'SMIL storyboards',
2464             })
2465
2466         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2467         self._merge_subtitles(smil_subs, target=subtitles)
2468
2469         return formats, subtitles
2470
2471     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2472         urls = []
2473         subtitles = {}
2474         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2475             src = textstream.get('src')
2476             if not src or src in urls:
2477                 continue
2478             urls.append(src)
2479             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2480             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2481             subtitles.setdefault(lang, []).append({
2482                 'url': src,
2483                 'ext': ext,
2484             })
2485         return subtitles
2486
2487     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2488         res = self._download_xml_handle(
2489             xspf_url, playlist_id, 'Downloading xpsf playlist',
2490             'Unable to download xspf manifest', fatal=fatal)
2491         if res is False:
2492             return []
2493
2494         xspf, urlh = res
2495         xspf_url = urlh.url
2496
2497         return self._parse_xspf(
2498             xspf, playlist_id, xspf_url=xspf_url,
2499             xspf_base_url=base_url(xspf_url))
2500
2501     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2502         NS_MAP = {
2503             'xspf': 'http://xspf.org/ns/0/',
2504             's1': 'http://static.streamone.nl/player/ns/0',
2505         }
2506
2507         entries = []
2508         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2509             title = xpath_text(
2510                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2511             description = xpath_text(
2512                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2513             thumbnail = xpath_text(
2514                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2515             duration = float_or_none(
2516                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2517
2518             formats = []
2519             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2520                 format_url = urljoin(xspf_base_url, location.text)
2521                 if not format_url:
2522                     continue
2523                 formats.append({
2524                     'url': format_url,
2525                     'manifest_url': xspf_url,
2526                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2527                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2528                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2529                 })
2530
2531             entries.append({
2532                 'id': playlist_id,
2533                 'title': title,
2534                 'description': description,
2535                 'thumbnail': thumbnail,
2536                 'duration': duration,
2537                 'formats': formats,
2538             })
2539         return entries
2540
2541     def _extract_mpd_formats(self, *args, **kwargs):
2542         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2543         if subs:
2544             self._report_ignoring_subs('DASH')
2545         return fmts
2546
2547     def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2548         periods = self._extract_mpd_periods(*args, **kwargs)
2549         return self._merge_mpd_periods(periods)
2550
2551     def _extract_mpd_periods(
2552             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2553             fatal=True, data=None, headers={}, query={}):
2554
2555         if self.get_param('ignore_no_formats_error'):
2556             fatal = False
2557
2558         res = self._download_xml_handle(
2559             mpd_url, video_id,
2560             note='Downloading MPD manifest' if note is None else note,
2561             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2562             fatal=fatal, data=data, headers=headers, query=query)
2563         if res is False:
2564             return []
2565         mpd_doc, urlh = res
2566         if mpd_doc is None:
2567             return []
2568
2569         # We could have been redirected to a new url when we retrieved our mpd file.
2570         mpd_url = urlh.url
2571         mpd_base_url = base_url(mpd_url)
2572
2573         return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2574
2575     def _parse_mpd_formats(self, *args, **kwargs):
2576         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2577         if subs:
2578             self._report_ignoring_subs('DASH')
2579         return fmts
2580
2581     def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2582         periods = self._parse_mpd_periods(*args, **kwargs)
2583         return self._merge_mpd_periods(periods)
2584
2585     def _merge_mpd_periods(self, periods):
2586         """
2587         Combine all formats and subtitles from an MPD manifest into a single list,
2588         by concatenate streams with similar formats.
2589         """
2590         formats, subtitles = {}, {}
2591         for period in periods:
2592             for f in period['formats']:
2593                 assert 'is_dash_periods' not in f, 'format already processed'
2594                 f['is_dash_periods'] = True
2595                 format_key = tuple(v for k, v in f.items() if k not in (
2596                     ('format_id', 'fragments', 'manifest_stream_number')))
2597                 if format_key not in formats:
2598                     formats[format_key] = f
2599                 elif 'fragments' in f:
2600                     formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2601
2602             if subtitles and period['subtitles']:
2603                 self.report_warning(bug_reports_message(
2604                     'Found subtitles in multiple periods in the DASH manifest; '
2605                     'if part of the subtitles are missing,'
2606                 ), only_once=True)
2607
2608             for sub_lang, sub_info in period['subtitles'].items():
2609                 subtitles.setdefault(sub_lang, []).extend(sub_info)
2610
2611         return list(formats.values()), subtitles
2612
2613     def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2614         """
2615         Parse formats from MPD manifest.
2616         References:
2617          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2618             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2619          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2620         """
2621         if not self.get_param('dynamic_mpd', True):
2622             if mpd_doc.get('type') == 'dynamic':
2623                 return [], {}
2624
2625         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2626
2627         def _add_ns(path):
2628             return self._xpath_ns(path, namespace)
2629
2630         def is_drm_protected(element):
2631             return element.find(_add_ns('ContentProtection')) is not None
2632
2633         def extract_multisegment_info(element, ms_parent_info):
2634             ms_info = ms_parent_info.copy()
2635
2636             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2637             # common attributes and elements.  We will only extract relevant
2638             # for us.
2639             def extract_common(source):
2640                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2641                 if segment_timeline is not None:
2642                     s_e = segment_timeline.findall(_add_ns('S'))
2643                     if s_e:
2644                         ms_info['total_number'] = 0
2645                         ms_info['s'] = []
2646                         for s in s_e:
2647                             r = int(s.get('r', 0))
2648                             ms_info['total_number'] += 1 + r
2649                             ms_info['s'].append({
2650                                 't': int(s.get('t', 0)),
2651                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2652                                 'd': int(s.attrib['d']),
2653                                 'r': r,
2654                             })
2655                 start_number = source.get('startNumber')
2656                 if start_number:
2657                     ms_info['start_number'] = int(start_number)
2658                 timescale = source.get('timescale')
2659                 if timescale:
2660                     ms_info['timescale'] = int(timescale)
2661                 segment_duration = source.get('duration')
2662                 if segment_duration:
2663                     ms_info['segment_duration'] = float(segment_duration)
2664
2665             def extract_Initialization(source):
2666                 initialization = source.find(_add_ns('Initialization'))
2667                 if initialization is not None:
2668                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2669
2670             segment_list = element.find(_add_ns('SegmentList'))
2671             if segment_list is not None:
2672                 extract_common(segment_list)
2673                 extract_Initialization(segment_list)
2674                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2675                 if segment_urls_e:
2676                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2677             else:
2678                 segment_template = element.find(_add_ns('SegmentTemplate'))
2679                 if segment_template is not None:
2680                     extract_common(segment_template)
2681                     media = segment_template.get('media')
2682                     if media:
2683                         ms_info['media'] = media
2684                     initialization = segment_template.get('initialization')
2685                     if initialization:
2686                         ms_info['initialization'] = initialization
2687                     else:
2688                         extract_Initialization(segment_template)
2689             return ms_info
2690
2691         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2692         stream_numbers = collections.defaultdict(int)
2693         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2694             period_entry = {
2695                 'id': period.get('id', f'period-{period_idx}'),
2696                 'formats': [],
2697                 'subtitles': collections.defaultdict(list),
2698             }
2699             period_duration = parse_duration(period.get('duration')) or mpd_duration
2700             period_ms_info = extract_multisegment_info(period, {
2701                 'start_number': 1,
2702                 'timescale': 1,
2703             })
2704             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2705                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2706                 for representation in adaptation_set.findall(_add_ns('Representation')):
2707                     representation_attrib = adaptation_set.attrib.copy()
2708                     representation_attrib.update(representation.attrib)
2709                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2710                     mime_type = representation_attrib['mimeType']
2711                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2712
2713                     codec_str = representation_attrib.get('codecs', '')
2714                     # Some kind of binary subtitle found in some youtube livestreams
2715                     if mime_type == 'application/x-rawcc':
2716                         codecs = {'scodec': codec_str}
2717                     else:
2718                         codecs = parse_codecs(codec_str)
2719                     if content_type not in ('video', 'audio', 'text'):
2720                         if mime_type == 'image/jpeg':
2721                             content_type = mime_type
2722                         elif codecs.get('vcodec', 'none') != 'none':
2723                             content_type = 'video'
2724                         elif codecs.get('acodec', 'none') != 'none':
2725                             content_type = 'audio'
2726                         elif codecs.get('scodec', 'none') != 'none':
2727                             content_type = 'text'
2728                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2729                             content_type = 'text'
2730                         else:
2731                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2732                             continue
2733
2734                     base_url = ''
2735                     for element in (representation, adaptation_set, period, mpd_doc):
2736                         base_url_e = element.find(_add_ns('BaseURL'))
2737                         if try_call(lambda: base_url_e.text) is not None:
2738                             base_url = base_url_e.text + base_url
2739                             if re.match(r'^https?://', base_url):
2740                                 break
2741                     if mpd_base_url and base_url.startswith('/'):
2742                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2743                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2744                         if not mpd_base_url.endswith('/'):
2745                             mpd_base_url += '/'
2746                         base_url = mpd_base_url + base_url
2747                     representation_id = representation_attrib.get('id')
2748                     lang = representation_attrib.get('lang')
2749                     url_el = representation.find(_add_ns('BaseURL'))
2750                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2751                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2752                     if representation_id is not None:
2753                         format_id = representation_id
2754                     else:
2755                         format_id = content_type
2756                     if mpd_id:
2757                         format_id = mpd_id + '-' + format_id
2758                     if content_type in ('video', 'audio'):
2759                         f = {
2760                             'format_id': format_id,
2761                             'manifest_url': mpd_url,
2762                             'ext': mimetype2ext(mime_type),
2763                             'width': int_or_none(representation_attrib.get('width')),
2764                             'height': int_or_none(representation_attrib.get('height')),
2765                             'tbr': float_or_none(bandwidth, 1000),
2766                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2767                             'fps': int_or_none(representation_attrib.get('frameRate')),
2768                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2769                             'format_note': 'DASH %s' % content_type,
2770                             'filesize': filesize,
2771                             'container': mimetype2ext(mime_type) + '_dash',
2772                             **codecs
2773                         }
2774                     elif content_type == 'text':
2775                         f = {
2776                             'ext': mimetype2ext(mime_type),
2777                             'manifest_url': mpd_url,
2778                             'filesize': filesize,
2779                         }
2780                     elif content_type == 'image/jpeg':
2781                         # See test case in VikiIE
2782                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2783                         f = {
2784                             'format_id': format_id,
2785                             'ext': 'mhtml',
2786                             'manifest_url': mpd_url,
2787                             'format_note': 'DASH storyboards (jpeg)',
2788                             'acodec': 'none',
2789                             'vcodec': 'none',
2790                         }
2791                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2792                         f['has_drm'] = True
2793                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2794
2795                     def prepare_template(template_name, identifiers):
2796                         tmpl = representation_ms_info[template_name]
2797                         if representation_id is not None:
2798                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2799                         # First of, % characters outside $...$ templates
2800                         # must be escaped by doubling for proper processing
2801                         # by % operator string formatting used further (see
2802                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2803                         t = ''
2804                         in_template = False
2805                         for c in tmpl:
2806                             t += c
2807                             if c == '$':
2808                                 in_template = not in_template
2809                             elif c == '%' and not in_template:
2810                                 t += c
2811                         # Next, $...$ templates are translated to their
2812                         # %(...) counterparts to be used with % operator
2813                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2814                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2815                         t.replace('$$', '$')
2816                         return t
2817
2818                     # @initialization is a regular template like @media one
2819                     # so it should be handled just the same way (see
2820                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2821                     if 'initialization' in representation_ms_info:
2822                         initialization_template = prepare_template(
2823                             'initialization',
2824                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2825                             # $Time$ shall not be included for @initialization thus
2826                             # only $Bandwidth$ remains
2827                             ('Bandwidth', ))
2828                         representation_ms_info['initialization_url'] = initialization_template % {
2829                             'Bandwidth': bandwidth,
2830                         }
2831
2832                     def location_key(location):
2833                         return 'url' if re.match(r'^https?://', location) else 'path'
2834
2835                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2836
2837                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2838                         media_location_key = location_key(media_template)
2839
2840                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2841                         # can't be used at the same time
2842                         if '%(Number' in media_template and 's' not in representation_ms_info:
2843                             segment_duration = None
2844                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2845                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2846                                 representation_ms_info['total_number'] = int(math.ceil(
2847                                     float_or_none(period_duration, segment_duration, default=0)))
2848                             representation_ms_info['fragments'] = [{
2849                                 media_location_key: media_template % {
2850                                     'Number': segment_number,
2851                                     'Bandwidth': bandwidth,
2852                                 },
2853                                 'duration': segment_duration,
2854                             } for segment_number in range(
2855                                 representation_ms_info['start_number'],
2856                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2857                         else:
2858                             # $Number*$ or $Time$ in media template with S list available
2859                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2860                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2861                             representation_ms_info['fragments'] = []
2862                             segment_time = 0
2863                             segment_d = None
2864                             segment_number = representation_ms_info['start_number']
2865
2866                             def add_segment_url():
2867                                 segment_url = media_template % {
2868                                     'Time': segment_time,
2869                                     'Bandwidth': bandwidth,
2870                                     'Number': segment_number,
2871                                 }
2872                                 representation_ms_info['fragments'].append({
2873                                     media_location_key: segment_url,
2874                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2875                                 })
2876
2877                             for num, s in enumerate(representation_ms_info['s']):
2878                                 segment_time = s.get('t') or segment_time
2879                                 segment_d = s['d']
2880                                 add_segment_url()
2881                                 segment_number += 1
2882                                 for r in range(s.get('r', 0)):
2883                                     segment_time += segment_d
2884                                     add_segment_url()
2885                                     segment_number += 1
2886                                 segment_time += segment_d
2887                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2888                         # No media template,
2889                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2890                         # or any YouTube dashsegments video
2891                         fragments = []
2892                         segment_index = 0
2893                         timescale = representation_ms_info['timescale']
2894                         for s in representation_ms_info['s']:
2895                             duration = float_or_none(s['d'], timescale)
2896                             for r in range(s.get('r', 0) + 1):
2897                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2898                                 fragments.append({
2899                                     location_key(segment_uri): segment_uri,
2900                                     'duration': duration,
2901                                 })
2902                                 segment_index += 1
2903                         representation_ms_info['fragments'] = fragments
2904                     elif 'segment_urls' in representation_ms_info:
2905                         # Segment URLs with no SegmentTimeline
2906                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2907                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2908                         fragments = []
2909                         segment_duration = float_or_none(
2910                             representation_ms_info['segment_duration'],
2911                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2912                         for segment_url in representation_ms_info['segment_urls']:
2913                             fragment = {
2914                                 location_key(segment_url): segment_url,
2915                             }
2916                             if segment_duration:
2917                                 fragment['duration'] = segment_duration
2918                             fragments.append(fragment)
2919                         representation_ms_info['fragments'] = fragments
2920                     # If there is a fragments key available then we correctly recognized fragmented media.
2921                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2922                     # assumption is not necessarily correct since we may simply have no support for
2923                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2924                     if 'fragments' in representation_ms_info:
2925                         f.update({
2926                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2927                             'url': mpd_url or base_url,
2928                             'fragment_base_url': base_url,
2929                             'fragments': [],
2930                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2931                         })
2932                         if 'initialization_url' in representation_ms_info:
2933                             initialization_url = representation_ms_info['initialization_url']
2934                             if not f.get('url'):
2935                                 f['url'] = initialization_url
2936                             f['fragments'].append({location_key(initialization_url): initialization_url})
2937                         f['fragments'].extend(representation_ms_info['fragments'])
2938                         if not period_duration:
2939                             period_duration = try_get(
2940                                 representation_ms_info,
2941                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2942                     else:
2943                         # Assuming direct URL to unfragmented media.
2944                         f['url'] = base_url
2945                     if content_type in ('video', 'audio', 'image/jpeg'):
2946                         f['manifest_stream_number'] = stream_numbers[f['url']]
2947                         stream_numbers[f['url']] += 1
2948                         period_entry['formats'].append(f)
2949                     elif content_type == 'text':
2950                         period_entry['subtitles'][lang or 'und'].append(f)
2951             yield period_entry
2952
2953     def _extract_ism_formats(self, *args, **kwargs):
2954         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2955         if subs:
2956             self._report_ignoring_subs('ISM')
2957         return fmts
2958
2959     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2960         if self.get_param('ignore_no_formats_error'):
2961             fatal = False
2962
2963         res = self._download_xml_handle(
2964             ism_url, video_id,
2965             note='Downloading ISM manifest' if note is None else note,
2966             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2967             fatal=fatal, data=data, headers=headers, query=query)
2968         if res is False:
2969             return [], {}
2970         ism_doc, urlh = res
2971         if ism_doc is None:
2972             return [], {}
2973
2974         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2975
2976     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2977         """
2978         Parse formats from ISM manifest.
2979         References:
2980          1. [MS-SSTR]: Smooth Streaming Protocol,
2981             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2982         """
2983         if ism_doc.get('IsLive') == 'TRUE':
2984             return [], {}
2985
2986         duration = int(ism_doc.attrib['Duration'])
2987         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2988
2989         formats = []
2990         subtitles = {}
2991         for stream in ism_doc.findall('StreamIndex'):
2992             stream_type = stream.get('Type')
2993             if stream_type not in ('video', 'audio', 'text'):
2994                 continue
2995             url_pattern = stream.attrib['Url']
2996             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2997             stream_name = stream.get('Name')
2998             stream_language = stream.get('Language', 'und')
2999             for track in stream.findall('QualityLevel'):
3000                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3001                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3002                 # TODO: add support for WVC1 and WMAP
3003                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3004                     self.report_warning('%s is not a supported codec' % fourcc)
3005                     continue
3006                 tbr = int(track.attrib['Bitrate']) // 1000
3007                 # [1] does not mention Width and Height attributes. However,
3008                 # they're often present while MaxWidth and MaxHeight are
3009                 # missing, so should be used as fallbacks
3010                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3011                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3012                 sampling_rate = int_or_none(track.get('SamplingRate'))
3013
3014                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3015                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3016
3017                 fragments = []
3018                 fragment_ctx = {
3019                     'time': 0,
3020                 }
3021                 stream_fragments = stream.findall('c')
3022                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3023                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3024                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3025                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3026                     if not fragment_ctx['duration']:
3027                         try:
3028                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3029                         except IndexError:
3030                             next_fragment_time = duration
3031                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3032                     for _ in range(fragment_repeat):
3033                         fragments.append({
3034                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3035                             'duration': fragment_ctx['duration'] / stream_timescale,
3036                         })
3037                         fragment_ctx['time'] += fragment_ctx['duration']
3038
3039                 if stream_type == 'text':
3040                     subtitles.setdefault(stream_language, []).append({
3041                         'ext': 'ismt',
3042                         'protocol': 'ism',
3043                         'url': ism_url,
3044                         'manifest_url': ism_url,
3045                         'fragments': fragments,
3046                         '_download_params': {
3047                             'stream_type': stream_type,
3048                             'duration': duration,
3049                             'timescale': stream_timescale,
3050                             'fourcc': fourcc,
3051                             'language': stream_language,
3052                             'codec_private_data': track.get('CodecPrivateData'),
3053                         }
3054                     })
3055                 elif stream_type in ('video', 'audio'):
3056                     formats.append({
3057                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3058                         'url': ism_url,
3059                         'manifest_url': ism_url,
3060                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3061                         'width': width,
3062                         'height': height,
3063                         'tbr': tbr,
3064                         'asr': sampling_rate,
3065                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3066                         'acodec': 'none' if stream_type == 'video' else fourcc,
3067                         'protocol': 'ism',
3068                         'fragments': fragments,
3069                         'has_drm': ism_doc.find('Protection') is not None,
3070                         'language': stream_language,
3071                         'audio_channels': int_or_none(track.get('Channels')),
3072                         '_download_params': {
3073                             'stream_type': stream_type,
3074                             'duration': duration,
3075                             'timescale': stream_timescale,
3076                             'width': width or 0,
3077                             'height': height or 0,
3078                             'fourcc': fourcc,
3079                             'language': stream_language,
3080                             'codec_private_data': track.get('CodecPrivateData'),
3081                             'sampling_rate': sampling_rate,
3082                             'channels': int_or_none(track.get('Channels', 2)),
3083                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3084                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3085                         },
3086                     })
3087         return formats, subtitles
3088
3089     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3090         def absolute_url(item_url):
3091             return urljoin(base_url, item_url)
3092
3093         def parse_content_type(content_type):
3094             if not content_type:
3095                 return {}
3096             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3097             if ctr:
3098                 mimetype, codecs = ctr.groups()
3099                 f = parse_codecs(codecs)
3100                 f['ext'] = mimetype2ext(mimetype)
3101                 return f
3102             return {}
3103
3104         def _media_formats(src, cur_media_type, type_info=None):
3105             type_info = type_info or {}
3106             full_url = absolute_url(src)
3107             ext = type_info.get('ext') or determine_ext(full_url)
3108             if ext == 'm3u8':
3109                 is_plain_url = False
3110                 formats = self._extract_m3u8_formats(
3111                     full_url, video_id, ext='mp4',
3112                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3113                     preference=preference, quality=quality, fatal=False)
3114             elif ext == 'mpd':
3115                 is_plain_url = False
3116                 formats = self._extract_mpd_formats(
3117                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3118             else:
3119                 is_plain_url = True
3120                 formats = [{
3121                     'url': full_url,
3122                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3123                     'ext': ext,
3124                 }]
3125             return is_plain_url, formats
3126
3127         entries = []
3128         # amp-video and amp-audio are very similar to their HTML5 counterparts
3129         # so we will include them right here (see
3130         # https://www.ampproject.org/docs/reference/components/amp-video)
3131         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3132         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3133         media_tags = [(media_tag, media_tag_name, media_type, '')
3134                       for media_tag, media_tag_name, media_type
3135                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3136         media_tags.extend(re.findall(
3137             # We only allow video|audio followed by a whitespace or '>'.
3138             # Allowing more characters may end up in significant slow down (see
3139             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3140             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3141             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3142         for media_tag, _, media_type, media_content in media_tags:
3143             media_info = {
3144                 'formats': [],
3145                 'subtitles': {},
3146             }
3147             media_attributes = extract_attributes(media_tag)
3148             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3149             if src:
3150                 f = parse_content_type(media_attributes.get('type'))
3151                 _, formats = _media_formats(src, media_type, f)
3152                 media_info['formats'].extend(formats)
3153             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3154             if media_content:
3155                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3156                     s_attr = extract_attributes(source_tag)
3157                     # data-video-src and data-src are non standard but seen
3158                     # several times in the wild
3159                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3160                     if not src:
3161                         continue
3162                     f = parse_content_type(s_attr.get('type'))
3163                     is_plain_url, formats = _media_formats(src, media_type, f)
3164                     if is_plain_url:
3165                         # width, height, res, label and title attributes are
3166                         # all not standard but seen several times in the wild
3167                         labels = [
3168                             s_attr.get(lbl)
3169                             for lbl in ('label', 'title')
3170                             if str_or_none(s_attr.get(lbl))
3171                         ]
3172                         width = int_or_none(s_attr.get('width'))
3173                         height = (int_or_none(s_attr.get('height'))
3174                                   or int_or_none(s_attr.get('res')))
3175                         if not width or not height:
3176                             for lbl in labels:
3177                                 resolution = parse_resolution(lbl)
3178                                 if not resolution:
3179                                     continue
3180                                 width = width or resolution.get('width')
3181                                 height = height or resolution.get('height')
3182                         for lbl in labels:
3183                             tbr = parse_bitrate(lbl)
3184                             if tbr:
3185                                 break
3186                         else:
3187                             tbr = None
3188                         f.update({
3189                             'width': width,
3190                             'height': height,
3191                             'tbr': tbr,
3192                             'format_id': s_attr.get('label') or s_attr.get('title'),
3193                         })
3194                         f.update(formats[0])
3195                         media_info['formats'].append(f)
3196                     else:
3197                         media_info['formats'].extend(formats)
3198                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3199                     track_attributes = extract_attributes(track_tag)
3200                     kind = track_attributes.get('kind')
3201                     if not kind or kind in ('subtitles', 'captions'):
3202                         src = strip_or_none(track_attributes.get('src'))
3203                         if not src:
3204                             continue
3205                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3206                         media_info['subtitles'].setdefault(lang, []).append({
3207                             'url': absolute_url(src),
3208                         })
3209             for f in media_info['formats']:
3210                 f.setdefault('http_headers', {})['Referer'] = base_url
3211             if media_info['formats'] or media_info['subtitles']:
3212                 entries.append(media_info)
3213         return entries
3214
3215     def _extract_akamai_formats(self, *args, **kwargs):
3216         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3217         if subs:
3218             self._report_ignoring_subs('akamai')
3219         return fmts
3220
3221     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3222         signed = 'hdnea=' in manifest_url
3223         if not signed:
3224             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3225             manifest_url = re.sub(
3226                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3227                 '', manifest_url).strip('?')
3228
3229         formats = []
3230         subtitles = {}
3231
3232         hdcore_sign = 'hdcore=3.7.0'
3233         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3234         hds_host = hosts.get('hds')
3235         if hds_host:
3236             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3237         if 'hdcore=' not in f4m_url:
3238             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3239         f4m_formats = self._extract_f4m_formats(
3240             f4m_url, video_id, f4m_id='hds', fatal=False)
3241         for entry in f4m_formats:
3242             entry.update({'extra_param_to_segment_url': hdcore_sign})
3243         formats.extend(f4m_formats)
3244
3245         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3246         hls_host = hosts.get('hls')
3247         if hls_host:
3248             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3249         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3250             m3u8_url, video_id, 'mp4', 'm3u8_native',
3251             m3u8_id='hls', fatal=False)
3252         formats.extend(m3u8_formats)
3253         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3254
3255         http_host = hosts.get('http')
3256         if http_host and m3u8_formats and not signed:
3257             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3258             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3259             qualities_length = len(qualities)
3260             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3261                 i = 0
3262                 for f in m3u8_formats:
3263                     if f['vcodec'] != 'none':
3264                         for protocol in ('http', 'https'):
3265                             http_f = f.copy()
3266                             del http_f['manifest_url']
3267                             http_url = re.sub(
3268                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3269                             http_f.update({
3270                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3271                                 'url': http_url,
3272                                 'protocol': protocol,
3273                             })
3274                             formats.append(http_f)
3275                         i += 1
3276
3277         return formats, subtitles
3278
3279     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3280         query = urllib.parse.urlparse(url).query
3281         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3282         mobj = re.search(
3283             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3284         url_base = mobj.group('url')
3285         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3286         formats = []
3287
3288         def manifest_url(manifest):
3289             m_url = f'{http_base_url}/{manifest}'
3290             if query:
3291                 m_url += '?%s' % query
3292             return m_url
3293
3294         if 'm3u8' not in skip_protocols:
3295             formats.extend(self._extract_m3u8_formats(
3296                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3297                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3298         if 'f4m' not in skip_protocols:
3299             formats.extend(self._extract_f4m_formats(
3300                 manifest_url('manifest.f4m'),
3301                 video_id, f4m_id='hds', fatal=False))
3302         if 'dash' not in skip_protocols:
3303             formats.extend(self._extract_mpd_formats(
3304                 manifest_url('manifest.mpd'),
3305                 video_id, mpd_id='dash', fatal=False))
3306         if re.search(r'(?:/smil:|\.smil)', url_base):
3307             if 'smil' not in skip_protocols:
3308                 rtmp_formats = self._extract_smil_formats(
3309                     manifest_url('jwplayer.smil'),
3310                     video_id, fatal=False)
3311                 for rtmp_format in rtmp_formats:
3312                     rtsp_format = rtmp_format.copy()
3313                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3314                     del rtsp_format['play_path']
3315                     del rtsp_format['ext']
3316                     rtsp_format.update({
3317                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3318                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3319                         'protocol': 'rtsp',
3320                     })
3321                     formats.extend([rtmp_format, rtsp_format])
3322         else:
3323             for protocol in ('rtmp', 'rtsp'):
3324                 if protocol not in skip_protocols:
3325                     formats.append({
3326                         'url': f'{protocol}:{url_base}',
3327                         'format_id': protocol,
3328                         'protocol': protocol,
3329                     })
3330         return formats
3331
3332     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3333         mobj = re.search(
3334             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3335             webpage)
3336         if mobj:
3337             try:
3338                 jwplayer_data = self._parse_json(mobj.group('options'),
3339                                                  video_id=video_id,
3340                                                  transform_source=transform_source)
3341             except ExtractorError:
3342                 pass
3343             else:
3344                 if isinstance(jwplayer_data, dict):
3345                     return jwplayer_data
3346
3347     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3348         jwplayer_data = self._find_jwplayer_data(
3349             webpage, video_id, transform_source=js_to_json)
3350         return self._parse_jwplayer_data(
3351             jwplayer_data, video_id, *args, **kwargs)
3352
3353     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3354                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3355         entries = []
3356         if not isinstance(jwplayer_data, dict):
3357             return entries
3358
3359         playlist_items = jwplayer_data.get('playlist')
3360         # JWPlayer backward compatibility: single playlist item/flattened playlists
3361         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3362         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3363         if not isinstance(playlist_items, list):
3364             playlist_items = (playlist_items or jwplayer_data, )
3365
3366         for video_data in playlist_items:
3367             if not isinstance(video_data, dict):
3368                 continue
3369             # JWPlayer backward compatibility: flattened sources
3370             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3371             if 'sources' not in video_data:
3372                 video_data['sources'] = [video_data]
3373
3374             this_video_id = video_id or video_data['mediaid']
3375
3376             formats = self._parse_jwplayer_formats(
3377                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3378                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3379
3380             subtitles = {}
3381             tracks = video_data.get('tracks')
3382             if tracks and isinstance(tracks, list):
3383                 for track in tracks:
3384                     if not isinstance(track, dict):
3385                         continue
3386                     track_kind = track.get('kind')
3387                     if not track_kind or not isinstance(track_kind, str):
3388                         continue
3389                     if track_kind.lower() not in ('captions', 'subtitles'):
3390                         continue
3391                     track_url = urljoin(base_url, track.get('file'))
3392                     if not track_url:
3393                         continue
3394                     subtitles.setdefault(track.get('label') or 'en', []).append({
3395                         'url': self._proto_relative_url(track_url)
3396                     })
3397
3398             entry = {
3399                 'id': this_video_id,
3400                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3401                 'description': clean_html(video_data.get('description')),
3402                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3403                 'timestamp': int_or_none(video_data.get('pubdate')),
3404                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3405                 'subtitles': subtitles,
3406                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3407                 'genre': clean_html(video_data.get('genre')),
3408                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3409                 'season_number': int_or_none(video_data.get('season')),
3410                 'episode_number': int_or_none(video_data.get('episode')),
3411                 'release_year': int_or_none(video_data.get('releasedate')),
3412                 'age_limit': int_or_none(video_data.get('age_restriction')),
3413             }
3414             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3415             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3416                 entry.update({
3417                     '_type': 'url_transparent',
3418                     'url': formats[0]['url'],
3419                 })
3420             else:
3421                 entry['formats'] = formats
3422             entries.append(entry)
3423         if len(entries) == 1:
3424             return entries[0]
3425         else:
3426             return self.playlist_result(entries)
3427
3428     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3429                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3430         urls = set()
3431         formats = []
3432         for source in jwplayer_sources_data:
3433             if not isinstance(source, dict):
3434                 continue
3435             source_url = urljoin(
3436                 base_url, self._proto_relative_url(source.get('file')))
3437             if not source_url or source_url in urls:
3438                 continue
3439             urls.add(source_url)
3440             source_type = source.get('type') or ''
3441             ext = mimetype2ext(source_type) or determine_ext(source_url)
3442             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3443                 formats.extend(self._extract_m3u8_formats(
3444                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3445                     m3u8_id=m3u8_id, fatal=False))
3446             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3447                 formats.extend(self._extract_mpd_formats(
3448                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3449             elif ext == 'smil':
3450                 formats.extend(self._extract_smil_formats(
3451                     source_url, video_id, fatal=False))
3452             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3453             elif source_type.startswith('audio') or ext in (
3454                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3455                 formats.append({
3456                     'url': source_url,
3457                     'vcodec': 'none',
3458                     'ext': ext,
3459                 })
3460             else:
3461                 format_id = str_or_none(source.get('label'))
3462                 height = int_or_none(source.get('height'))
3463                 if height is None and format_id:
3464                     # Often no height is provided but there is a label in
3465                     # format like "1080p", "720p SD", or 1080.
3466                     height = parse_resolution(format_id).get('height')
3467                 a_format = {
3468                     'url': source_url,
3469                     'width': int_or_none(source.get('width')),
3470                     'height': height,
3471                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3472                     'filesize': int_or_none(source.get('filesize')),
3473                     'ext': ext,
3474                     'format_id': format_id
3475                 }
3476                 if source_url.startswith('rtmp'):
3477                     a_format['ext'] = 'flv'
3478                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3479                     # of jwplayer.flash.swf
3480                     rtmp_url_parts = re.split(
3481                         r'((?:mp4|mp3|flv):)', source_url, 1)
3482                     if len(rtmp_url_parts) == 3:
3483                         rtmp_url, prefix, play_path = rtmp_url_parts
3484                         a_format.update({
3485                             'url': rtmp_url,
3486                             'play_path': prefix + play_path,
3487                         })
3488                     if rtmp_params:
3489                         a_format.update(rtmp_params)
3490                 formats.append(a_format)
3491         return formats
3492
3493     def _live_title(self, name):
3494         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3495         return name
3496
3497     def _int(self, v, name, fatal=False, **kwargs):
3498         res = int_or_none(v, **kwargs)
3499         if res is None:
3500             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3501             if fatal:
3502                 raise ExtractorError(msg)
3503             else:
3504                 self.report_warning(msg)
3505         return res
3506
3507     def _float(self, v, name, fatal=False, **kwargs):
3508         res = float_or_none(v, **kwargs)
3509         if res is None:
3510             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3511             if fatal:
3512                 raise ExtractorError(msg)
3513             else:
3514                 self.report_warning(msg)
3515         return res
3516
3517     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3518                     path='/', secure=False, discard=False, rest={}, **kwargs):
3519         cookie = http.cookiejar.Cookie(
3520             0, name, value, port, port is not None, domain, True,
3521             domain.startswith('.'), path, True, secure, expire_time,
3522             discard, None, None, rest)
3523         self.cookiejar.set_cookie(cookie)
3524
3525     def _get_cookies(self, url):
3526         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3527         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3528
3529     def _apply_first_set_cookie_header(self, url_handle, cookie):
3530         """
3531         Apply first Set-Cookie header instead of the last. Experimental.
3532
3533         Some sites (e.g. [1-3]) may serve two cookies under the same name
3534         in Set-Cookie header and expect the first (old) one to be set rather
3535         than second (new). However, as of RFC6265 the newer one cookie
3536         should be set into cookie store what actually happens.
3537         We will workaround this issue by resetting the cookie to
3538         the first one manually.
3539         1. https://new.vk.com/
3540         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3541         3. https://learning.oreilly.com/
3542         """
3543         for header, cookies in url_handle.headers.items():
3544             if header.lower() != 'set-cookie':
3545                 continue
3546             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3547             cookie_value = re.search(
3548                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3549             if cookie_value:
3550                 value, domain = cookie_value.groups()
3551                 self._set_cookie(domain, cookie, value)
3552                 break
3553
3554     @classmethod
3555     def get_testcases(cls, include_onlymatching=False):
3556         # Do not look in super classes
3557         t = vars(cls).get('_TEST')
3558         if t:
3559             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3560             tests = [t]
3561         else:
3562             tests = vars(cls).get('_TESTS', [])
3563         for t in tests:
3564             if not include_onlymatching and t.get('only_matching', False):
3565                 continue
3566             t['name'] = cls.ie_key()
3567             yield t
3568         if getattr(cls, '__wrapped__', None):
3569             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3570
3571     @classmethod
3572     def get_webpage_testcases(cls):
3573         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3574         for t in tests:
3575             t['name'] = cls.ie_key()
3576             yield t
3577         if getattr(cls, '__wrapped__', None):
3578             yield from cls.__wrapped__.get_webpage_testcases()
3579
3580     @classproperty(cache=True)
3581     def age_limit(cls):
3582         """Get age limit from the testcases"""
3583         return max(traverse_obj(
3584             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3585             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3586
3587     @classproperty(cache=True)
3588     def _RETURN_TYPE(cls):
3589         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3590         tests = tuple(cls.get_testcases(include_onlymatching=False))
3591         if not tests:
3592             return None
3593         elif not any(k.startswith('playlist') for test in tests for k in test):
3594             return 'video'
3595         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3596             return 'playlist'
3597         return 'any'
3598
3599     @classmethod
3600     def is_single_video(cls, url):
3601         """Returns whether the URL is of a single video, None if unknown"""
3602         if cls.suitable(url):
3603             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3604
3605     @classmethod
3606     def is_suitable(cls, age_limit):
3607         """Test whether the extractor is generally suitable for the given age limit"""
3608         return not age_restricted(cls.age_limit, age_limit)
3609
3610     @classmethod
3611     def description(cls, *, markdown=True, search_examples=None):
3612         """Description of the extractor"""
3613         desc = ''
3614         if cls._NETRC_MACHINE:
3615             if markdown:
3616                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3617             else:
3618                 desc += f' [{cls._NETRC_MACHINE}]'
3619         if cls.IE_DESC is False:
3620             desc += ' [HIDDEN]'
3621         elif cls.IE_DESC:
3622             desc += f' {cls.IE_DESC}'
3623         if cls.SEARCH_KEY:
3624             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3625             if search_examples:
3626                 _COUNTS = ('', '5', '10', 'all')
3627                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3628         if not cls.working():
3629             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3630
3631         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3632         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3633         return f'{name}:{desc}' if desc else name
3634
3635     def extract_subtitles(self, *args, **kwargs):
3636         if (self.get_param('writesubtitles', False)
3637                 or self.get_param('listsubtitles')):
3638             return self._get_subtitles(*args, **kwargs)
3639         return {}
3640
3641     def _get_subtitles(self, *args, **kwargs):
3642         raise NotImplementedError('This method must be implemented by subclasses')
3643
    class CommentsDisabled(Exception):
        """Raise in _get_comments if comments are disabled for the video

        extract_comments() catches this and reports both "comments" and
        "comment_count" as None.
        """
3646
3647     def extract_comments(self, *args, **kwargs):
3648         if not self.get_param('getcomments'):
3649             return None
3650         generator = self._get_comments(*args, **kwargs)
3651
3652         def extractor():
3653             comments = []
3654             interrupted = True
3655             try:
3656                 while True:
3657                     comments.append(next(generator))
3658             except StopIteration:
3659                 interrupted = False
3660             except KeyboardInterrupt:
3661                 self.to_screen('Interrupted by user')
3662             except self.CommentsDisabled:
3663                 return {'comments': None, 'comment_count': None}
3664             except Exception as e:
3665                 if self.get_param('ignoreerrors') is not True:
3666                     raise
3667                 self._downloader.report_error(e)
3668             comment_count = len(comments)
3669             self.to_screen(f'Extracted {comment_count} comments')
3670             return {
3671                 'comments': comments,
3672                 'comment_count': None if interrupted else comment_count
3673             }
3674         return extractor
3675
3676     def _get_comments(self, *args, **kwargs):
3677         raise NotImplementedError('This method must be implemented by subclasses')
3678
3679     @staticmethod
3680     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3681         """ Merge subtitle items for one language. Items with duplicated URLs/data
3682         will be dropped. """
3683         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3684         ret = list(subtitle_list1)
3685         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3686         return ret
3687
3688     @classmethod
3689     def _merge_subtitles(cls, *dicts, target=None):
3690         """ Merge subtitle dictionaries, language by language. """
3691         if target is None:
3692             target = {}
3693         for d in dicts:
3694             for lang, subs in d.items():
3695                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3696         return target
3697
3698     def extract_automatic_captions(self, *args, **kwargs):
3699         if (self.get_param('writeautomaticsub', False)
3700                 or self.get_param('listsubtitles')):
3701             return self._get_automatic_captions(*args, **kwargs)
3702         return {}
3703
3704     def _get_automatic_captions(self, *args, **kwargs):
3705         raise NotImplementedError('This method must be implemented by subclasses')
3706
3707     @functools.cached_property
3708     def _cookies_passed(self):
3709         """Whether cookies have been passed to YoutubeDL"""
3710         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3711
3712     def mark_watched(self, *args, **kwargs):
3713         if not self.get_param('mark_watched', False):
3714             return
3715         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3716             self._mark_watched(*args, **kwargs)
3717
3718     def _mark_watched(self, *args, **kwargs):
3719         raise NotImplementedError('This method must be implemented by subclasses')
3720
3721     def geo_verification_headers(self):
3722         headers = {}
3723         geo_verification_proxy = self.get_param('geo_verification_proxy')
3724         if geo_verification_proxy:
3725             headers['Ytdl-request-proxy'] = geo_verification_proxy
3726         return headers
3727
3728     @staticmethod
3729     def _generic_id(url):
3730         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3731
3732     def _generic_title(self, url='', webpage='', *, default=None):
3733         return (self._og_search_title(webpage, default=None)
3734                 or self._html_extract_title(webpage, default=None)
3735                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3736                 or default)
3737
3738     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3739         if not duration:
3740             return
3741         chapter_list = [{
3742             'start_time': start_function(chapter),
3743             'title': title_function(chapter),
3744         } for chapter in chapter_list or []]
3745         if strict:
3746             warn = self.report_warning
3747         else:
3748             warn = self.write_debug
3749             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3750
3751         chapters = [{'start_time': 0}]
3752         for idx, chapter in enumerate(chapter_list):
3753             if chapter['start_time'] is None:
3754                 warn(f'Incomplete chapter {idx}')
3755             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3756                 chapters.append(chapter)
3757             elif chapter not in chapters:
3758                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3759                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3760                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3761         return chapters[1:]
3762
3763     def _extract_chapters_from_description(self, description, duration):
3764         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3765         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3766         return self._extract_chapters_helper(
3767             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3768             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3769             duration=duration, strict=False) or self._extract_chapters_helper(
3770             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3771             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3772             duration=duration, strict=False)
3773
3774     @staticmethod
3775     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3776         all_known = all(map(
3777             lambda x: x is not None,
3778             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3779         return (
3780             'private' if is_private
3781             else 'premium_only' if needs_premium
3782             else 'subscriber_only' if needs_subscription
3783             else 'needs_auth' if needs_auth
3784             else 'unlisted' if is_unlisted
3785             else 'public' if all_known
3786             else None)
3787
3788     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3789         '''
3790         @returns            A list of values for the extractor argument given by "key"
3791                             or "default" if no such key is present
3792         @param default      The default value to return when the key is not present (default: [])
3793         @param casesense    When false, the values are converted to lower case
3794         '''
3795         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3796         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3797         if val is None:
3798             return [] if default is NO_DEFAULT else default
3799         return list(val) if casesense else [x.lower() for x in val]
3800
3801     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3802         if not playlist_id or not video_id:
3803             return not video_id
3804
3805         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3806         if no_playlist is not None:
3807             return not no_playlist
3808
3809         video_id = '' if video_id is True else f' {video_id}'
3810         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3811         if self.get_param('noplaylist'):
3812             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3813             return False
3814         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3815         return True
3816
3817     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3818         RetryManager.report_retry(
3819             err, _count or int(fatal), _retries,
3820             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3821             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3822
3823     def RetryManager(self, **kwargs):
3824         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3825
3826     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3827         display_id = traverse_obj(info_dict, 'display_id', 'id')
3828         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3829         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3830             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3831
3832     @classmethod
3833     def extract_from_webpage(cls, ydl, url, webpage):
3834         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3835               else ydl.get_info_extractor(cls.ie_key()))
3836         for info in ie._extract_from_webpage(url, webpage) or []:
3837             # url = None since we do not want to set (webpage/original)_url
3838             ydl.add_default_extra_info(info, ie, None)
3839             yield info
3840
3841     @classmethod
3842     def _extract_from_webpage(cls, url, webpage):
3843         for embed_url in orderedSet(
3844                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3845             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3846
3847     @classmethod
3848     def _extract_embed_urls(cls, url, webpage):
3849         """@returns all the embed urls on the webpage"""
3850         if '_EMBED_URL_RE' not in cls.__dict__:
3851             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3852             for idx, regex in enumerate(cls._EMBED_REGEX):
3853                 assert regex.count('(?P<url>') == 1, \
3854                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3855             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3856
3857         for regex in cls._EMBED_URL_RE:
3858             for mobj in regex.finditer(webpage):
3859                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3860                 if cls._VALID_URL is False or cls.suitable(embed_url):
3861                     yield embed_url
3862
    class StopExtraction(Exception):
        # NOTE(review): not raised or caught anywhere in this part of the file;
        # presumably used to abort an extraction early - confirm at the usage sites
        pass
3865
3866     @classmethod
3867     def _extract_url(cls, webpage):  # TODO: Remove
3868         """Only for compatibility with some older extractors"""
3869         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3870
    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """Register a subclass as a plugin override when `plugin_name` is given.

        @param plugin_name  Class keyword argument (`class X(Base, plugin_name=...)`);
                            when falsy, nothing beyond the default subclass
                            initialisation happens
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class following cls in the MRO is the one being overridden
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            # The override keeps the ie_key of the class it wraps
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the chain of overrides down to the innermost wrapped class
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind that class's name in its defining module to the new override
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)
3884
3885
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    where an empty prefix requests 1 result, "all" requests _MAX_RESULTS
    and a number requests that many results.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the search URL into prefix and query and fetch the requested number of results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Clamp the request to what the site can deliver
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # An infinite n means "no limit" for islice
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3929
3930
class UnsupportedURLIE(InfoExtractor):
    # Extractor whose _real_extract always raises UnsupportedError for the URL
    _VALID_URL = '.*'  # the pattern itself places no restriction on the URL
    _ENABLED = False  # NOTE(review): presumably keeps it out of the default extractor set - confirm
    IE_DESC = False  # InfoExtractor.description() renders a False IE_DESC as "[HIDDEN]"

    def _real_extract(self, url):
        raise UnsupportedError(url)
3938
3939
# Maps the innermost overridden extractor class to the list of plugin classes
# that InfoExtractor.__init_subclass__ registered for it (via `plugin_name`)
_PLUGIN_OVERRIDES = collections.defaultdict(list)