yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.parse
  21 import urllib.request
  22 import xml.etree.ElementTree
  23
  24 from ..compat import functools  # isort: split
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..utils import (
  41     IDENTITY,
  42     JSON_LD_RE,
  43     NO_DEFAULT,
  44     ExtractorError,
  45     FormatSorter,
  46     GeoRestrictedError,
  47     GeoUtils,
  48     LenientJSONDecoder,
  49     Popen,
  50     RegexNotFoundError,
  51     RetryManager,
  52     UnsupportedError,
  53     age_restricted,
  54     base_url,
  55     bug_reports_message,
  56     classproperty,
  57     clean_html,
  58     deprecation_warning,
  59     determine_ext,
  60     dict_get,
  61     encode_data_uri,
  62     error_to_compat_str,
  63     extract_attributes,
  64     filter_dict,
  65     fix_xml_ampersands,
  66     float_or_none,
  67     format_field,
  68     int_or_none,
  69     join_nonempty,
  70     js_to_json,
  71     mimetype2ext,
  72     netrc_from_content,
  73     orderedSet,
  74     parse_bitrate,
  75     parse_codecs,
  76     parse_duration,
  77     parse_iso8601,
  78     parse_m3u8_attributes,
  79     parse_resolution,
  80     sanitize_filename,
  81     sanitize_url,
  82     smuggle_url,
  83     str_or_none,
  84     str_to_int,
  85     strip_or_none,
  86     traverse_obj,
  87     truncate_string,
  88     try_call,
  89     try_get,
  90     unescapeHTML,
  91     unified_strdate,
  92     unified_timestamp,
  93     url_basename,
  94     url_or_none,
  95     urlhandle_detect_ext,
  96     urljoin,
  97     variadic,
  98     xpath_element,
  99     xpath_text,
 100     xpath_with_ns,
 101 )
 102
 103
 104 class InfoExtractor:
 105     """Information Extractor class.
 106
 107     Information extractors are the classes that, given a URL, extract
 108     information about the video (or videos) the URL refers to. This
 109     information includes the real video URL, the video title, author and
 110     others. The information is stored in a dictionary which is then
 111     passed to the YoutubeDL. The YoutubeDL processes this
 112     information possibly downloading the video to the file system, among
 113     other possible outcomes.
 114
 115     The type field determines the type of the result.
 116     By far the most common value (and the default if _type is missing) is
 117     "video", which indicates a single video.
 118
 119     For a video, the dictionaries must include the following fields:
 120
 121     id:             Video identifier.
 122     title:          Video title, unescaped. Set to an empty string if video has
 123                     no title as opposed to "None" which signifies that the
 124                     extractor failed to obtain a title
 125
 126     Additionally, it must contain either a formats entry or a url one:
 127
 128     formats:        A list of dictionaries for each format available, ordered
 129                     from worst to best quality.
 130
 131                     Potential fields:
 132                     * url        The mandatory URL representing the media:
 133                                    for plain file media - HTTP URL of this file,
 134                                    for RTMP - RTMP URL,
 135                                    for HLS - URL of the M3U8 media playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH
 138                                      - HTTP URL to plain file media (in case of
 139                                        unfragmented media)
 140                                      - URL of the MPD manifest or base URL
 141                                        representing the media if MPD manifest
 142                                        is parsed from a string (in case of
 143                                        fragmented media)
 144                                    for MSS - URL of the ISM manifest.
 145                     * request_data  Data to send in POST request to the URL
 146                     * manifest_url
 147                                  The URL of the manifest file in case of
 148                                  fragmented media:
 149                                    for HLS - URL of the M3U8 master playlist,
 150                                    for HDS - URL of the F4M manifest,
 151                                    for DASH - URL of the MPD manifest,
 152                                    for MSS - URL of the ISM manifest.
 153                     * manifest_stream_number  (For internal use only)
 154                                  The index of the stream in the manifest file
 155                     * ext        Will be calculated from URL if missing
 156                     * format     A human-readable description of the format
 157                                  ("mp4 container with h264/opus").
 158                                  Calculated from the format_id, width, height,
 159                                  and format_note fields if missing.
 160                     * format_id  A short description of the format
 161                                  ("mp4_h264_opus" or "19").
 162                                 Technically optional, but strongly recommended.
 163                     * format_note Additional info about the format
 164                                  ("3D" or "DASH video")
 165                     * width      Width of the video, if known
 166                     * height     Height of the video, if known
 167                     * aspect_ratio  Aspect ratio of the video, if known
 168                                  Automatically calculated from width and height
 169                     * resolution Textual description of width and height
 170                                  Automatically calculated from width and height
 171                     * dynamic_range The dynamic range of the video. One of:
 172                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 173                     * tbr        Average bitrate of audio and video in KBit/s
 174                     * abr        Average audio bitrate in KBit/s
 175                     * acodec     Name of the audio codec in use
 176                     * asr        Audio sampling rate in Hertz
 177                     * audio_channels  Number of audio channels
 178                     * vbr        Average video bitrate in KBit/s
 179                     * fps        Frame rate
 180                     * vcodec     Name of the video codec in use
 181                     * container  Name of the container format
 182                     * filesize   The number of bytes, if known in advance
 183                     * filesize_approx  An estimate for the number of bytes
 184                     * player_url SWF Player URL (used for rtmpdump).
 185                     * protocol   The protocol that will be used for the actual
 186                                  download, lower-case. One of "http", "https" or
 187                                  one of the protocols defined in downloader.PROTOCOL_MAP
 188                     * fragment_base_url
 189                                  Base URL for fragments. Each fragment's path
 190                                  value (if present) will be relative to
 191                                  this URL.
 192                     * fragments  A list of fragments of a fragmented media.
 193                                  Each fragment entry must contain either an url
 194                                  or a path. If an url is present it should be
 195                                  considered by a client. Otherwise both path and
 196                                  fragment_base_url must be present. Here is
 197                                  the list of all potential fields:
 198                                  * "url" - fragment's URL
 199                                  * "path" - fragment's path relative to
 200                                             fragment_base_url
 201                                  * "duration" (optional, int or float)
 202                                  * "filesize" (optional, int)
 203                     * is_from_start  Is a live format that can be downloaded
 204                                 from the start. Boolean
 205                     * preference Order number of this format. If this field is
 206                                  present and not None, the formats get sorted
 207                                  by this field, regardless of all other values.
 208                                  -1 for default (order by other properties),
 209                                  -2 or smaller for less than default.
 210                                  < -1000 to hide the format (if there is
 211                                     another one which is strictly better)
 212                     * language   Language code, e.g. "de" or "en-US".
 213                     * language_preference  Is this in the language mentioned in
 214                                  the URL?
 215                                  10 if it's what the URL is about,
 216                                  -1 for default (don't know),
 217                                  -10 otherwise, other values reserved for now.
 218                     * quality    Order number of the video quality of this
 219                                  format, irrespective of the file format.
 220                                  -1 for default (order by other properties),
 221                                  -2 or smaller for less than default.
 222                     * source_preference  Order number for this video source
 223                                   (quality takes higher priority)
 224                                  -1 for default (order by other properties),
 225                                  -2 or smaller for less than default.
 226                     * http_headers  A dictionary of additional HTTP headers
 227                                  to add to the request.
 228                     * stretched_ratio  If given and not 1, indicates that the
 229                                  video's pixels are not square.
 230                                  width : height ratio as float.
 231                     * no_resume  The server does not support resuming the
 232                                  (HTTP or RTMP) download. Boolean.
 233                     * has_drm    True if the format has DRM and cannot be downloaded.
 234                                  'maybe' if the format may have DRM and has to be tested before download.
 235                     * extra_param_to_segment_url  A query string to append to each
 236                                  fragment's URL, or to update each existing query string
 237                                  with. Only applied by the native HLS/DASH downloaders.
 238                     * hls_aes    A dictionary of HLS AES-128 decryption information
 239                                  used by the native HLS downloader to override the
 240                                  values in the media playlist when an '#EXT-X-KEY' tag
 241                                  is present in the playlist:
 242                                  * uri  The URI from which the key will be downloaded
 243                                  * key  The key (as hex) used to decrypt fragments.
 244                                         If `key` is given, any key URI will be ignored
 245                                  * iv   The IV (as hex) used to decrypt fragments
 246                     * downloader_options  A dictionary of downloader options
 247                                  (For internal use only)
 248                                  * http_chunk_size Chunk size for HTTP downloads
 249                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 250                     RTMP formats can also have the additional fields: page_url,
 251                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 252                     rtmp_protocol, rtmp_real_time
 253
 254     url:            Final video URL.
 255     ext:            Video filename extension.
 256     format:         The video format, defaults to ext (used for --get-format)
 257     player_url:     SWF Player URL (used for rtmpdump).
 258
 259     The following fields are optional:
 260
 261     direct:         True if a direct video file was given (must only be set by GenericIE)
 262     alt_title:      A secondary title of the video.
 263     display_id      An alternative identifier for the video, not necessarily
 264                     unique, but available before title. Typically, id is
 265                     something like "4234987", title "Dancing naked mole rats",
 266                     and display_id "dancing-naked-mole-rats"
 267     thumbnails:     A list of dictionaries, with the following entries:
 268                         * "id" (optional, string) - Thumbnail format ID
 269                         * "url"
 270                         * "preference" (optional, int) - quality of the image
 271                         * "width" (optional, int)
 272                         * "height" (optional, int)
 273                         * "resolution" (optional, string "{width}x{height}",
 274                                         deprecated)
 275                         * "filesize" (optional, int)
 276                         * "http_headers" (dict) - HTTP headers for the request
 277     thumbnail:      Full URL to a video thumbnail image.
 278     description:    Full video description.
 279     uploader:       Full name of the video uploader.
 280     license:        License name the video is licensed under.
 281     creator:        The creator of the video.
 282     timestamp:      UNIX timestamp of the moment the video was uploaded
 283     upload_date:    Video upload date in UTC (YYYYMMDD).
 284                     If not explicitly set, calculated from timestamp
 285     release_timestamp: UNIX timestamp of the moment the video was released.
 286                     If it is not clear whether to use timestamp or this, use the former
 287     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 288                     If not explicitly set, calculated from release_timestamp
 289     release_year:   Year (YYYY) as integer when the video or album was released.
 290                     To be used if no exact release date is known.
 291                     If not explicitly set, calculated from release_date.
 292     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 293     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 294                     If not explicitly set, calculated from modified_timestamp
 295     uploader_id:    Nickname or id of the video uploader.
 296     uploader_url:   Full URL to a personal webpage of the video uploader.
 297     channel:        Full name of the channel the video is uploaded on.
 298                     Note that channel fields may or may not repeat uploader
 299                     fields. This depends on a particular extractor.
 300     channel_id:     Id of the channel.
 301     channel_url:    Full URL to a channel webpage.
 302     channel_follower_count: Number of followers of the channel.
 303     channel_is_verified: Whether the channel is verified on the platform.
 304     location:       Physical location where the video was filmed.
 305     subtitles:      The available subtitles as a dictionary in the format
 306                     {tag: subformats}. "tag" is usually a language code, and
 307                     "subformats" is a list sorted from lower to higher
 308                     preference, each element is a dictionary with the "ext"
 309                     entry and one of:
 310                         * "data": The subtitles file contents
 311                         * "url": A URL pointing to the subtitles file
 312                     It can optionally also have:
 313                         * "name": Name or description of the subtitles
 314                         * "http_headers": A dictionary of additional HTTP headers
 315                                   to add to the request.
 316                     "ext" will be calculated from URL if missing
 317     automatic_captions: Like 'subtitles'; contains automatically generated
 318                     captions instead of normal subtitles
 319     duration:       Length of the video in seconds, as an integer or float.
 320     view_count:     How many users have watched the video on the platform.
 321     concurrent_view_count: How many users are currently watching the video on the platform.
 322     like_count:     Number of positive ratings of the video
 323     dislike_count:  Number of negative ratings of the video
 324     repost_count:   Number of reposts of the video
 325     average_rating: Average rating given by users, the scale used depends on the webpage
 326     comment_count:  Number of comments on the video
 327     comments:       A list of comments, each with one or more of the following
 328                     properties (all but one of text or html optional):
 329                         * "author" - human-readable name of the comment author
 330                         * "author_id" - user ID of the comment author
 331                         * "author_thumbnail" - The thumbnail of the comment author
 332                         * "author_url" - The url to the comment author's page
 333                         * "author_is_verified" - Whether the author is verified
 334                                                  on the platform
 335                         * "author_is_uploader" - Whether the comment is made by
 336                                                  the video uploader
 337                         * "id" - Comment ID
 338                         * "html" - Comment as HTML
 339                         * "text" - Plain text of the comment
 340                         * "timestamp" - UNIX timestamp of comment
 341                         * "parent" - ID of the comment this one is replying to.
 342                                      Set to "root" to indicate that this is a
 343                                      comment to the original video.
 344                         * "like_count" - Number of positive ratings of the comment
 345                         * "dislike_count" - Number of negative ratings of the comment
 346                         * "is_favorited" - Whether the comment is marked as
 347                                            favorite by the video uploader
 348                         * "is_pinned" - Whether the comment is pinned to
 349                                         the top of the comments
 350     age_limit:      Age restriction for the video, as an integer (years)
 351     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 352                     should allow to get the same result again. (It will be set
 353                     by YoutubeDL if it's missing)
 354     categories:     A list of categories that the video falls in, for example
 355                     ["Sports", "Berlin"]
 356     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 357     cast:           A list of the video cast
 358     is_live:        True, False, or None (=unknown). Whether this video is a
 359                     live stream that goes on instead of a fixed-length video.
 360     was_live:       True, False, or None (=unknown). Whether this video was
 361                     originally a live stream.
 362     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 363                     or 'post_live' (was live, but VOD is not yet processed)
 364                     If absent, automatically set from is_live, was_live
 365     start_time:     Time in seconds where the reproduction should start, as
 366                     specified in the URL.
 367     end_time:       Time in seconds where the reproduction should end, as
 368                     specified in the URL.
 369     chapters:       A list of dictionaries, with the following entries:
 370                         * "start_time" - The start time of the chapter in seconds
 371                         * "end_time" - The end time of the chapter in seconds
 372                         * "title" (optional, string)
 373     heatmap:        A list of dictionaries, with the following entries:
 374                         * "start_time" - The start time of the data point in seconds
 375                         * "end_time" - The end time of the data point in seconds
 376                         * "value" - The normalized value of the data point (float between 0 and 1)
 377     playable_in_embed: Whether this video is allowed to play in embedded
 378                     players on other sites. Can be True (=always allowed),
 379                     False (=never allowed), None (=unknown), or a string
 380                     specifying the criteria for embedability; e.g. 'whitelist'
 381     availability:   Under what condition the video is available. One of
 382                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 383                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 384                     to set it
 385     media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
 386     _old_archive_ids: A list of old archive ids needed for backward compatibility
 387     _format_sort_fields: A list of fields to use for sorting formats
 388     __post_extractor: A function to be called just before the metadata is
 389                     written to either disk, logger or console. The function
 390                     must return a dict which will be added to the info_dict.
 391                     This is useful for additional information that is
 392                     time-consuming to extract. Note that the fields thus
 393                     extracted will not be available to output template and
 394                     match_filter. So, only "comments" and "comment_count" are
 395                     currently allowed to be extracted via this method.
 396
 397     The following fields should only be used when the video belongs to some logical
 398     chapter or section:
 399
 400     chapter:        Name or title of the chapter the video belongs to.
 401     chapter_number: Number of the chapter the video belongs to, as an integer.
 402     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 403
 404     The following fields should only be used when the video is an episode of some
 405     series, programme or podcast:
 406
 407     series:         Title of the series or programme the video episode belongs to.
 408     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 409     season:         Title of the season the video episode belongs to.
 410     season_number:  Number of the season the video episode belongs to, as an integer.
 411     season_id:      Id of the season the video episode belongs to, as a unicode string.
 412     episode:        Title of the video episode. Unlike mandatory video title field,
 413                     this field should denote the exact title of the video episode
 414                     without any kind of decoration.
 415     episode_number: Number of the video episode within a season, as an integer.
 416     episode_id:     Id of the video episode, as a unicode string.
 417
 418     The following fields should only be used when the media is a track or a part of
 419     a music album:
 420
 421     track:          Title of the track.
 422     track_number:   Number of the track within an album or a disc, as an integer.
 423     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 424                     as a unicode string.
 425     artist:         Artist(s) of the track.
 426     genre:          Genre(s) of the track.
 427     album:          Title of the album the track belongs to.
 428     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 429     album_artist:   List of all artists appeared on the album (e.g.
 430                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 431                     and compilations).
 432     disc_number:    Number of the disc or other physical medium the track belongs to,
 433                     as an integer.
 434     composer:       Composer of the piece
 435
 436     The following fields should only be set for clips that should be cut from the original video:
 437
 438     section_start:  Start time of the section in seconds
 439     section_end:    End time of the section in seconds
 440
 441     The following fields should only be set for storyboards:
 442     rows:           Number of rows in each storyboard fragment, as an integer
 443     columns:        Number of columns in each storyboard fragment, as an integer
 444
 445     Unless mentioned otherwise, the fields should be Unicode strings.
 446
 447     Unless mentioned otherwise, None is equivalent to absence of information.
 448
 449
 450     _type "playlist" indicates multiple videos.
 451     There must be a key "entries", which is a list, an iterable, or a PagedList
 452     object, each element of which is a valid dictionary by this specification.
 453
 454     Additionally, playlists can have "id", "title", and any other relevant
 455     attributes with the same semantics as videos (see above).
 456
 457     It can also have the following optional fields:
 458
 459     playlist_count: The total number of videos in a playlist. If not given,
 460                     YoutubeDL tries to calculate it from "entries"
 461
 462
 463     _type "multi_video" indicates that there are multiple videos that
 464     form a single show, for example multiple acts of an opera or TV episode.
 465     It must have an entries key like a playlist and contain all the keys
 466     required for a video at the same time.
 467
 468
 469     _type "url" indicates that the video must be extracted from another
 470     location, possibly by a different extractor. Its only required key is:
 471     "url" - the next URL to extract.
 472     The key "ie_key" can be set to the class name (minus the trailing "IE",
 473     e.g. "Youtube") if the extractor class is known in advance.
 474     Additionally, the dictionary may have any properties of the resolved entity
 475     known in advance, for example "title" if the title of the referred video is
 476     known ahead of time.
 477
 478
 479     _type "url_transparent" entities have the same specification as "url", but
 480     indicate that the given additional information is more precise than the one
 481     associated with the resolved URL.
 482     This is useful when a site employs a video service that hosts the video and
 483     its technical metadata, but that video service does not embed a useful
 484     title, description etc.
 485
 486
 487     Subclasses of this should also be added to the list of extractors and
 488     should define _VALID_URL as a regexp or a Sequence of regexps, and
 489     re-define the _real_extract() and (optionally) _real_initialize() methods.
 490
 491     Subclasses may also override suitable() if necessary, but ensure the function
 492     signature is preserved and that this function imports everything it needs
 493     (except other extractors), so that lazy_extractors works correctly.
 494
 495     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 496     the HTML of Generic webpages. It may also override _extract_embed_urls
 497     or _extract_from_webpage as necessary. While these are normally classmethods,
 498     _extract_from_webpage is allowed to be an instance method.
 499
 500     _extract_from_webpage may raise self.StopExtraction() to stop further
 501     processing of the webpage and obtain exclusive rights to it. This is useful
 502     when the extractor cannot reliably be matched using just the URL,
 503     e.g. invidious/peertube instances
 504
 505     Embed-only extractors can be defined by setting _VALID_URL = False.
 506
 507     To support username + password (or netrc) login, the extractor must define a
 508     _NETRC_MACHINE and re-define _perform_login(username, password) and
 509     (optionally) _initialize_pre_login() methods. The _perform_login method will
 510     be called between _initialize_pre_login and _real_initialize if credentials
 511     are passed by the user. In cases where it is necessary to have the login
 512     process as part of the extraction rather than initialization, _perform_login
 513     can be left undefined.
 514
 515     _GEO_BYPASS attribute may be set to False in order to disable
 516     geo restriction bypass mechanisms for a particular extractor.
 517     Though it won't disable explicit geo restriction bypass based on
 518     country code provided with geo_bypass_country.
 519
 520     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 521     countries for this extractor. One of these countries will be used by
 522     geo restriction bypass mechanism right away in order to bypass
 523     geo restriction, of course, if the mechanism is not disabled.
 524
 525     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 526     IP blocks in CIDR notation for this extractor. One of these IP blocks
 527     will be used by geo restriction bypass mechanism similarly
 528     to _GEO_COUNTRIES.
 529
 530     The _ENABLED attribute should be set to False for IEs that
 531     are disabled by default and must be explicitly enabled.
 532
 533     The _WORKING attribute should be set to False for broken IEs
 534     in order to warn the users and skip the tests.
 535     """
 536
    # Class-level defaults for per-instance state; __init__ resets _ready and
    # _x_forwarded_for_ip on every new instance.
    _ready = False
    # presumably assigned by set_downloader(), which is defined outside this view - TODO confirm
    _downloader = None
    _x_forwarded_for_ip = None
    # Geo restriction bypass settings (described in the class docstring).
    # Set to False to disable the geo restriction bypass mechanisms for an extractor
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
    # Set to False for IEs that are disabled by default and must be explicitly enabled
    _ENABLED = True
    # Must be defined by extractors supporting username + password (or netrc) login;
    # also used as the fallback machine name in _login_hint()
    _NETRC_MACHINE = None
    # NOTE(review): IE_DESC / SEARCH_KEY are not used in the visible code; presumably a
    # human-readable description and a search prefix - confirm against the rest of the file
    IE_DESC = None
    SEARCH_KEY = None
    # A regexp or a Sequence of regexps; False marks an embed-only extractor
    _VALID_URL = None
    # Regexes that will be searched for in the HTML of Generic webpages
    _EMBED_REGEX = []
 550
 551     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 552         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 553         return {
 554             None: '',
 555             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 556             'password': f'Use {password_hint}',
 557             'cookies': (
 558                 'Use --cookies-from-browser or --cookies for the authentication. '
 559                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 560         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 561
 562     def __init__(self, downloader=None):
 563         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 564         If a downloader is not passed during initialization,
 565         it must be set using "set_downloader()" before "extract()" is called"""
 566         self._ready = False
 567         self._x_forwarded_for_ip = None
 568         self._printed_messages = set()
 569         self.set_downloader(downloader)
 570
 571     @classmethod
 572     def _match_valid_url(cls, url):
 573         if cls._VALID_URL is False:
 574             return None
 575         # This does not use has/getattr intentionally - we want to know whether
 576         # we have cached the regexp for *this* class, whereas getattr would also
 577         # match the superclass
 578         if '_VALID_URL_RE' not in cls.__dict__:
 579             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 580         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 581
 582     @classmethod
 583     def suitable(cls, url):
 584         """Receives a URL and returns True if suitable for this IE."""
 585         # This function must import everything it needs (except other extractors),
 586         # so that lazy_extractors works correctly
 587         return cls._match_valid_url(url) is not None
 588
 589     @classmethod
 590     def _match_id(cls, url):
 591         return cls._match_valid_url(url).group('id')
 592
 593     @classmethod
 594     def get_temp_id(cls, url):
 595         try:
 596             return cls._match_id(url)
 597         except (IndexError, AttributeError):
 598             return None
 599
 600     @classmethod
 601     def working(cls):
 602         """Getter method for _WORKING."""
 603         return cls._WORKING
 604
 605     @classmethod
 606     def supports_login(cls):
 607         return bool(cls._NETRC_MACHINE)
 608
 609     def initialize(self):
 610         """Initializes an instance (authentication, etc)."""
 611         self._printed_messages = set()
 612         self._initialize_geo_bypass({
 613             'countries': self._GEO_COUNTRIES,
 614             'ip_blocks': self._GEO_IP_BLOCKS,
 615         })
 616         if not self._ready:
 617             self._initialize_pre_login()
 618             if self.supports_login():
 619                 username, password = self._get_login_info()
 620                 if username:
 621                     self._perform_login(username, password)
 622             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 623                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 624             self._real_initialize()
 625             self._ready = True
 626
 627     def _initialize_geo_bypass(self, geo_bypass_context):
 628         """
 629         Initialize geo restriction bypass mechanism.
 630
 631         This method is used to initialize geo bypass mechanism based on faking
 632         X-Forwarded-For HTTP header. A random country from provided country list
 633         is selected and a random IP belonging to this country is generated. This
 634         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 635         HTTP requests.
 636
 637         This method will be used for initial geo bypass mechanism initialization
 638         during the instance initialization with _GEO_COUNTRIES and
 639         _GEO_IP_BLOCKS.
 640
 641         You may also manually call it from extractor's code if geo bypass
 642         information is not available beforehand (e.g. obtained during
 643         extraction) or due to some other reason. In this case you should pass
 644         this information in geo bypass context passed as first argument. It may
 645         contain following fields:
 646
 647         countries:  List of geo unrestricted countries (similar
 648                     to _GEO_COUNTRIES)
 649         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 650                     (similar to _GEO_IP_BLOCKS)
 651
 652         """
 653         if not self._x_forwarded_for_ip:
 654
 655             # Geo bypass mechanism is explicitly disabled by user
 656             if not self.get_param('geo_bypass', True):
 657                 return
 658
 659             if not geo_bypass_context:
 660                 geo_bypass_context = {}
 661
 662             # Backward compatibility: previously _initialize_geo_bypass
 663             # expected a list of countries, some 3rd party code may still use
 664             # it this way
 665             if isinstance(geo_bypass_context, (list, tuple)):
 666                 geo_bypass_context = {
 667                     'countries': geo_bypass_context,
 668                 }
 669
 670             # The whole point of geo bypass mechanism is to fake IP
 671             # as X-Forwarded-For HTTP header based on some IP block or
 672             # country code.
 673
 674             # Path 1: bypassing based on IP block in CIDR notation
 675
 676             # Explicit IP block specified by user, use it right away
 677             # regardless of whether extractor is geo bypassable or not
 678             ip_block = self.get_param('geo_bypass_ip_block', None)
 679
 680             # Otherwise use random IP block from geo bypass context but only
 681             # if extractor is known as geo bypassable
 682             if not ip_block:
 683                 ip_blocks = geo_bypass_context.get('ip_blocks')
 684                 if self._GEO_BYPASS and ip_blocks:
 685                     ip_block = random.choice(ip_blocks)
 686
 687             if ip_block:
 688                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 689                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 690                 return
 691
 692             # Path 2: bypassing based on country code
 693
 694             # Explicit country code specified by user, use it right away
 695             # regardless of whether extractor is geo bypassable or not
 696             country = self.get_param('geo_bypass_country', None)
 697
 698             # Otherwise use random country code from geo bypass context but
 699             # only if extractor is known as geo bypassable
 700             if not country:
 701                 countries = geo_bypass_context.get('countries')
 702                 if self._GEO_BYPASS and countries:
 703                     country = random.choice(countries)
 704
 705             if country:
 706                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 707                 self._downloader.write_debug(
 708                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 709
 710     def extract(self, url):
 711         """Extracts URL information and returns it in list of dicts."""
 712         try:
 713             for _ in range(2):
 714                 try:
 715                     self.initialize()
 716                     self.to_screen('Extracting URL: %s' % (
 717                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 718                     ie_result = self._real_extract(url)
 719                     if ie_result is None:
 720                         return None
 721                     if self._x_forwarded_for_ip:
 722                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 723                     subtitles = ie_result.get('subtitles') or {}
 724                     if 'no-live-chat' in self.get_param('compat_opts'):
 725                         for lang in ('live_chat', 'comments', 'danmaku'):
 726                             subtitles.pop(lang, None)
 727                     return ie_result
 728                 except GeoRestrictedError as e:
 729                     if self.__maybe_fake_ip_and_retry(e.countries):
 730                         continue
 731                     raise
 732         except UnsupportedError:
 733             raise
 734         except ExtractorError as e:
 735             e.video_id = e.video_id or self.get_temp_id(url)
 736             e.ie = e.ie or self.IE_NAME,
 737             e.traceback = e.traceback or sys.exc_info()[2]
 738             raise
 739         except IncompleteRead as e:
 740             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 741         except (KeyError, StopIteration) as e:
 742             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 743
 744     def __maybe_fake_ip_and_retry(self, countries):
 745         if (not self.get_param('geo_bypass_country', None)
 746                 and self._GEO_BYPASS
 747                 and self.get_param('geo_bypass', True)
 748                 and not self._x_forwarded_for_ip
 749                 and countries):
 750             country_code = random.choice(countries)
 751             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 752             if self._x_forwarded_for_ip:
 753                 self.report_warning(
 754                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 755                     % (self._x_forwarded_for_ip, country_code.upper()))
 756                 return True
 757         return False
 758
 759     def set_downloader(self, downloader):
 760         """Sets a YoutubeDL instance as the downloader for this IE."""
 761         self._downloader = downloader
 762
 763     @property
 764     def cache(self):
 765         return self._downloader.cache
 766
 767     @property
 768     def cookiejar(self):
 769         return self._downloader.cookiejar
 770
 771     def _initialize_pre_login(self):
 772         """ Initialization before login. Redefine in subclasses."""
 773         pass
 774
 775     def _perform_login(self, username, password):
 776         """ Login with username and password. Redefine in subclasses."""
 777         pass
 778
 779     def _real_initialize(self):
 780         """Real initialization process. Redefine in subclasses."""
 781         pass
 782
 783     def _real_extract(self, url):
 784         """Real extraction process. Redefine in subclasses."""
 785         raise NotImplementedError('This method must be implemented by subclasses')
 786
 787     @classmethod
 788     def ie_key(cls):
 789         """A string for getting the InfoExtractor with get_info_extractor"""
 790         return cls.__name__[:-2]
 791
 792     @classproperty
 793     def IE_NAME(cls):
 794         return cls.__name__[:-2]
 795
 796     @staticmethod
 797     def __can_accept_status_code(err, expected_status):
 798         assert isinstance(err, HTTPError)
 799         if expected_status is None:
 800             return False
 801         elif callable(expected_status):
 802             return expected_status(err.status) is True
 803         else:
 804             return err.status in variadic(expected_status)
 805
 806     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 807         if isinstance(url_or_request, urllib.request.Request):
 808             self._downloader.deprecation_warning(
 809                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 810                 'Use yt_dlp.networking.common.Request instead.')
 811             url_or_request = urllib_req_to_req(url_or_request)
 812         elif not isinstance(url_or_request, Request):
 813             url_or_request = Request(url_or_request)
 814
 815         url_or_request.update(data=data, headers=headers, query=query)
 816         return url_or_request
 817
 818     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 819         """
 820         Return the response handle.
 821
 822         See _download_webpage docstring for arguments specification.
 823         """
 824         if not self._downloader._first_webpage_request:
 825             sleep_interval = self.get_param('sleep_interval_requests') or 0
 826             if sleep_interval > 0:
 827                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 828                 time.sleep(sleep_interval)
 829         else:
 830             self._downloader._first_webpage_request = False
 831
 832         if note is None:
 833             self.report_download_webpage(video_id)
 834         elif note is not False:
 835             if video_id is None:
 836                 self.to_screen(str(note))
 837             else:
 838                 self.to_screen(f'{video_id}: {note}')
 839
 840         # Some sites check X-Forwarded-For HTTP header in order to figure out
 841         # the origin of the client behind proxy. This allows bypassing geo
 842         # restriction by faking this header's value to IP that belongs to some
 843         # geo unrestricted country. We will do so once we encounter any
 844         # geo restriction error.
 845         if self._x_forwarded_for_ip:
 846             headers = (headers or {}).copy()
 847             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 848
 849         try:
 850             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 851         except network_exceptions as err:
 852             if isinstance(err, HTTPError):
 853                 if self.__can_accept_status_code(err, expected_status):
 854                     return err.response
 855
 856             if errnote is False:
 857                 return False
 858             if errnote is None:
 859                 errnote = 'Unable to download webpage'
 860
 861             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 862             if fatal:
 863                 raise ExtractorError(errmsg, cause=err)
 864             else:
 865                 self.report_warning(errmsg)
 866                 return False
 867
 868     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 869                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 870         """
 871         Return a tuple (page content as string, URL handle).
 872
 873         Arguments:
 874         url_or_request -- plain text URL as a string or
 875             a urllib.request.Request object
 876         video_id -- Video/playlist/item identifier (string)
 877
 878         Keyword arguments:
 879         note -- note printed before downloading (string)
 880         errnote -- note printed in case of an error (string)
 881         fatal -- flag denoting whether error should be considered fatal,
 882             i.e. whether it should cause ExtractionError to be raised,
 883             otherwise a warning will be reported and extraction continued
 884         encoding -- encoding for a page content decoding, guessed automatically
 885             when not explicitly specified
 886         data -- POST data (bytes)
 887         headers -- HTTP headers (dict)
 888         query -- URL query (dict)
 889         expected_status -- allows to accept failed HTTP requests (non 2xx
 890             status code) by explicitly specifying a set of accepted status
 891             codes. Can be any of the following entities:
 892                 - an integer type specifying an exact failed status code to
 893                   accept
 894                 - a list or a tuple of integer types specifying a list of
 895                   failed status codes to accept
 896                 - a callable accepting an actual failed status code and
 897                   returning True if it should be accepted
 898             Note that this argument does not affect success status codes (2xx)
 899             which are always accepted.
 900         """
 901
 902         # Strip hashes from the URL (#1038)
 903         if isinstance(url_or_request, str):
 904             url_or_request = url_or_request.partition('#')[0]
 905
 906         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 907         if urlh is False:
 908             assert not fatal
 909             return False
 910         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 911         return (content, urlh)
 912
 913     @staticmethod
 914     def _guess_encoding_from_content(content_type, webpage_bytes):
 915         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 916         if m:
 917             encoding = m.group(1)
 918         else:
 919             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 920                           webpage_bytes[:1024])
 921             if m:
 922                 encoding = m.group(1).decode('ascii')
 923             elif webpage_bytes.startswith(b'\xff\xfe'):
 924                 encoding = 'utf-16'
 925             else:
 926                 encoding = 'utf-8'
 927
 928         return encoding
 929
 930     def __check_blocked(self, content):
 931         first_block = content[:512]
 932         if ('<title>Access to this site is blocked</title>' in content
 933                 and 'Websense' in first_block):
 934             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 935             blocked_iframe = self._html_search_regex(
 936                 r'<iframe src="([^"]+)"', content,
 937                 'Websense information URL', default=None)
 938             if blocked_iframe:
 939                 msg += ' Visit %s for more details' % blocked_iframe
 940             raise ExtractorError(msg, expected=True)
 941         if '<title>The URL you requested has been blocked</title>' in first_block:
 942             msg = (
 943                 'Access to this webpage has been blocked by Indian censorship. '
 944                 'Use a VPN or proxy server (with --proxy) to route around it.')
 945             block_msg = self._html_search_regex(
 946                 r'</h1><p>(.*?)</p>',
 947                 content, 'block message', default=None)
 948             if block_msg:
 949                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 950             raise ExtractorError(msg, expected=True)
 951         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 952                 and 'blocklist.rkn.gov.ru' in content):
 953             raise ExtractorError(
 954                 'Access to this webpage has been blocked by decision of the Russian government. '
 955                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 956                 expected=True)
 957
 958     def _request_dump_filename(self, url, video_id):
 959         basen = f'{video_id}_{url}'
 960         trim_length = self.get_param('trim_file_name') or 240
 961         if len(basen) > trim_length:
 962             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 963             basen = basen[:trim_length - len(h)] + h
 964         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 965         # Working around MAX_PATH limitation on Windows (see
 966         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 967         if compat_os_name == 'nt':
 968             absfilepath = os.path.abspath(filename)
 969             if len(absfilepath) > 259:
 970                 filename = fR'\\?\{absfilepath}'
 971         return filename
 972
 973     def __decode_webpage(self, webpage_bytes, encoding, headers):
 974         if not encoding:
 975             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 976         try:
 977             return webpage_bytes.decode(encoding, 'replace')
 978         except LookupError:
 979             return webpage_bytes.decode('utf-8', 'replace')
 980
 981     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 982         webpage_bytes = urlh.read()
 983         if prefix is not None:
 984             webpage_bytes = prefix + webpage_bytes
 985         if self.get_param('dump_intermediate_pages', False):
 986             self.to_screen('Dumping request to ' + urlh.url)
 987             dump = base64.b64encode(webpage_bytes).decode('ascii')
 988             self._downloader.to_screen(dump)
 989         if self.get_param('write_pages'):
 990             filename = self._request_dump_filename(urlh.url, video_id)
 991             self.to_screen(f'Saving request to {filename}')
 992             with open(filename, 'wb') as outf:
 993                 outf.write(webpage_bytes)
 994
 995         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 996         self.__check_blocked(content)
 997
 998         return content
 999
1000     def __print_error(self, errnote, fatal, video_id, err):
1001         if fatal:
1002             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1003         elif errnote:
1004             self.report_warning(f'{video_id}: {errnote}: {err}')
1005
1006     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1007         if transform_source:
1008             xml_string = transform_source(xml_string)
1009         try:
1010             return compat_etree_fromstring(xml_string.encode('utf-8'))
1011         except xml.etree.ElementTree.ParseError as ve:
1012             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1013
1014     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1015         try:
1016             return json.loads(
1017                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1018         except ValueError as ve:
1019             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1020
1021     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1022         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1023
    def __create_download_methods(name, parser, note, errnote, return_value):
        """Create the pair of methods (_download_<name>_handle, _download_<name>)

        This is called while the class body is executed, hence no self/cls.

        @param name             Used to build the names of the created methods
        @param parser           Name of the method used to parse the downloaded
                                content, or None to return the content as is
        @param note             Default note of the created methods
        @param errnote          Default errnote of the created methods
        @param return_value     Description of the returned value, used in the
                                docstring of the created methods
        @returns                (download_handle, download_content)
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is forwarded to the parser only when it is False
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            # False means that the download failed without raising
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, the content is first looked up in a
            # dump file; the download below only happens if reading it fails
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method is _download_webpage_handle,
            # which does not take transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            """Give func the name, qualified name and docstring of an InfoExtractor method"""
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1087
    # Each call creates a (_download_<name>_handle, _download_<name>) pair of methods
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For webpages no parser is applied and only the content method of the pair is kept
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1095
1096     def _download_webpage(
1097             self, url_or_request, video_id, note=None, errnote=None,
1098             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1099         """
1100         Return the data of the page as a string.
1101
1102         Keyword arguments:
1103         tries -- number of tries
1104         timeout -- sleep interval between tries
1105
1106         See _download_webpage_handle docstring for other arguments specification.
1107         """
1108
1109         R''' # NB: These are unused; should they be deprecated?
1110         if tries != 1:
1111             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1112         if timeout is NO_DEFAULT:
1113             timeout = 5
1114         else:
1115             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1116         '''
1117
1118         try_count = 0
1119         while True:
1120             try:
1121                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1122             except IncompleteRead as e:
1123                 try_count += 1
1124                 if try_count >= tries:
1125                     raise e
1126                 self._sleep(timeout, video_id)
1127
1128     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1129         idstr = format_field(video_id, None, '%s: ')
1130         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1131         if only_once:
1132             if f'WARNING: {msg}' in self._printed_messages:
1133                 return
1134             self._printed_messages.add(f'WARNING: {msg}')
1135         self._downloader.report_warning(msg, *args, **kwargs)
1136
1137     def to_screen(self, msg, *args, **kwargs):
1138         """Print msg to screen, prefixing it with '[ie_name]'"""
1139         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1140
1141     def write_debug(self, msg, *args, **kwargs):
1142         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1143
1144     def get_param(self, name, default=None, *args, **kwargs):
1145         if self._downloader:
1146             return self._downloader.params.get(name, default, *args, **kwargs)
1147         return default
1148
1149     def report_drm(self, video_id, partial=NO_DEFAULT):
1150         if partial is not NO_DEFAULT:
1151             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1152         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1153
1154     def report_extraction(self, id_or_name):
1155         """Report information extraction."""
1156         self.to_screen('%s: Extracting information' % id_or_name)
1157
1158     def report_download_webpage(self, video_id):
1159         """Report webpage download."""
1160         self.to_screen('%s: Downloading webpage' % video_id)
1161
1162     def report_age_confirmation(self):
1163         """Report attempt to confirm age."""
1164         self.to_screen('Confirming age')
1165
1166     def report_login(self):
1167         """Report attempt to log in."""
1168         self.to_screen('Logging in')
1169
1170     def raise_login_required(
1171             self, msg='This video is only available for registered users',
1172             metadata_available=False, method=NO_DEFAULT):
1173         if metadata_available and (
1174                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1175             self.report_warning(msg)
1176             return
1177         msg += format_field(self._login_hint(method), None, '. %s')
1178         raise ExtractorError(msg, expected=True)
1179
1180     def raise_geo_restricted(
1181             self, msg='This video is not available from your location due to geo restriction',
1182             countries=None, metadata_available=False):
1183         if metadata_available and (
1184                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1185             self.report_warning(msg)
1186         else:
1187             raise GeoRestrictedError(msg, countries=countries)
1188
1189     def raise_no_formats(self, msg, expected=False, video_id=None):
1190         if expected and (
1191                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1192             self.report_warning(msg, video_id)
1193         elif isinstance(msg, ExtractorError):
1194             raise msg
1195         else:
1196             raise ExtractorError(msg, expected=expected, video_id=video_id)
1197
1198     # Methods for following #608
1199     @staticmethod
1200     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1201         """Returns a URL that points to a page that should be processed"""
1202         if ie is not None:
1203             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1204         if video_id is not None:
1205             kwargs['id'] = video_id
1206         if video_title is not None:
1207             kwargs['title'] = video_title
1208         return {
1209             **kwargs,
1210             '_type': 'url_transparent' if url_transparent else 'url',
1211             'url': url,
1212         }
1213
1214     @classmethod
1215     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1216                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1217         return cls.playlist_result(
1218             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1219             playlist_id, playlist_title, **kwargs)
1220
1221     @staticmethod
1222     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1223         """Returns a playlist"""
1224         if playlist_id:
1225             kwargs['id'] = playlist_id
1226         if playlist_title:
1227             kwargs['title'] = playlist_title
1228         if playlist_description is not None:
1229             kwargs['description'] = playlist_description
1230         return {
1231             **kwargs,
1232             '_type': 'multi_video' if multi_video else 'playlist',
1233             'entries': entries,
1234         }
1235
1236     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1237         """
1238         Perform a regex search on the given string, using a single or a list of
1239         patterns returning the first matching group.
1240         In case of failure return a default value or raise a WARNING or a
1241         RegexNotFoundError, depending on fatal, specifying the field name.
1242         """
1243         if string is None:
1244             mobj = None
1245         elif isinstance(pattern, (str, re.Pattern)):
1246             mobj = re.search(pattern, string, flags)
1247         else:
1248             for p in pattern:
1249                 mobj = re.search(p, string, flags)
1250                 if mobj:
1251                     break
1252
1253         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1254
1255         if mobj:
1256             if group is None:
1257                 # return the first matching group
1258                 return next(g for g in mobj.groups() if g is not None)
1259             elif isinstance(group, (list, tuple)):
1260                 return tuple(mobj.group(g) for g in group)
1261             else:
1262                 return mobj.group(group)
1263         elif default is not NO_DEFAULT:
1264             return default
1265         elif fatal:
1266             raise RegexNotFoundError('Unable to extract %s' % _name)
1267         else:
1268             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1269             return None
1270
1271     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1272                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1273         """Searches string for the JSON object specified by start_pattern"""
1274         # NB: end_pattern is only used to reduce the size of the initial match
1275         if default is NO_DEFAULT:
1276             default, has_default = {}, False
1277         else:
1278             fatal, has_default = False, True
1279
1280         json_string = self._search_regex(
1281             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1282             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1283         if not json_string:
1284             return default
1285
1286         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1287         try:
1288             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1289         except ExtractorError as e:
1290             if fatal:
1291                 raise ExtractorError(
1292                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1293             elif not has_default:
1294                 self.report_warning(
1295                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1296         return default
1297
1298     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1299         """
1300         Like _search_regex, but strips HTML tags and unescapes entities.
1301         """
1302         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1303         if isinstance(res, tuple):
1304             return tuple(map(clean_html, res))
1305         return clean_html(res)
1306
1307     def _get_netrc_login_info(self, netrc_machine=None):
1308         netrc_machine = netrc_machine or self._NETRC_MACHINE
1309
1310         cmd = self.get_param('netrc_cmd')
1311         if cmd:
1312             cmd = cmd.replace('{}', netrc_machine)
1313             self.to_screen(f'Executing command: {cmd}')
1314             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1315             if ret != 0:
1316                 raise OSError(f'Command returned error code {ret}')
1317             info = netrc_from_content(stdout).authenticators(netrc_machine)
1318
1319         elif self.get_param('usenetrc', False):
1320             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1321             if os.path.isdir(netrc_file):
1322                 netrc_file = os.path.join(netrc_file, '.netrc')
1323             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1324
1325         else:
1326             return None, None
1327         if not info:
1328             raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
1329         return info[0], info[2]
1330
1331     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1332         """
1333         Get the login info as (username, password)
1334         First look for the manually specified credentials using username_option
1335         and password_option as keys in params dictionary. If no such credentials
1336         are available try the netrc_cmd if it is defined or look in the
1337         netrc file using the netrc_machine or _NETRC_MACHINE value.
1338         If there's no info available, return (None, None)
1339         """
1340
1341         username = self.get_param(username_option)
1342         if username is not None:
1343             password = self.get_param(password_option)
1344         else:
1345             try:
1346                 username, password = self._get_netrc_login_info(netrc_machine)
1347             except (OSError, netrc.NetrcParseError) as err:
1348                 self.report_warning(f'Failed to parse .netrc: {err}')
1349                 return None, None
1350         return username, password
1351
1352     def _get_tfa_info(self, note='two-factor verification code'):
1353         """
1354         Get the two-factor authentication info
1355         TODO - asking the user will be required for sms/phone verify
1356         currently just uses the command line option
1357         If there's no info available, return None
1358         """
1359
1360         tfa = self.get_param('twofactor')
1361         if tfa is not None:
1362             return tfa
1363
1364         return getpass.getpass('Type %s and press [Return]: ' % note)
1365
1366     # Helper functions for extracting OpenGraph info
1367     @staticmethod
1368     def _og_regexes(prop):
1369         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1370         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1371                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1372         template = r'<meta[^>]+?%s[^>]+?%s'
1373         return [
1374             template % (property_re, content_re),
1375             template % (content_re, property_re),
1376         ]
1377
1378     @staticmethod
1379     def _meta_regex(prop):
1380         return r'''(?isx)<meta
1381                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1382                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1383
1384     def _og_search_property(self, prop, html, name=None, **kargs):
1385         prop = variadic(prop)
1386         if name is None:
1387             name = 'OpenGraph %s' % prop[0]
1388         og_regexes = []
1389         for p in prop:
1390             og_regexes.extend(self._og_regexes(p))
1391         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1392         if escaped is None:
1393             return None
1394         return unescapeHTML(escaped)
1395
1396     def _og_search_thumbnail(self, html, **kargs):
1397         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1398
1399     def _og_search_description(self, html, **kargs):
1400         return self._og_search_property('description', html, fatal=False, **kargs)
1401
1402     def _og_search_title(self, html, *, fatal=False, **kargs):
1403         return self._og_search_property('title', html, fatal=fatal, **kargs)
1404
1405     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1406         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1407         if secure:
1408             regexes = self._og_regexes('video:secure_url') + regexes
1409         return self._html_search_regex(regexes, html, name, **kargs)
1410
1411     def _og_search_url(self, html, **kargs):
1412         return self._og_search_property('url', html, **kargs)
1413
1414     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1415         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1416
1417     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1418         name = variadic(name)
1419         if display_name is None:
1420             display_name = name[0]
1421         return self._html_search_regex(
1422             [self._meta_regex(n) for n in name],
1423             html, display_name, fatal=fatal, group='content', **kwargs)
1424
1425     def _dc_search_uploader(self, html):
1426         return self._html_search_meta('dc.creator', html, 'uploader')
1427
1428     @staticmethod
1429     def _rta_search(html):
1430         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1431         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1432                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1433                      html):
1434             return 18
1435
1436         # And then there are the jokers who advertise that they use RTA, but actually don't.
1437         AGE_LIMIT_MARKERS = [
1438             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1439             r'>[^<]*you acknowledge you are at least (\d+) years old',
1440             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1441         ]
1442
1443         age_limit = 0
1444         for marker in AGE_LIMIT_MARKERS:
1445             mobj = re.search(marker, html)
1446             if mobj:
1447                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1448         return age_limit
1449
1450     def _media_rating_search(self, html):
1451         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1452         rating = self._html_search_meta('rating', html)
1453
1454         if not rating:
1455             return None
1456
1457         RATING_TABLE = {
1458             'safe for kids': 0,
1459             'general': 8,
1460             '14 years': 14,
1461             'mature': 17,
1462             'restricted': 19,
1463         }
1464         return RATING_TABLE.get(rating.lower())
1465
1466     def _family_friendly_search(self, html):
1467         # See http://schema.org/VideoObject
1468         family_friendly = self._html_search_meta(
1469             'isFamilyFriendly', html, default=None)
1470
1471         if not family_friendly:
1472             return None
1473
1474         RATING_TABLE = {
1475             '1': 0,
1476             'true': 0,
1477             '0': 18,
1478             'false': 18,
1479         }
1480         return RATING_TABLE.get(family_friendly.lower())
1481
1482     def _twitter_search_player(self, html):
1483         return self._html_search_meta('twitter:player', html,
1484                                       'twitter card player')
1485
1486     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1487         """Yield all json ld objects in the html"""
1488         if default is not NO_DEFAULT:
1489             fatal = False
1490         for mobj in re.finditer(JSON_LD_RE, html):
1491             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1492             for json_ld in variadic(json_ld_item):
1493                 if isinstance(json_ld, dict):
1494                     yield json_ld
1495
1496     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1497         """Search for a video in any json ld in the html"""
1498         if default is not NO_DEFAULT:
1499             fatal = False
1500         info = self._json_ld(
1501             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1502             video_id, fatal=fatal, expected_type=expected_type)
1503         if info:
1504             return info
1505         if default is not NO_DEFAULT:
1506             return default
1507         elif fatal:
1508             raise RegexNotFoundError('Unable to extract JSON-LD')
1509         else:
1510             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1511             return {}
1512
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Extract an info dict from JSON-LD data.

        @param json_ld        A JSON string (parsed with _parse_json) or already
                              parsed JSON-LD: a dict or a collection of them
        @param video_id       Passed to _parse_json when json_ld is a string
        @param fatal          Passed to _parse_json when json_ld is a string
        @param expected_type  If given, entries whose '@type' does not include
                              it are skipped, and traversal stops after the
                              first entry that is processed
        @returns              The collected info passed through filter_dict;
                              {} if json_ld is empty
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator: the nested helpers below all write into this dict
        info = {}

        # Last path component of an interactionType => prefix of the '<kind>_count' info key
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            """Whether any of expected_types is found in the '@type' of e"""
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            """Return the interactionType of e (or its '@type' when it is a dict) via str_or_none"""
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            """Fill info['<kind>_count'] from the InteractionCounter entries of e"""
            interaction_statistic = e.get('interactionStatistic')
            # A single counter may be given as a bare dict instead of a list
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up in the map
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # The first value found for a count wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            """Fill info['chapters'] from the 'Clip' parts listed in the hasPart of e"""
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Fill missing times from the neighbouring chapters.
            # zip() stops at its shortest argument (chapters[1:]),
            # so the final chapter is not visited by this loop
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # A final chapter without end time ends at the duration stored in info
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            """Update info from the video/audio object e, including its counts and chapters"""
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            # Must run after the update above: the chapter extraction reads info['duration']
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            """Walk the JSON-LD entries and update info from every supported type"""
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top level entries are only considered if they have a '@context'
                if at_top_level and '@context' not in e:
                    continue
                # An entry consisting of just '@context' and '@graph' is a container: descend into it
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name is only a fallback for the title
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Use the first element of 'video', else of 'subjectOf', when it is a VideoObject
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without expected_type keep merging further entries; with it, stop here
                    if expected_type is None:
                        continue
                    else:
                        break
                # Entries of the other types may still embed a VideoObject under 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                # Same rule as above: stop at the first processed entry only if expected_type is given
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1682
1683     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1684         return self._parse_json(
1685             self._search_regex(
1686                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1687                 webpage, 'next.js data', fatal=fatal, **kw),
1688             video_id, transform_source=transform_source, fatal=fatal)
1689
1690     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1691         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1692         rectx = re.escape(context_name)
1693         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1694         js, arg_keys, arg_vals = self._search_regex(
1695             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1696             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1697             default=NO_DEFAULT if fatal else (None, None, None))
1698         if js is None:
1699             return {}
1700
1701         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1702             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1703
1704         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1705         return traverse_obj(ret, traverse) or {}
1706
1707     @staticmethod
1708     def _hidden_inputs(html):
1709         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1710         hidden_inputs = {}
1711         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1712             attrs = extract_attributes(input)
1713             if not input:
1714                 continue
1715             if attrs.get('type') not in ('hidden', 'submit'):
1716                 continue
1717             name = attrs.get('name') or attrs.get('id')
1718             value = attrs.get('value')
1719             if name and value is not None:
1720                 hidden_inputs[name] = value
1721         return hidden_inputs
1722
1723     def _form_hidden_inputs(self, form_id, html):
1724         form = self._search_regex(
1725             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1726             html, '%s form' % form_id, group='form')
1727         return self._hidden_inputs(form)
1728
1729     @classproperty(cache=True)
1730     def FormatSort(cls):
1731         class FormatSort(FormatSorter):
1732             def __init__(ie, *args, **kwargs):
1733                 super().__init__(ie._downloader, *args, **kwargs)
1734
1735         deprecation_warning(
1736             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1737             'Use yt_dlp.utils.FormatSorter instead')
1738         return FormatSort
1739
1740     def _sort_formats(self, formats, field_preference=[]):
1741         if not field_preference:
1742             self._downloader.deprecation_warning(
1743                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1744             return
1745         self._downloader.deprecation_warning(
1746             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1747             'Return _format_sort_fields in the info_dict instead')
1748         if formats:
1749             formats[0]['__sort_fields'] = field_preference
1750
1751     def _check_formats(self, formats, video_id):
1752         if formats:
1753             formats[:] = filter(
1754                 lambda f: self._is_valid_url(
1755                     f['url'], video_id,
1756                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1757                 formats)
1758
1759     @staticmethod
1760     def _remove_duplicate_formats(formats):
1761         format_urls = set()
1762         unique_formats = []
1763         for f in formats:
1764             if f['url'] not in format_urls:
1765                 format_urls.add(f['url'])
1766                 unique_formats.append(f)
1767         formats[:] = unique_formats
1768
1769     def _is_valid_url(self, url, video_id, item='video', headers={}):
1770         url = self._proto_relative_url(url, scheme='http:')
1771         # For now assume non HTTP(S) URLs always valid
1772         if not (url.startswith('http://') or url.startswith('https://')):
1773             return True
1774         try:
1775             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1776             return True
1777         except ExtractorError as e:
1778             self.to_screen(
1779                 '%s: %s URL is invalid, skipping: %s'
1780                 % (video_id, item, error_to_compat_str(e.cause)))
1781             return False
1782
1783     def http_scheme(self):
1784         """ Either "http:" or "https:", depending on the user's preferences """
1785         return (
1786             'http:'
1787             if self.get_param('prefer_insecure', False)
1788             else 'https:')
1789
1790     def _proto_relative_url(self, url, scheme=None):
1791         scheme = scheme or self.http_scheme()
1792         assert scheme.endswith(':')
1793         return sanitize_url(url, scheme=scheme[:-1])
1794
1795     def _sleep(self, timeout, video_id, msg_template=None):
1796         if msg_template is None:
1797             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1798         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1799         self.to_screen(msg)
1800         time.sleep(timeout)
1801
1802     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1803                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1804                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1805         if self.get_param('ignore_no_formats_error'):
1806             fatal = False
1807
1808         res = self._download_xml_handle(
1809             manifest_url, video_id, 'Downloading f4m manifest',
1810             'Unable to download f4m manifest',
1811             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1812             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1813             transform_source=transform_source,
1814             fatal=fatal, data=data, headers=headers, query=query)
1815         if res is False:
1816             return []
1817
1818         manifest, urlh = res
1819         manifest_url = urlh.url
1820
1821         return self._parse_f4m_formats(
1822             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1823             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1824
1825     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1826                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1827                            fatal=True, m3u8_id=None):
1828         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1829             return []
1830
1831         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1832         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1833         if akamai_pv is not None and ';' in akamai_pv.text:
1834             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1835             if playerVerificationChallenge.strip() != '':
1836                 return []
1837
1838         formats = []
1839         manifest_version = '1.0'
1840         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1841         if not media_nodes:
1842             manifest_version = '2.0'
1843             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1844         # Remove unsupported DRM protected media from final formats
1845         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1846         media_nodes = remove_encrypted_media(media_nodes)
1847         if not media_nodes:
1848             return formats
1849
1850         manifest_base_url = get_base_url(manifest)
1851
1852         bootstrap_info = xpath_element(
1853             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1854             'bootstrap info', default=None)
1855
1856         vcodec = None
1857         mime_type = xpath_text(
1858             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1859             'base URL', default=None)
1860         if mime_type and mime_type.startswith('audio/'):
1861             vcodec = 'none'
1862
1863         for i, media_el in enumerate(media_nodes):
1864             tbr = int_or_none(media_el.attrib.get('bitrate'))
1865             width = int_or_none(media_el.attrib.get('width'))
1866             height = int_or_none(media_el.attrib.get('height'))
1867             format_id = join_nonempty(f4m_id, tbr or i)
1868             # If <bootstrapInfo> is present, the specified f4m is a
1869             # stream-level manifest, and only set-level manifests may refer to
1870             # external resources.  See section 11.4 and section 4 of F4M spec
1871             if bootstrap_info is None:
1872                 media_url = None
1873                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1874                 if manifest_version == '2.0':
1875                     media_url = media_el.attrib.get('href')
1876                 if media_url is None:
1877                     media_url = media_el.attrib.get('url')
1878                 if not media_url:
1879                     continue
1880                 manifest_url = (
1881                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1882                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1883                 # If media_url is itself a f4m manifest do the recursive extraction
1884                 # since bitrates in parent manifest (this one) and media_url manifest
1885                 # may differ leading to inability to resolve the format by requested
1886                 # bitrate in f4m downloader
1887                 ext = determine_ext(manifest_url)
1888                 if ext == 'f4m':
1889                     f4m_formats = self._extract_f4m_formats(
1890                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1891                         transform_source=transform_source, fatal=fatal)
1892                     # Sometimes stream-level manifest contains single media entry that
1893                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1894                     # At the same time parent's media entry in set-level manifest may
1895                     # contain it. We will copy it from parent in such cases.
1896                     if len(f4m_formats) == 1:
1897                         f = f4m_formats[0]
1898                         f.update({
1899                             'tbr': f.get('tbr') or tbr,
1900                             'width': f.get('width') or width,
1901                             'height': f.get('height') or height,
1902                             'format_id': f.get('format_id') if not tbr else format_id,
1903                             'vcodec': vcodec,
1904                         })
1905                     formats.extend(f4m_formats)
1906                     continue
1907                 elif ext == 'm3u8':
1908                     formats.extend(self._extract_m3u8_formats(
1909                         manifest_url, video_id, 'mp4', preference=preference,
1910                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1911                     continue
1912             formats.append({
1913                 'format_id': format_id,
1914                 'url': manifest_url,
1915                 'manifest_url': manifest_url,
1916                 'ext': 'flv' if bootstrap_info is not None else None,
1917                 'protocol': 'f4m',
1918                 'tbr': tbr,
1919                 'width': width,
1920                 'height': height,
1921                 'vcodec': vcodec,
1922                 'preference': preference,
1923                 'quality': quality,
1924             })
1925         return formats
1926
1927     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1928         return {
1929             'format_id': join_nonempty(m3u8_id, 'meta'),
1930             'url': m3u8_url,
1931             'ext': ext,
1932             'protocol': 'm3u8',
1933             'preference': preference - 100 if preference else -100,
1934             'quality': quality,
1935             'resolution': 'multiple',
1936             'format_note': 'Quality selection URL',
1937         }
1938
1939     def _report_ignoring_subs(self, name):
1940         self.report_warning(bug_reports_message(
1941             f'Ignoring subtitle tracks found in the {name} manifest; '
1942             'if any subtitle tracks are missing,'
1943         ), only_once=True)
1944
1945     def _extract_m3u8_formats(self, *args, **kwargs):
1946         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1947         if subs:
1948             self._report_ignoring_subs('HLS')
1949         return fmts
1950
1951     def _extract_m3u8_formats_and_subtitles(
1952             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1953             preference=None, quality=None, m3u8_id=None, note=None,
1954             errnote=None, fatal=True, live=False, data=None, headers={},
1955             query={}):
1956
1957         if self.get_param('ignore_no_formats_error'):
1958             fatal = False
1959
1960         if not m3u8_url:
1961             if errnote is not False:
1962                 errnote = errnote or 'Failed to obtain m3u8 URL'
1963                 if fatal:
1964                     raise ExtractorError(errnote, video_id=video_id)
1965                 self.report_warning(f'{errnote}{bug_reports_message()}')
1966             return [], {}
1967
1968         res = self._download_webpage_handle(
1969             m3u8_url, video_id,
1970             note='Downloading m3u8 information' if note is None else note,
1971             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1972             fatal=fatal, data=data, headers=headers, query=query)
1973
1974         if res is False:
1975             return [], {}
1976
1977         m3u8_doc, urlh = res
1978         m3u8_url = urlh.url
1979
1980         return self._parse_m3u8_formats_and_subtitles(
1981             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1982             preference=preference, quality=quality, m3u8_id=m3u8_id,
1983             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1984             headers=headers, query=query, video_id=video_id)
1985
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """
        Build format and subtitle entries from the text of an HLS (m3u8) playlist.

        m3u8_doc       -- playlist text
        m3u8_url       -- URL of the playlist; relative URIs are joined against it
                          and it is stored as 'manifest_url'. When it is missing,
                          a media playlist is embedded as a data URI instead
        ext, entry_protocol, preference, quality
                       -- copied into the produced formats; for variant streams
                          an empty ext falls back to 'm4a' (vcodec 'none') or 'mp4'
        m3u8_id        -- prefix used for the generated format_id
        live           -- when true, the stream name / bitrate component is left
                          out of the format_id of variant streams
        fatal, data, headers, video_id
                       -- only used to download referenced playlists when the
                          'hls_split_discontinuity' param is enabled
        note, errnote, query
                       -- accepted but not used in this method body

        Returns a tuple (formats, subtitles): a list of format dicts and a dict
        mapping a language code to a list of subtitle dicts.
        """
        formats, subtitles = [], {}
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are kept, anything else is joined against m3u8_url
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # One index per section delimited by #EXT-X-DISCONTINUITY lines;
                # the playlist is downloaded first when its text is not supplied
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # Without splitting, every playlist yields a single format with no index
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts belonging to that group
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset once the
        # URI line following it has been processed
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Handles one EXT-X-MEDIA line: records it in `groups`, adds subtitle
            # entries for SUBTITLES and format entries for VIDEO/AUDIO with a URI
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        # Second pass: every non-comment, non-blank line is treated as the URI of
        # a variant stream described by the preceding EXT-X-STREAM-INF tag
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Duplicate the HLS entry as a plain HTTP format using the progressive URL
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2204
2205     def _extract_m3u8_vod_duration(
2206             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2207
2208         m3u8_vod = self._download_webpage(
2209             m3u8_vod_url, video_id,
2210             note='Downloading m3u8 VOD manifest' if note is None else note,
2211             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2212             fatal=False, data=data, headers=headers, query=query)
2213
2214         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2215
2216     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2217         if '#EXT-X-ENDLIST' not in m3u8_vod:
2218             return None
2219
2220         return int(sum(
2221             float(line[len('#EXTINF:'):].split(',')[0])
2222             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2223
2224     def _extract_mpd_vod_duration(
2225             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2226
2227         mpd_doc = self._download_xml(
2228             mpd_url, video_id,
2229             note='Downloading MPD VOD manifest' if note is None else note,
2230             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2231             fatal=False, data=data, headers=headers, query=query)
2232         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2233             return None
2234         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2235
2236     @staticmethod
2237     def _xpath_ns(path, namespace=None):
2238         if not namespace:
2239             return path
2240         out = []
2241         for c in path.split('/'):
2242             if not c or c == '.':
2243                 out.append(c)
2244             else:
2245                 out.append('{%s}%s' % (namespace, c))
2246         return '/'.join(out)
2247
2248     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2249         if self.get_param('ignore_no_formats_error'):
2250             fatal = False
2251
2252         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2253         if res is False:
2254             assert not fatal
2255             return [], {}
2256         smil, urlh = res
2257
2258         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2259                                                       namespace=self._parse_smil_namespace(smil))
2260
2261     def _extract_smil_formats(self, *args, **kwargs):
2262         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2263         if subs:
2264             self._report_ignoring_subs('SMIL')
2265         return fmts
2266
2267     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2268         res = self._download_smil(smil_url, video_id, fatal=fatal)
2269         if res is False:
2270             return {}
2271
2272         smil, urlh = res
2273         smil_url = urlh.url
2274
2275         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2276
2277     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2278         return self._download_xml_handle(
2279             smil_url, video_id, 'Downloading SMIL file',
2280             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2281
2282     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2283         namespace = self._parse_smil_namespace(smil)
2284
2285         formats, subtitles = self._parse_smil_formats_and_subtitles(
2286             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2287
2288         video_id = os.path.splitext(url_basename(smil_url))[0]
2289         title = None
2290         description = None
2291         upload_date = None
2292         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2293             name = meta.attrib.get('name')
2294             content = meta.attrib.get('content')
2295             if not name or not content:
2296                 continue
2297             if not title and name == 'title':
2298                 title = content
2299             elif not description and name in ('description', 'abstract'):
2300                 description = content
2301             elif not upload_date and name == 'date':
2302                 upload_date = unified_strdate(content)
2303
2304         thumbnails = [{
2305             'id': image.get('type'),
2306             'url': image.get('src'),
2307             'width': int_or_none(image.get('width')),
2308             'height': int_or_none(image.get('height')),
2309         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2310
2311         return {
2312             'id': video_id,
2313             'title': title or video_id,
2314             'description': description,
2315             'upload_date': upload_date,
2316             'thumbnails': thumbnails,
2317             'formats': formats,
2318             'subtitles': subtitles,
2319         }
2320
2321     def _parse_smil_namespace(self, smil):
2322         return self._search_regex(
2323             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2324
2325     def _parse_smil_formats(self, *args, **kwargs):
2326         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2327         if subs:
2328             self._report_ignoring_subs('SMIL')
2329         return fmts
2330
2331     def _parse_smil_formats_and_subtitles(
2332             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2333         base = smil_url
2334         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2335             b = meta.get('base') or meta.get('httpBase')
2336             if b:
2337                 base = b
2338                 break
2339
2340         formats, subtitles = [], {}
2341         rtmp_count = 0
2342         http_count = 0
2343         m3u8_count = 0
2344         imgs_count = 0
2345
2346         srcs = set()
2347         media = itertools.chain.from_iterable(
2348             smil.findall(self._xpath_ns(arg, namespace))
2349             for arg in ['.//video', './/audio', './/media'])
2350         for medium in media:
2351             src = medium.get('src')
2352             if not src or src in srcs:
2353                 continue
2354             srcs.add(src)
2355
2356             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2357             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2358             width = int_or_none(medium.get('width'))
2359             height = int_or_none(medium.get('height'))
2360             proto = medium.get('proto')
2361             ext = medium.get('ext')
2362             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2363                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2364             streamer = medium.get('streamer') or base
2365
2366             if proto == 'rtmp' or streamer.startswith('rtmp'):
2367                 rtmp_count += 1
2368                 formats.append({
2369                     'url': streamer,
2370                     'play_path': src,
2371                     'ext': 'flv',
2372                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2373                     'tbr': bitrate,
2374                     'filesize': filesize,
2375                     'width': width,
2376                     'height': height,
2377                 })
2378                 if transform_rtmp_url:
2379                     streamer, src = transform_rtmp_url(streamer, src)
2380                     formats[-1].update({
2381                         'url': streamer,
2382                         'play_path': src,
2383                     })
2384                 continue
2385
2386             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2387             src_url = src_url.strip()
2388
2389             if proto == 'm3u8' or src_ext == 'm3u8':
2390                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2391                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2392                 self._merge_subtitles(m3u8_subs, target=subtitles)
2393                 if len(m3u8_formats) == 1:
2394                     m3u8_count += 1
2395                     m3u8_formats[0].update({
2396                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2397                         'tbr': bitrate,
2398                         'width': width,
2399                         'height': height,
2400                     })
2401                 formats.extend(m3u8_formats)
2402             elif src_ext == 'f4m':
2403                 f4m_url = src_url
2404                 if not f4m_params:
2405                     f4m_params = {
2406                         'hdcore': '3.2.0',
2407                         'plugin': 'flowplayer-3.2.0.1',
2408                     }
2409                 f4m_url += '&' if '?' in f4m_url else '?'
2410                 f4m_url += urllib.parse.urlencode(f4m_params)
2411                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2412             elif src_ext == 'mpd':
2413                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2414                     src_url, video_id, mpd_id='dash', fatal=False)
2415                 formats.extend(mpd_formats)
2416                 self._merge_subtitles(mpd_subs, target=subtitles)
2417             elif re.search(r'\.ism/[Mm]anifest', src_url):
2418                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2419                     src_url, video_id, ism_id='mss', fatal=False)
2420                 formats.extend(ism_formats)
2421                 self._merge_subtitles(ism_subs, target=subtitles)
2422             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2423                 http_count += 1
2424                 formats.append({
2425                     'url': src_url,
2426                     'ext': ext or src_ext or 'flv',
2427                     'format_id': 'http-%d' % (bitrate or http_count),
2428                     'tbr': bitrate,
2429                     'filesize': filesize,
2430                     'width': width,
2431                     'height': height,
2432                 })
2433
2434         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2435             src = medium.get('src')
2436             if not src or src in srcs:
2437                 continue
2438             srcs.add(src)
2439
2440             imgs_count += 1
2441             formats.append({
2442                 'format_id': 'imagestream-%d' % (imgs_count),
2443                 'url': src,
2444                 'ext': mimetype2ext(medium.get('type')),
2445                 'acodec': 'none',
2446                 'vcodec': 'none',
2447                 'width': int_or_none(medium.get('width')),
2448                 'height': int_or_none(medium.get('height')),
2449                 'format_note': 'SMIL storyboards',
2450             })
2451
2452         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2453         self._merge_subtitles(smil_subs, target=subtitles)
2454
2455         return formats, subtitles
2456
2457     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2458         urls = []
2459         subtitles = {}
2460         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2461             src = textstream.get('src')
2462             if not src or src in urls:
2463                 continue
2464             urls.append(src)
2465             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2466             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2467             subtitles.setdefault(lang, []).append({
2468                 'url': src,
2469                 'ext': ext,
2470             })
2471         return subtitles
2472
2473     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2474         res = self._download_xml_handle(
2475             xspf_url, playlist_id, 'Downloading xpsf playlist',
2476             'Unable to download xspf manifest', fatal=fatal)
2477         if res is False:
2478             return []
2479
2480         xspf, urlh = res
2481         xspf_url = urlh.url
2482
2483         return self._parse_xspf(
2484             xspf, playlist_id, xspf_url=xspf_url,
2485             xspf_base_url=base_url(xspf_url))
2486
2487     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2488         NS_MAP = {
2489             'xspf': 'http://xspf.org/ns/0/',
2490             's1': 'http://static.streamone.nl/player/ns/0',
2491         }
2492
2493         entries = []
2494         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2495             title = xpath_text(
2496                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2497             description = xpath_text(
2498                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2499             thumbnail = xpath_text(
2500                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2501             duration = float_or_none(
2502                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2503
2504             formats = []
2505             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2506                 format_url = urljoin(xspf_base_url, location.text)
2507                 if not format_url:
2508                     continue
2509                 formats.append({
2510                     'url': format_url,
2511                     'manifest_url': xspf_url,
2512                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2513                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2514                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2515                 })
2516
2517             entries.append({
2518                 'id': playlist_id,
2519                 'title': title,
2520                 'description': description,
2521                 'thumbnail': thumbnail,
2522                 'duration': duration,
2523                 'formats': formats,
2524             })
2525         return entries
2526
2527     def _extract_mpd_formats(self, *args, **kwargs):
2528         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2529         if subs:
2530             self._report_ignoring_subs('DASH')
2531         return fmts
2532
2533     def _extract_mpd_formats_and_subtitles(
2534             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2535             fatal=True, data=None, headers={}, query={}):
2536
2537         if self.get_param('ignore_no_formats_error'):
2538             fatal = False
2539
2540         res = self._download_xml_handle(
2541             mpd_url, video_id,
2542             note='Downloading MPD manifest' if note is None else note,
2543             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2544             fatal=fatal, data=data, headers=headers, query=query)
2545         if res is False:
2546             return [], {}
2547         mpd_doc, urlh = res
2548         if mpd_doc is None:
2549             return [], {}
2550
2551         # We could have been redirected to a new url when we retrieved our mpd file.
2552         mpd_url = urlh.url
2553         mpd_base_url = base_url(mpd_url)
2554
2555         return self._parse_mpd_formats_and_subtitles(
2556             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2557
2558     def _parse_mpd_formats(self, *args, **kwargs):
2559         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2560         if subs:
2561             self._report_ignoring_subs('DASH')
2562         return fmts
2563
2564     def _parse_mpd_formats_and_subtitles(
2565             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2566         """
2567         Parse formats from MPD manifest.
2568         References:
2569          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2570             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2571          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2572         """
2573         if not self.get_param('dynamic_mpd', True):
2574             if mpd_doc.get('type') == 'dynamic':
2575                 return [], {}
2576
2577         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2578
2579         def _add_ns(path):
2580             return self._xpath_ns(path, namespace)
2581
2582         def is_drm_protected(element):
2583             return element.find(_add_ns('ContentProtection')) is not None
2584
2585         def extract_multisegment_info(element, ms_parent_info):
2586             ms_info = ms_parent_info.copy()
2587
2588             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2589             # common attributes and elements.  We will only extract relevant
2590             # for us.
2591             def extract_common(source):
2592                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2593                 if segment_timeline is not None:
2594                     s_e = segment_timeline.findall(_add_ns('S'))
2595                     if s_e:
2596                         ms_info['total_number'] = 0
2597                         ms_info['s'] = []
2598                         for s in s_e:
2599                             r = int(s.get('r', 0))
2600                             ms_info['total_number'] += 1 + r
2601                             ms_info['s'].append({
2602                                 't': int(s.get('t', 0)),
2603                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2604                                 'd': int(s.attrib['d']),
2605                                 'r': r,
2606                             })
2607                 start_number = source.get('startNumber')
2608                 if start_number:
2609                     ms_info['start_number'] = int(start_number)
2610                 timescale = source.get('timescale')
2611                 if timescale:
2612                     ms_info['timescale'] = int(timescale)
2613                 segment_duration = source.get('duration')
2614                 if segment_duration:
2615                     ms_info['segment_duration'] = float(segment_duration)
2616
2617             def extract_Initialization(source):
2618                 initialization = source.find(_add_ns('Initialization'))
2619                 if initialization is not None:
2620                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2621
2622             segment_list = element.find(_add_ns('SegmentList'))
2623             if segment_list is not None:
2624                 extract_common(segment_list)
2625                 extract_Initialization(segment_list)
2626                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2627                 if segment_urls_e:
2628                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2629             else:
2630                 segment_template = element.find(_add_ns('SegmentTemplate'))
2631                 if segment_template is not None:
2632                     extract_common(segment_template)
2633                     media = segment_template.get('media')
2634                     if media:
2635                         ms_info['media'] = media
2636                     initialization = segment_template.get('initialization')
2637                     if initialization:
2638                         ms_info['initialization'] = initialization
2639                     else:
2640                         extract_Initialization(segment_template)
2641             return ms_info
2642
2643         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2644         formats, subtitles = [], {}
2645         stream_numbers = collections.defaultdict(int)
2646         for period in mpd_doc.findall(_add_ns('Period')):
2647             period_duration = parse_duration(period.get('duration')) or mpd_duration
2648             period_ms_info = extract_multisegment_info(period, {
2649                 'start_number': 1,
2650                 'timescale': 1,
2651             })
2652             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2653                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2654                 for representation in adaptation_set.findall(_add_ns('Representation')):
2655                     representation_attrib = adaptation_set.attrib.copy()
2656                     representation_attrib.update(representation.attrib)
2657                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2658                     mime_type = representation_attrib['mimeType']
2659                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2660
2661                     codec_str = representation_attrib.get('codecs', '')
2662                     # Some kind of binary subtitle found in some youtube livestreams
2663                     if mime_type == 'application/x-rawcc':
2664                         codecs = {'scodec': codec_str}
2665                     else:
2666                         codecs = parse_codecs(codec_str)
2667                     if content_type not in ('video', 'audio', 'text'):
2668                         if mime_type == 'image/jpeg':
2669                             content_type = mime_type
2670                         elif codecs.get('vcodec', 'none') != 'none':
2671                             content_type = 'video'
2672                         elif codecs.get('acodec', 'none') != 'none':
2673                             content_type = 'audio'
2674                         elif codecs.get('scodec', 'none') != 'none':
2675                             content_type = 'text'
2676                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2677                             content_type = 'text'
2678                         else:
2679                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2680                             continue
2681
2682                     base_url = ''
2683                     for element in (representation, adaptation_set, period, mpd_doc):
2684                         base_url_e = element.find(_add_ns('BaseURL'))
2685                         if try_call(lambda: base_url_e.text) is not None:
2686                             base_url = base_url_e.text + base_url
2687                             if re.match(r'^https?://', base_url):
2688                                 break
2689                     if mpd_base_url and base_url.startswith('/'):
2690                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2691                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2692                         if not mpd_base_url.endswith('/'):
2693                             mpd_base_url += '/'
2694                         base_url = mpd_base_url + base_url
2695                     representation_id = representation_attrib.get('id')
2696                     lang = representation_attrib.get('lang')
2697                     url_el = representation.find(_add_ns('BaseURL'))
2698                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2699                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2700                     if representation_id is not None:
2701                         format_id = representation_id
2702                     else:
2703                         format_id = content_type
2704                     if mpd_id:
2705                         format_id = mpd_id + '-' + format_id
2706                     if content_type in ('video', 'audio'):
2707                         f = {
2708                             'format_id': format_id,
2709                             'manifest_url': mpd_url,
2710                             'ext': mimetype2ext(mime_type),
2711                             'width': int_or_none(representation_attrib.get('width')),
2712                             'height': int_or_none(representation_attrib.get('height')),
2713                             'tbr': float_or_none(bandwidth, 1000),
2714                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2715                             'fps': int_or_none(representation_attrib.get('frameRate')),
2716                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2717                             'format_note': 'DASH %s' % content_type,
2718                             'filesize': filesize,
2719                             'container': mimetype2ext(mime_type) + '_dash',
2720                             **codecs
2721                         }
2722                     elif content_type == 'text':
2723                         f = {
2724                             'ext': mimetype2ext(mime_type),
2725                             'manifest_url': mpd_url,
2726                             'filesize': filesize,
2727                         }
2728                     elif content_type == 'image/jpeg':
2729                         # See test case in VikiIE
2730                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2731                         f = {
2732                             'format_id': format_id,
2733                             'ext': 'mhtml',
2734                             'manifest_url': mpd_url,
2735                             'format_note': 'DASH storyboards (jpeg)',
2736                             'acodec': 'none',
2737                             'vcodec': 'none',
2738                         }
2739                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2740                         f['has_drm'] = True
2741                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2742
2743                     def prepare_template(template_name, identifiers):
2744                         tmpl = representation_ms_info[template_name]
2745                         if representation_id is not None:
2746                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2747                         # First of, % characters outside $...$ templates
2748                         # must be escaped by doubling for proper processing
2749                         # by % operator string formatting used further (see
2750                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2751                         t = ''
2752                         in_template = False
2753                         for c in tmpl:
2754                             t += c
2755                             if c == '$':
2756                                 in_template = not in_template
2757                             elif c == '%' and not in_template:
2758                                 t += c
2759                         # Next, $...$ templates are translated to their
2760                         # %(...) counterparts to be used with % operator
2761                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2762                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2763                         t.replace('$$', '$')
2764                         return t
2765
2766                     # @initialization is a regular template like @media one
2767                     # so it should be handled just the same way (see
2768                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2769                     if 'initialization' in representation_ms_info:
2770                         initialization_template = prepare_template(
2771                             'initialization',
2772                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2773                             # $Time$ shall not be included for @initialization thus
2774                             # only $Bandwidth$ remains
2775                             ('Bandwidth', ))
2776                         representation_ms_info['initialization_url'] = initialization_template % {
2777                             'Bandwidth': bandwidth,
2778                         }
2779
2780                     def location_key(location):
2781                         return 'url' if re.match(r'^https?://', location) else 'path'
2782
2783                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2784
2785                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2786                         media_location_key = location_key(media_template)
2787
2788                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2789                         # can't be used at the same time
2790                         if '%(Number' in media_template and 's' not in representation_ms_info:
2791                             segment_duration = None
2792                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2793                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2794                                 representation_ms_info['total_number'] = int(math.ceil(
2795                                     float_or_none(period_duration, segment_duration, default=0)))
2796                             representation_ms_info['fragments'] = [{
2797                                 media_location_key: media_template % {
2798                                     'Number': segment_number,
2799                                     'Bandwidth': bandwidth,
2800                                 },
2801                                 'duration': segment_duration,
2802                             } for segment_number in range(
2803                                 representation_ms_info['start_number'],
2804                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2805                         else:
2806                             # $Number*$ or $Time$ in media template with S list available
2807                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2808                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2809                             representation_ms_info['fragments'] = []
2810                             segment_time = 0
2811                             segment_d = None
2812                             segment_number = representation_ms_info['start_number']
2813
2814                             def add_segment_url():
2815                                 segment_url = media_template % {
2816                                     'Time': segment_time,
2817                                     'Bandwidth': bandwidth,
2818                                     'Number': segment_number,
2819                                 }
2820                                 representation_ms_info['fragments'].append({
2821                                     media_location_key: segment_url,
2822                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2823                                 })
2824
2825                             for num, s in enumerate(representation_ms_info['s']):
2826                                 segment_time = s.get('t') or segment_time
2827                                 segment_d = s['d']
2828                                 add_segment_url()
2829                                 segment_number += 1
2830                                 for r in range(s.get('r', 0)):
2831                                     segment_time += segment_d
2832                                     add_segment_url()
2833                                     segment_number += 1
2834                                 segment_time += segment_d
2835                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2836                         # No media template,
2837                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2838                         # or any YouTube dashsegments video
2839                         fragments = []
2840                         segment_index = 0
2841                         timescale = representation_ms_info['timescale']
2842                         for s in representation_ms_info['s']:
2843                             duration = float_or_none(s['d'], timescale)
2844                             for r in range(s.get('r', 0) + 1):
2845                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2846                                 fragments.append({
2847                                     location_key(segment_uri): segment_uri,
2848                                     'duration': duration,
2849                                 })
2850                                 segment_index += 1
2851                         representation_ms_info['fragments'] = fragments
2852                     elif 'segment_urls' in representation_ms_info:
2853                         # Segment URLs with no SegmentTimeline
2854                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2855                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2856                         fragments = []
2857                         segment_duration = float_or_none(
2858                             representation_ms_info['segment_duration'],
2859                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2860                         for segment_url in representation_ms_info['segment_urls']:
2861                             fragment = {
2862                                 location_key(segment_url): segment_url,
2863                             }
2864                             if segment_duration:
2865                                 fragment['duration'] = segment_duration
2866                             fragments.append(fragment)
2867                         representation_ms_info['fragments'] = fragments
2868                     # If there is a fragments key available then we correctly recognized fragmented media.
2869                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2870                     # assumption is not necessarily correct since we may simply have no support for
2871                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2872                     if 'fragments' in representation_ms_info:
2873                         f.update({
2874                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2875                             'url': mpd_url or base_url,
2876                             'fragment_base_url': base_url,
2877                             'fragments': [],
2878                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2879                         })
2880                         if 'initialization_url' in representation_ms_info:
2881                             initialization_url = representation_ms_info['initialization_url']
2882                             if not f.get('url'):
2883                                 f['url'] = initialization_url
2884                             f['fragments'].append({location_key(initialization_url): initialization_url})
2885                         f['fragments'].extend(representation_ms_info['fragments'])
2886                         if not period_duration:
2887                             period_duration = try_get(
2888                                 representation_ms_info,
2889                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2890                     else:
2891                         # Assuming direct URL to unfragmented media.
2892                         f['url'] = base_url
2893                     if content_type in ('video', 'audio', 'image/jpeg'):
2894                         f['manifest_stream_number'] = stream_numbers[f['url']]
2895                         stream_numbers[f['url']] += 1
2896                         formats.append(f)
2897                     elif content_type == 'text':
2898                         subtitles.setdefault(lang or 'und', []).append(f)
2899
2900         return formats, subtitles
2901
2902     def _extract_ism_formats(self, *args, **kwargs):
2903         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2904         if subs:
2905             self._report_ignoring_subs('ISM')
2906         return fmts
2907
2908     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2909         if self.get_param('ignore_no_formats_error'):
2910             fatal = False
2911
2912         res = self._download_xml_handle(
2913             ism_url, video_id,
2914             note='Downloading ISM manifest' if note is None else note,
2915             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2916             fatal=fatal, data=data, headers=headers, query=query)
2917         if res is False:
2918             return [], {}
2919         ism_doc, urlh = res
2920         if ism_doc is None:
2921             return [], {}
2922
2923         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2924
2925     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2926         """
2927         Parse formats from ISM manifest.
2928         References:
2929          1. [MS-SSTR]: Smooth Streaming Protocol,
2930             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2931         """
2932         if ism_doc.get('IsLive') == 'TRUE':
2933             return [], {}
2934
2935         duration = int(ism_doc.attrib['Duration'])
2936         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2937
2938         formats = []
2939         subtitles = {}
2940         for stream in ism_doc.findall('StreamIndex'):
2941             stream_type = stream.get('Type')
2942             if stream_type not in ('video', 'audio', 'text'):
2943                 continue
2944             url_pattern = stream.attrib['Url']
2945             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2946             stream_name = stream.get('Name')
2947             stream_language = stream.get('Language', 'und')
2948             for track in stream.findall('QualityLevel'):
2949                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2950                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2951                 # TODO: add support for WVC1 and WMAP
2952                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2953                     self.report_warning('%s is not a supported codec' % fourcc)
2954                     continue
2955                 tbr = int(track.attrib['Bitrate']) // 1000
2956                 # [1] does not mention Width and Height attributes. However,
2957                 # they're often present while MaxWidth and MaxHeight are
2958                 # missing, so should be used as fallbacks
2959                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2960                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2961                 sampling_rate = int_or_none(track.get('SamplingRate'))
2962
2963                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2964                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2965
2966                 fragments = []
2967                 fragment_ctx = {
2968                     'time': 0,
2969                 }
2970                 stream_fragments = stream.findall('c')
2971                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2972                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2973                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2974                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2975                     if not fragment_ctx['duration']:
2976                         try:
2977                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2978                         except IndexError:
2979                             next_fragment_time = duration
2980                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2981                     for _ in range(fragment_repeat):
2982                         fragments.append({
2983                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2984                             'duration': fragment_ctx['duration'] / stream_timescale,
2985                         })
2986                         fragment_ctx['time'] += fragment_ctx['duration']
2987
2988                 if stream_type == 'text':
2989                     subtitles.setdefault(stream_language, []).append({
2990                         'ext': 'ismt',
2991                         'protocol': 'ism',
2992                         'url': ism_url,
2993                         'manifest_url': ism_url,
2994                         'fragments': fragments,
2995                         '_download_params': {
2996                             'stream_type': stream_type,
2997                             'duration': duration,
2998                             'timescale': stream_timescale,
2999                             'fourcc': fourcc,
3000                             'language': stream_language,
3001                             'codec_private_data': track.get('CodecPrivateData'),
3002                         }
3003                     })
3004                 elif stream_type in ('video', 'audio'):
3005                     formats.append({
3006                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3007                         'url': ism_url,
3008                         'manifest_url': ism_url,
3009                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3010                         'width': width,
3011                         'height': height,
3012                         'tbr': tbr,
3013                         'asr': sampling_rate,
3014                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3015                         'acodec': 'none' if stream_type == 'video' else fourcc,
3016                         'protocol': 'ism',
3017                         'fragments': fragments,
3018                         'has_drm': ism_doc.find('Protection') is not None,
3019                         'language': stream_language,
3020                         'audio_channels': int_or_none(track.get('Channels')),
3021                         '_download_params': {
3022                             'stream_type': stream_type,
3023                             'duration': duration,
3024                             'timescale': stream_timescale,
3025                             'width': width or 0,
3026                             'height': height or 0,
3027                             'fourcc': fourcc,
3028                             'language': stream_language,
3029                             'codec_private_data': track.get('CodecPrivateData'),
3030                             'sampling_rate': sampling_rate,
3031                             'channels': int_or_none(track.get('Channels', 2)),
3032                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3033                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3034                         },
3035                     })
3036         return formats, subtitles
3037
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from HTML5-style media tags found in a webpage.

        Searches the page for <video>/<audio> tags (optionally prefixed with
        amp-, dl8- or dl8-live-) and gathers formats from the tag itself and
        from its <source> children, and subtitles from its <track> children.

        Arguments:
        base_url -- URL that relative media URLs are joined against; it is also
                    set as the Referer header of every collected format
        webpage -- HTML to search
        video_id -- passed on to the m3u8/mpd manifest extraction
        m3u8_id, m3u8_entry_protocol, preference, quality --
                    passed on to _extract_m3u8_formats for m3u8 sources
        mpd_id -- passed on to _extract_mpd_formats for mpd sources

        Returns a list of dicts with the keys 'formats', 'subtitles' and
        'thumbnail'. Tags that yield neither formats nor subtitles are omitted.
        """
        def absolute_url(item_url):
            # Resolve a possibly relative URL against the page's base URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a "type" attribute (mimetype with optional codecs parameter)
            # into a partial format dict; empty dict if nothing can be parsed
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats): manifests (m3u8/mpd) are expanded
            # into their formats, anything else becomes a single direct format
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no content, hence the empty 4th tuple element
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # The media tag itself may carry the source URL
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fall back to a resolution parsed from the labels
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the first label that yields a bitrate, if any
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # Applied last, so 'url', 'vcodec' and 'ext' from
                        # _media_formats override the values gathered above
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated like subtitles/captions
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3163
3164     def _extract_akamai_formats(self, *args, **kwargs):
3165         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3166         if subs:
3167             self._report_ignoring_subs('akamai')
3168         return fmts
3169
3170     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3171         signed = 'hdnea=' in manifest_url
3172         if not signed:
3173             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3174             manifest_url = re.sub(
3175                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3176                 '', manifest_url).strip('?')
3177
3178         formats = []
3179         subtitles = {}
3180
3181         hdcore_sign = 'hdcore=3.7.0'
3182         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3183         hds_host = hosts.get('hds')
3184         if hds_host:
3185             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3186         if 'hdcore=' not in f4m_url:
3187             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3188         f4m_formats = self._extract_f4m_formats(
3189             f4m_url, video_id, f4m_id='hds', fatal=False)
3190         for entry in f4m_formats:
3191             entry.update({'extra_param_to_segment_url': hdcore_sign})
3192         formats.extend(f4m_formats)
3193
3194         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3195         hls_host = hosts.get('hls')
3196         if hls_host:
3197             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3198         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3199             m3u8_url, video_id, 'mp4', 'm3u8_native',
3200             m3u8_id='hls', fatal=False)
3201         formats.extend(m3u8_formats)
3202         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3203
3204         http_host = hosts.get('http')
3205         if http_host and m3u8_formats and not signed:
3206             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3207             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3208             qualities_length = len(qualities)
3209             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3210                 i = 0
3211                 for f in m3u8_formats:
3212                     if f['vcodec'] != 'none':
3213                         for protocol in ('http', 'https'):
3214                             http_f = f.copy()
3215                             del http_f['manifest_url']
3216                             http_url = re.sub(
3217                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3218                             http_f.update({
3219                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3220                                 'url': http_url,
3221                                 'protocol': protocol,
3222                             })
3223                             formats.append(http_f)
3224                         i += 1
3225
3226         return formats, subtitles
3227
3228     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3229         query = urllib.parse.urlparse(url).query
3230         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3231         mobj = re.search(
3232             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3233         url_base = mobj.group('url')
3234         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3235         formats = []
3236
3237         def manifest_url(manifest):
3238             m_url = f'{http_base_url}/{manifest}'
3239             if query:
3240                 m_url += '?%s' % query
3241             return m_url
3242
3243         if 'm3u8' not in skip_protocols:
3244             formats.extend(self._extract_m3u8_formats(
3245                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3246                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3247         if 'f4m' not in skip_protocols:
3248             formats.extend(self._extract_f4m_formats(
3249                 manifest_url('manifest.f4m'),
3250                 video_id, f4m_id='hds', fatal=False))
3251         if 'dash' not in skip_protocols:
3252             formats.extend(self._extract_mpd_formats(
3253                 manifest_url('manifest.mpd'),
3254                 video_id, mpd_id='dash', fatal=False))
3255         if re.search(r'(?:/smil:|\.smil)', url_base):
3256             if 'smil' not in skip_protocols:
3257                 rtmp_formats = self._extract_smil_formats(
3258                     manifest_url('jwplayer.smil'),
3259                     video_id, fatal=False)
3260                 for rtmp_format in rtmp_formats:
3261                     rtsp_format = rtmp_format.copy()
3262                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3263                     del rtsp_format['play_path']
3264                     del rtsp_format['ext']
3265                     rtsp_format.update({
3266                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3267                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3268                         'protocol': 'rtsp',
3269                     })
3270                     formats.extend([rtmp_format, rtsp_format])
3271         else:
3272             for protocol in ('rtmp', 'rtsp'):
3273                 if protocol not in skip_protocols:
3274                     formats.append({
3275                         'url': f'{protocol}:{url_base}',
3276                         'format_id': protocol,
3277                         'protocol': protocol,
3278                     })
3279         return formats
3280
3281     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3282         mobj = re.search(
3283             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3284             webpage)
3285         if mobj:
3286             try:
3287                 jwplayer_data = self._parse_json(mobj.group('options'),
3288                                                  video_id=video_id,
3289                                                  transform_source=transform_source)
3290             except ExtractorError:
3291                 pass
3292             else:
3293                 if isinstance(jwplayer_data, dict):
3294                     return jwplayer_data
3295
3296     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3297         jwplayer_data = self._find_jwplayer_data(
3298             webpage, video_id, transform_source=js_to_json)
3299         return self._parse_jwplayer_data(
3300             jwplayer_data, video_id, *args, **kwargs)
3301
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Turn JWPlayer setup options into info dict(s).

        Arguments:
        jwplayer_data -- the setup options; anything but a dict gives an empty list
        video_id -- ID used for every entry; if not given, the 'mediaid' of
                    each playlist item is used instead
        require_title -- if true, a playlist item without 'title' raises KeyError
        m3u8_id, mpd_id, rtmp_params -- passed on to _parse_jwplayer_formats
        base_url -- URL that relative source, track and image URLs are joined against

        Returns the entry itself if there is exactly one, otherwise the result
        of playlist_result() for all entries.
        NOTE: Playlist items without 'sources' are modified in place.
        """
        entries = []
        if not isinstance(jwplayer_data, dict):
            return entries

        playlist_items = jwplayer_data.get('playlist')
        # JWPlayer backward compatibility: single playlist item/flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if not isinstance(playlist_items, list):
            playlist_items = (playlist_items or jwplayer_data, )

        for video_data in playlist_items:
            if not isinstance(video_data, dict):
                continue
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, str):
                        continue
                    # Only caption/subtitle tracks are of interest here
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    # Tracks without a label are filed under 'en'
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                # A duration in the top-level options takes precedence over the item's
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
                'genre': clean_html(video_data.get('genre')),
                'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
                'season_number': int_or_none(video_data.get('season')),
                'episode_number': int_or_none(video_data.get('episode')),
                'release_year': int_or_none(video_data.get('releasedate')),
                'age_limit': int_or_none(video_data.get('age_restriction')),
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A lone YouTube URL is delegated to another extractor via url_transparent
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3376
3377     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3378                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3379         urls = set()
3380         formats = []
3381         for source in jwplayer_sources_data:
3382             if not isinstance(source, dict):
3383                 continue
3384             source_url = urljoin(
3385                 base_url, self._proto_relative_url(source.get('file')))
3386             if not source_url or source_url in urls:
3387                 continue
3388             urls.add(source_url)
3389             source_type = source.get('type') or ''
3390             ext = mimetype2ext(source_type) or determine_ext(source_url)
3391             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3392                 formats.extend(self._extract_m3u8_formats(
3393                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3394                     m3u8_id=m3u8_id, fatal=False))
3395             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3396                 formats.extend(self._extract_mpd_formats(
3397                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3398             elif ext == 'smil':
3399                 formats.extend(self._extract_smil_formats(
3400                     source_url, video_id, fatal=False))
3401             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3402             elif source_type.startswith('audio') or ext in (
3403                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3404                 formats.append({
3405                     'url': source_url,
3406                     'vcodec': 'none',
3407                     'ext': ext,
3408                 })
3409             else:
3410                 format_id = str_or_none(source.get('label'))
3411                 height = int_or_none(source.get('height'))
3412                 if height is None and format_id:
3413                     # Often no height is provided but there is a label in
3414                     # format like "1080p", "720p SD", or 1080.
3415                     height = parse_resolution(format_id).get('height')
3416                 a_format = {
3417                     'url': source_url,
3418                     'width': int_or_none(source.get('width')),
3419                     'height': height,
3420                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3421                     'filesize': int_or_none(source.get('filesize')),
3422                     'ext': ext,
3423                     'format_id': format_id
3424                 }
3425                 if source_url.startswith('rtmp'):
3426                     a_format['ext'] = 'flv'
3427                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3428                     # of jwplayer.flash.swf
3429                     rtmp_url_parts = re.split(
3430                         r'((?:mp4|mp3|flv):)', source_url, 1)
3431                     if len(rtmp_url_parts) == 3:
3432                         rtmp_url, prefix, play_path = rtmp_url_parts
3433                         a_format.update({
3434                             'url': rtmp_url,
3435                             'play_path': prefix + play_path,
3436                         })
3437                     if rtmp_params:
3438                         a_format.update(rtmp_params)
3439                 formats.append(a_format)
3440         return formats
3441
3442     def _live_title(self, name):
3443         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3444         return name
3445
3446     def _int(self, v, name, fatal=False, **kwargs):
3447         res = int_or_none(v, **kwargs)
3448         if res is None:
3449             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3450             if fatal:
3451                 raise ExtractorError(msg)
3452             else:
3453                 self.report_warning(msg)
3454         return res
3455
3456     def _float(self, v, name, fatal=False, **kwargs):
3457         res = float_or_none(v, **kwargs)
3458         if res is None:
3459             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3460             if fatal:
3461                 raise ExtractorError(msg)
3462             else:
3463                 self.report_warning(msg)
3464         return res
3465
3466     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3467                     path='/', secure=False, discard=False, rest={}, **kwargs):
3468         cookie = http.cookiejar.Cookie(
3469             0, name, value, port, port is not None, domain, True,
3470             domain.startswith('.'), path, True, secure, expire_time,
3471             discard, None, None, rest)
3472         self.cookiejar.set_cookie(cookie)
3473
3474     def _get_cookies(self, url):
3475         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3476         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3477
3478     def _apply_first_set_cookie_header(self, url_handle, cookie):
3479         """
3480         Apply first Set-Cookie header instead of the last. Experimental.
3481
3482         Some sites (e.g. [1-3]) may serve two cookies under the same name
3483         in Set-Cookie header and expect the first (old) one to be set rather
3484         than second (new). However, as of RFC6265 the newer one cookie
3485         should be set into cookie store what actually happens.
3486         We will workaround this issue by resetting the cookie to
3487         the first one manually.
3488         1. https://new.vk.com/
3489         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3490         3. https://learning.oreilly.com/
3491         """
3492         for header, cookies in url_handle.headers.items():
3493             if header.lower() != 'set-cookie':
3494                 continue
3495             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3496             cookie_value = re.search(
3497                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3498             if cookie_value:
3499                 value, domain = cookie_value.groups()
3500                 self._set_cookie(domain, cookie, value)
3501                 break
3502
3503     @classmethod
3504     def get_testcases(cls, include_onlymatching=False):
3505         # Do not look in super classes
3506         t = vars(cls).get('_TEST')
3507         if t:
3508             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3509             tests = [t]
3510         else:
3511             tests = vars(cls).get('_TESTS', [])
3512         for t in tests:
3513             if not include_onlymatching and t.get('only_matching', False):
3514                 continue
3515             t['name'] = cls.ie_key()
3516             yield t
3517         if getattr(cls, '__wrapped__', None):
3518             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3519
3520     @classmethod
3521     def get_webpage_testcases(cls):
3522         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3523         for t in tests:
3524             t['name'] = cls.ie_key()
3525             yield t
3526         if getattr(cls, '__wrapped__', None):
3527             yield from cls.__wrapped__.get_webpage_testcases()
3528
3529     @classproperty(cache=True)
3530     def age_limit(cls):
3531         """Get age limit from the testcases"""
3532         return max(traverse_obj(
3533             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3534             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3535
3536     @classproperty(cache=True)
3537     def _RETURN_TYPE(cls):
3538         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3539         tests = tuple(cls.get_testcases(include_onlymatching=False))
3540         if not tests:
3541             return None
3542         elif not any(k.startswith('playlist') for test in tests for k in test):
3543             return 'video'
3544         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3545             return 'playlist'
3546         return 'any'
3547
3548     @classmethod
3549     def is_single_video(cls, url):
3550         """Returns whether the URL is of a single video, None if unknown"""
3551         if cls.suitable(url):
3552             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3553
3554     @classmethod
3555     def is_suitable(cls, age_limit):
3556         """Test whether the extractor is generally suitable for the given age limit"""
3557         return not age_restricted(cls.age_limit, age_limit)
3558
3559     @classmethod
3560     def description(cls, *, markdown=True, search_examples=None):
3561         """Description of the extractor"""
3562         desc = ''
3563         if cls._NETRC_MACHINE:
3564             if markdown:
3565                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3566             else:
3567                 desc += f' [{cls._NETRC_MACHINE}]'
3568         if cls.IE_DESC is False:
3569             desc += ' [HIDDEN]'
3570         elif cls.IE_DESC:
3571             desc += f' {cls.IE_DESC}'
3572         if cls.SEARCH_KEY:
3573             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3574             if search_examples:
3575                 _COUNTS = ('', '5', '10', 'all')
3576                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3577         if not cls.working():
3578             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3579
3580         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3581         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3582         return f'{name}:{desc}' if desc else name
3583
3584     def extract_subtitles(self, *args, **kwargs):
3585         if (self.get_param('writesubtitles', False)
3586                 or self.get_param('listsubtitles')):
3587             return self._get_subtitles(*args, **kwargs)
3588         return {}
3589
3590     def _get_subtitles(self, *args, **kwargs):
3591         raise NotImplementedError('This method must be implemented by subclasses')
3592
    class CommentsDisabled(Exception):
        """
        Raise in _get_comments if comments are disabled for the video

        extract_comments catches this exception and reports both 'comments'
        and 'comment_count' as None.
        """
3595
3596     def extract_comments(self, *args, **kwargs):
3597         if not self.get_param('getcomments'):
3598             return None
3599         generator = self._get_comments(*args, **kwargs)
3600
3601         def extractor():
3602             comments = []
3603             interrupted = True
3604             try:
3605                 while True:
3606                     comments.append(next(generator))
3607             except StopIteration:
3608                 interrupted = False
3609             except KeyboardInterrupt:
3610                 self.to_screen('Interrupted by user')
3611             except self.CommentsDisabled:
3612                 return {'comments': None, 'comment_count': None}
3613             except Exception as e:
3614                 if self.get_param('ignoreerrors') is not True:
3615                     raise
3616                 self._downloader.report_error(e)
3617             comment_count = len(comments)
3618             self.to_screen(f'Extracted {comment_count} comments')
3619             return {
3620                 'comments': comments,
3621                 'comment_count': None if interrupted else comment_count
3622             }
3623         return extractor
3624
3625     def _get_comments(self, *args, **kwargs):
3626         raise NotImplementedError('This method must be implemented by subclasses')
3627
3628     @staticmethod
3629     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3630         """ Merge subtitle items for one language. Items with duplicated URLs/data
3631         will be dropped. """
3632         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3633         ret = list(subtitle_list1)
3634         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3635         return ret
3636
3637     @classmethod
3638     def _merge_subtitles(cls, *dicts, target=None):
3639         """ Merge subtitle dictionaries, language by language. """
3640         if target is None:
3641             target = {}
3642         for d in dicts:
3643             for lang, subs in d.items():
3644                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3645         return target
3646
3647     def extract_automatic_captions(self, *args, **kwargs):
3648         if (self.get_param('writeautomaticsub', False)
3649                 or self.get_param('listsubtitles')):
3650             return self._get_automatic_captions(*args, **kwargs)
3651         return {}
3652
3653     def _get_automatic_captions(self, *args, **kwargs):
3654         raise NotImplementedError('This method must be implemented by subclasses')
3655
3656     @functools.cached_property
3657     def _cookies_passed(self):
3658         """Whether cookies have been passed to YoutubeDL"""
3659         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3660
3661     def mark_watched(self, *args, **kwargs):
3662         if not self.get_param('mark_watched', False):
3663             return
3664         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3665             self._mark_watched(*args, **kwargs)
3666
3667     def _mark_watched(self, *args, **kwargs):
3668         raise NotImplementedError('This method must be implemented by subclasses')
3669
3670     def geo_verification_headers(self):
3671         headers = {}
3672         geo_verification_proxy = self.get_param('geo_verification_proxy')
3673         if geo_verification_proxy:
3674             headers['Ytdl-request-proxy'] = geo_verification_proxy
3675         return headers
3676
3677     @staticmethod
3678     def _generic_id(url):
3679         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3680
3681     def _generic_title(self, url='', webpage='', *, default=None):
3682         return (self._og_search_title(webpage, default=None)
3683                 or self._html_extract_title(webpage, default=None)
3684                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3685                 or default)
3686
3687     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3688         if not duration:
3689             return
3690         chapter_list = [{
3691             'start_time': start_function(chapter),
3692             'title': title_function(chapter),
3693         } for chapter in chapter_list or []]
3694         if strict:
3695             warn = self.report_warning
3696         else:
3697             warn = self.write_debug
3698             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3699
3700         chapters = [{'start_time': 0}]
3701         for idx, chapter in enumerate(chapter_list):
3702             if chapter['start_time'] is None:
3703                 warn(f'Incomplete chapter {idx}')
3704             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3705                 chapters.append(chapter)
3706             elif chapter not in chapters:
3707                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3708                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3709                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3710         return chapters[1:]
3711
3712     def _extract_chapters_from_description(self, description, duration):
3713         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3714         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3715         return self._extract_chapters_helper(
3716             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3717             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3718             duration=duration, strict=False) or self._extract_chapters_helper(
3719             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3720             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3721             duration=duration, strict=False)
3722
3723     @staticmethod
3724     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3725         all_known = all(map(
3726             lambda x: x is not None,
3727             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3728         return (
3729             'private' if is_private
3730             else 'premium_only' if needs_premium
3731             else 'subscriber_only' if needs_subscription
3732             else 'needs_auth' if needs_auth
3733             else 'unlisted' if is_unlisted
3734             else 'public' if all_known
3735             else None)
3736
3737     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3738         '''
3739         @returns            A list of values for the extractor argument given by "key"
3740                             or "default" if no such key is present
3741         @param default      The default value to return when the key is not present (default: [])
3742         @param casesense    When false, the values are converted to lower case
3743         '''
3744         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3745         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3746         if val is None:
3747             return [] if default is NO_DEFAULT else default
3748         return list(val) if casesense else [x.lower() for x in val]
3749
3750     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3751         if not playlist_id or not video_id:
3752             return not video_id
3753
3754         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3755         if no_playlist is not None:
3756             return not no_playlist
3757
3758         video_id = '' if video_id is True else f' {video_id}'
3759         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3760         if self.get_param('noplaylist'):
3761             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3762             return False
3763         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3764         return True
3765
3766     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3767         RetryManager.report_retry(
3768             err, _count or int(fatal), _retries,
3769             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3770             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3771
3772     def RetryManager(self, **kwargs):
3773         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3774
3775     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3776         display_id = traverse_obj(info_dict, 'display_id', 'id')
3777         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3778         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3779             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3780
3781     @classmethod
3782     def extract_from_webpage(cls, ydl, url, webpage):
3783         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3784               else ydl.get_info_extractor(cls.ie_key()))
3785         for info in ie._extract_from_webpage(url, webpage) or []:
3786             # url = None since we do not want to set (webpage/original)_url
3787             ydl.add_default_extra_info(info, ie, None)
3788             yield info
3789
3790     @classmethod
3791     def _extract_from_webpage(cls, url, webpage):
3792         for embed_url in orderedSet(
3793                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3794             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3795
3796     @classmethod
3797     def _extract_embed_urls(cls, url, webpage):
3798         """@returns all the embed urls on the webpage"""
3799         if '_EMBED_URL_RE' not in cls.__dict__:
3800             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3801             for idx, regex in enumerate(cls._EMBED_REGEX):
3802                 assert regex.count('(?P<url>') == 1, \
3803                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3804             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3805
3806         for regex in cls._EMBED_URL_RE:
3807             for mobj in regex.finditer(webpage):
3808                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3809                 if cls._VALID_URL is False or cls.suitable(embed_url):
3810                     yield embed_url
3811
3812     class StopExtraction(Exception):
3813         pass
3814
3815     @classmethod
3816     def _extract_url(cls, webpage):  # TODO: Remove
3817         """Only for compatibility with some older extractors"""
3818         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3819
3820     @classmethod
3821     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3822         if plugin_name:
3823             mro = inspect.getmro(cls)
3824             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3825             cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3826             cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3827             while getattr(super_class, '__wrapped__', None):
3828                 super_class = super_class.__wrapped__
3829             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3830             _PLUGIN_OVERRIDES[super_class].append(cls)
3831
3832         return super().__init_subclass__(**kwargs)
3833
3834
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|<positive integer>):{query}
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Parse the prefix and fetch the matching number of results.

        An empty prefix means 1 result and "all" means _MAX_RESULTS. A number
        above _MAX_RESULTS is reduced to it with a warning.
        """
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice needs None rather than infinity to mean "no limit"
        limit = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3878
3879
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose pattern matches any URL and whose extraction always raises UnsupportedError."""

    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        # No extraction is attempted: the URL is reported as unsupported
        raise UnsupportedError(url)
3887
3888
# Maps an overridden class to the list of plugin classes replacing it; entries
# are appended by InfoExtractor.__init_subclass__ for subclasses created with
# a `plugin_name`
_PLUGIN_OVERRIDES = collections.defaultdict(list)