yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.parse
  21 import urllib.request
  22 import xml.etree.ElementTree
  23
  24 from ..compat import functools  # isort: split
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..utils import (
  41     IDENTITY,
  42     JSON_LD_RE,
  43     NO_DEFAULT,
  44     ExtractorError,
  45     FormatSorter,
  46     GeoRestrictedError,
  47     GeoUtils,
  48     LenientJSONDecoder,
  49     Popen,
  50     RegexNotFoundError,
  51     RetryManager,
  52     UnsupportedError,
  53     age_restricted,
  54     base_url,
  55     bug_reports_message,
  56     classproperty,
  57     clean_html,
  58     deprecation_warning,
  59     determine_ext,
  60     dict_get,
  61     encode_data_uri,
  62     error_to_compat_str,
  63     extract_attributes,
  64     filter_dict,
  65     fix_xml_ampersands,
  66     float_or_none,
  67     format_field,
  68     int_or_none,
  69     join_nonempty,
  70     js_to_json,
  71     mimetype2ext,
  72     netrc_from_content,
  73     orderedSet,
  74     parse_bitrate,
  75     parse_codecs,
  76     parse_duration,
  77     parse_iso8601,
  78     parse_m3u8_attributes,
  79     parse_resolution,
  80     sanitize_filename,
  81     sanitize_url,
  82     smuggle_url,
  83     str_or_none,
  84     str_to_int,
  85     strip_or_none,
  86     traverse_obj,
  87     truncate_string,
  88     try_call,
  89     try_get,
  90     unescapeHTML,
  91     unified_strdate,
  92     unified_timestamp,
  93     url_basename,
  94     url_or_none,
  95     urlhandle_detect_ext,
  96     urljoin,
  97     variadic,
  98     xpath_element,
  99     xpath_text,
 100     xpath_with_ns,
 101 )
 102
 103
 104 class InfoExtractor:
 105     """Information Extractor class.
 106
 107     Information extractors are the classes that, given a URL, extract
 108     information about the video (or videos) the URL refers to. This
 109     information includes the real video URL, the video title, author and
 110     others. The information is stored in a dictionary which is then
 111     passed to the YoutubeDL. The YoutubeDL processes this
 112     information possibly downloading the video to the file system, among
 113     other possible outcomes.
 114
 115     The type field determines the type of the result.
 116     By far the most common value (and the default if _type is missing) is
 117     "video", which indicates a single video.
 118
 119     For a video, the dictionaries must include the following fields:
 120
 121     id:             Video identifier.
 122     title:          Video title, unescaped. Set to an empty string if video has
 123                     no title as opposed to "None" which signifies that the
 124                     extractor failed to obtain a title
 125
 126     Additionally, it must contain either a formats entry or a url one:
 127
 128     formats:        A list of dictionaries for each format available, ordered
 129                     from worst to best quality.
 130
 131                     Potential fields:
 132                     * url        The mandatory URL representing the media:
 133                                    for plain file media - HTTP URL of this file,
 134                                    for RTMP - RTMP URL,
 135                                    for HLS - URL of the M3U8 media playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH
 138                                      - HTTP URL to plain file media (in case of
 139                                        unfragmented media)
 140                                      - URL of the MPD manifest or base URL
 141                                        representing the media if MPD manifest
 142                                        is parsed from a string (in case of
 143                                        fragmented media)
 144                                    for MSS - URL of the ISM manifest.
 145                     * request_data  Data to send in POST request to the URL
 146                     * manifest_url
 147                                  The URL of the manifest file in case of
 148                                  fragmented media:
 149                                    for HLS - URL of the M3U8 master playlist,
 150                                    for HDS - URL of the F4M manifest,
 151                                    for DASH - URL of the MPD manifest,
 152                                    for MSS - URL of the ISM manifest.
 153                     * manifest_stream_number  (For internal use only)
 154                                  The index of the stream in the manifest file
 155                     * ext        Will be calculated from URL if missing
 156                     * format     A human-readable description of the format
 157                                  ("mp4 container with h264/opus").
  158                                  Calculated from the format_id, width, height,
 159                                  and format_note fields if missing.
 160                     * format_id  A short description of the format
 161                                  ("mp4_h264_opus" or "19").
 162                                 Technically optional, but strongly recommended.
 163                     * format_note Additional info about the format
 164                                  ("3D" or "DASH video")
 165                     * width      Width of the video, if known
 166                     * height     Height of the video, if known
 167                     * aspect_ratio  Aspect ratio of the video, if known
 168                                  Automatically calculated from width and height
 169                     * resolution Textual description of width and height
 170                                  Automatically calculated from width and height
 171                     * dynamic_range The dynamic range of the video. One of:
  172                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 173                     * tbr        Average bitrate of audio and video in KBit/s
 174                     * abr        Average audio bitrate in KBit/s
 175                     * acodec     Name of the audio codec in use
 176                     * asr        Audio sampling rate in Hertz
 177                     * audio_channels  Number of audio channels
 178                     * vbr        Average video bitrate in KBit/s
 179                     * fps        Frame rate
 180                     * vcodec     Name of the video codec in use
 181                     * container  Name of the container format
 182                     * filesize   The number of bytes, if known in advance
 183                     * filesize_approx  An estimate for the number of bytes
 184                     * player_url SWF Player URL (used for rtmpdump).
 185                     * protocol   The protocol that will be used for the actual
 186                                  download, lower-case. One of "http", "https" or
 187                                  one of the protocols defined in downloader.PROTOCOL_MAP
 188                     * fragment_base_url
 189                                  Base URL for fragments. Each fragment's path
 190                                  value (if present) will be relative to
 191                                  this URL.
 192                     * fragments  A list of fragments of a fragmented media.
 193                                  Each fragment entry must contain either an url
 194                                  or a path. If an url is present it should be
 195                                  considered by a client. Otherwise both path and
 196                                  fragment_base_url must be present. Here is
 197                                  the list of all potential fields:
 198                                  * "url" - fragment's URL
 199                                  * "path" - fragment's path relative to
 200                                             fragment_base_url
 201                                  * "duration" (optional, int or float)
 202                                  * "filesize" (optional, int)
 203                     * is_from_start  Is a live format that can be downloaded
 204                                 from the start. Boolean
 205                     * preference Order number of this format. If this field is
 206                                  present and not None, the formats get sorted
 207                                  by this field, regardless of all other values.
 208                                  -1 for default (order by other properties),
 209                                  -2 or smaller for less than default.
 210                                  < -1000 to hide the format (if there is
 211                                     another one which is strictly better)
 212                     * language   Language code, e.g. "de" or "en-US".
 213                     * language_preference  Is this in the language mentioned in
 214                                  the URL?
 215                                  10 if it's what the URL is about,
 216                                  -1 for default (don't know),
 217                                  -10 otherwise, other values reserved for now.
 218                     * quality    Order number of the video quality of this
 219                                  format, irrespective of the file format.
 220                                  -1 for default (order by other properties),
 221                                  -2 or smaller for less than default.
 222                     * source_preference  Order number for this video source
 223                                   (quality takes higher priority)
 224                                  -1 for default (order by other properties),
 225                                  -2 or smaller for less than default.
 226                     * http_headers  A dictionary of additional HTTP headers
 227                                  to add to the request.
 228                     * stretched_ratio  If given and not 1, indicates that the
 229                                  video's pixels are not square.
 230                                  width : height ratio as float.
 231                     * no_resume  The server does not support resuming the
 232                                  (HTTP or RTMP) download. Boolean.
 233                     * has_drm    True if the format has DRM and cannot be downloaded.
 234                                  'maybe' if the format may have DRM and has to be tested before download.
 235                     * extra_param_to_segment_url  A query string to append to each
 236                                  fragment's URL, or to update each existing query string
 237                                  with. Only applied by the native HLS/DASH downloaders.
 238                     * hls_aes    A dictionary of HLS AES-128 decryption information
 239                                  used by the native HLS downloader to override the
 240                                  values in the media playlist when an '#EXT-X-KEY' tag
 241                                  is present in the playlist:
 242                                  * uri  The URI from which the key will be downloaded
 243                                  * key  The key (as hex) used to decrypt fragments.
 244                                         If `key` is given, any key URI will be ignored
 245                                  * iv   The IV (as hex) used to decrypt fragments
 246                     * downloader_options  A dictionary of downloader options
 247                                  (For internal use only)
 248                                  * http_chunk_size Chunk size for HTTP downloads
 249                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 250                     RTMP formats can also have the additional fields: page_url,
 251                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 252                     rtmp_protocol, rtmp_real_time
 253
 254     url:            Final video URL.
 255     ext:            Video filename extension.
 256     format:         The video format, defaults to ext (used for --get-format)
 257     player_url:     SWF Player URL (used for rtmpdump).
 258
 259     The following fields are optional:
 260
 261     direct:         True if a direct video file was given (must only be set by GenericIE)
 262     alt_title:      A secondary title of the video.
 263     display_id      An alternative identifier for the video, not necessarily
 264                     unique, but available before title. Typically, id is
 265                     something like "4234987", title "Dancing naked mole rats",
 266                     and display_id "dancing-naked-mole-rats"
 267     thumbnails:     A list of dictionaries, with the following entries:
 268                         * "id" (optional, string) - Thumbnail format ID
 269                         * "url"
 270                         * "preference" (optional, int) - quality of the image
 271                         * "width" (optional, int)
 272                         * "height" (optional, int)
 273                         * "resolution" (optional, string "{width}x{height}",
 274                                         deprecated)
 275                         * "filesize" (optional, int)
 276                         * "http_headers" (dict) - HTTP headers for the request
 277     thumbnail:      Full URL to a video thumbnail image.
 278     description:    Full video description.
 279     uploader:       Full name of the video uploader.
 280     license:        License name the video is licensed under.
 281     creator:        The creator of the video.
 282     timestamp:      UNIX timestamp of the moment the video was uploaded
 283     upload_date:    Video upload date in UTC (YYYYMMDD).
 284                     If not explicitly set, calculated from timestamp
 285     release_timestamp: UNIX timestamp of the moment the video was released.
 286                     If it is not clear whether to use timestamp or this, use the former
 287     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 288                     If not explicitly set, calculated from release_timestamp
 289     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 290     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 291                     If not explicitly set, calculated from modified_timestamp
 292     uploader_id:    Nickname or id of the video uploader.
 293     uploader_url:   Full URL to a personal webpage of the video uploader.
 294     channel:        Full name of the channel the video is uploaded on.
 295                     Note that channel fields may or may not repeat uploader
 296                     fields. This depends on a particular extractor.
 297     channel_id:     Id of the channel.
 298     channel_url:    Full URL to a channel webpage.
 299     channel_follower_count: Number of followers of the channel.
 300     channel_is_verified: Whether the channel is verified on the platform.
 301     location:       Physical location where the video was filmed.
 302     subtitles:      The available subtitles as a dictionary in the format
 303                     {tag: subformats}. "tag" is usually a language code, and
 304                     "subformats" is a list sorted from lower to higher
 305                     preference, each element is a dictionary with the "ext"
 306                     entry and one of:
 307                         * "data": The subtitles file contents
 308                         * "url": A URL pointing to the subtitles file
 309                     It can optionally also have:
 310                         * "name": Name or description of the subtitles
 311                         * "http_headers": A dictionary of additional HTTP headers
 312                                   to add to the request.
 313                     "ext" will be calculated from URL if missing
 314     automatic_captions: Like 'subtitles'; contains automatically generated
 315                     captions instead of normal subtitles
 316     duration:       Length of the video in seconds, as an integer or float.
 317     view_count:     How many users have watched the video on the platform.
 318     concurrent_view_count: How many users are currently watching the video on the platform.
 319     like_count:     Number of positive ratings of the video
 320     dislike_count:  Number of negative ratings of the video
 321     repost_count:   Number of reposts of the video
  322     average_rating: Average rating given by users, the scale used depends on the webpage
 323     comment_count:  Number of comments on the video
 324     comments:       A list of comments, each with one or more of the following
 325                     properties (all but one of text or html optional):
 326                         * "author" - human-readable name of the comment author
 327                         * "author_id" - user ID of the comment author
 328                         * "author_thumbnail" - The thumbnail of the comment author
 329                         * "author_url" - The url to the comment author's page
 330                         * "author_is_verified" - Whether the author is verified
 331                                                  on the platform
 332                         * "author_is_uploader" - Whether the comment is made by
 333                                                  the video uploader
 334                         * "id" - Comment ID
 335                         * "html" - Comment as HTML
 336                         * "text" - Plain text of the comment
 337                         * "timestamp" - UNIX timestamp of comment
 338                         * "parent" - ID of the comment this one is replying to.
 339                                      Set to "root" to indicate that this is a
 340                                      comment to the original video.
 341                         * "like_count" - Number of positive ratings of the comment
 342                         * "dislike_count" - Number of negative ratings of the comment
 343                         * "is_favorited" - Whether the comment is marked as
 344                                            favorite by the video uploader
 345                         * "is_pinned" - Whether the comment is pinned to
 346                                         the top of the comments
 347     age_limit:      Age restriction for the video, as an integer (years)
 348     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 349                     should allow to get the same result again. (It will be set
 350                     by YoutubeDL if it's missing)
 351     categories:     A list of categories that the video falls in, for example
 352                     ["Sports", "Berlin"]
 353     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 354     cast:           A list of the video cast
 355     is_live:        True, False, or None (=unknown). Whether this video is a
 356                     live stream that goes on instead of a fixed-length video.
 357     was_live:       True, False, or None (=unknown). Whether this video was
 358                     originally a live stream.
 359     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 360                     or 'post_live' (was live, but VOD is not yet processed)
 361                     If absent, automatically set from is_live, was_live
 362     start_time:     Time in seconds where the reproduction should start, as
 363                     specified in the URL.
 364     end_time:       Time in seconds where the reproduction should end, as
 365                     specified in the URL.
 366     chapters:       A list of dictionaries, with the following entries:
 367                         * "start_time" - The start time of the chapter in seconds
 368                         * "end_time" - The end time of the chapter in seconds
 369                         * "title" (optional, string)
 370     heatmap:        A list of dictionaries, with the following entries:
 371                         * "start_time" - The start time of the data point in seconds
 372                         * "end_time" - The end time of the data point in seconds
 373                         * "value" - The normalized value of the data point (float between 0 and 1)
 374     playable_in_embed: Whether this video is allowed to play in embedded
 375                     players on other sites. Can be True (=always allowed),
 376                     False (=never allowed), None (=unknown), or a string
 377                     specifying the criteria for embedability; e.g. 'whitelist'
 378     availability:   Under what condition the video is available. One of
 379                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 380                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 381                     to set it
 382     _old_archive_ids: A list of old archive ids needed for backward compatibility
 383     _format_sort_fields: A list of fields to use for sorting formats
 384     __post_extractor: A function to be called just before the metadata is
 385                     written to either disk, logger or console. The function
 386                     must return a dict which will be added to the info_dict.
  387                     This is useful for additional information that is
 388                     time-consuming to extract. Note that the fields thus
 389                     extracted will not be available to output template and
 390                     match_filter. So, only "comments" and "comment_count" are
 391                     currently allowed to be extracted via this method.
 392
 393     The following fields should only be used when the video belongs to some logical
 394     chapter or section:
 395
 396     chapter:        Name or title of the chapter the video belongs to.
 397     chapter_number: Number of the chapter the video belongs to, as an integer.
 398     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 399
 400     The following fields should only be used when the video is an episode of some
 401     series, programme or podcast:
 402
 403     series:         Title of the series or programme the video episode belongs to.
 404     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 405     season:         Title of the season the video episode belongs to.
 406     season_number:  Number of the season the video episode belongs to, as an integer.
 407     season_id:      Id of the season the video episode belongs to, as a unicode string.
 408     episode:        Title of the video episode. Unlike mandatory video title field,
 409                     this field should denote the exact title of the video episode
 410                     without any kind of decoration.
 411     episode_number: Number of the video episode within a season, as an integer.
 412     episode_id:     Id of the video episode, as a unicode string.
 413
 414     The following fields should only be used when the media is a track or a part of
 415     a music album:
 416
 417     track:          Title of the track.
 418     track_number:   Number of the track within an album or a disc, as an integer.
 419     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 420                     as a unicode string.
 421     artist:         Artist(s) of the track.
 422     genre:          Genre(s) of the track.
 423     album:          Title of the album the track belongs to.
 424     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
  425     album_artist:   List of all artists that appeared on the album (e.g.
 426                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 427                     and compilations).
 428     disc_number:    Number of the disc or other physical medium the track belongs to,
 429                     as an integer.
 430     release_year:   Year (YYYY) when the album was released.
 431     composer:       Composer of the piece
 432
 433     The following fields should only be set for clips that should be cut from the original video:
 434
 435     section_start:  Start time of the section in seconds
 436     section_end:    End time of the section in seconds
 437
 438     The following fields should only be set for storyboards:
 439     rows:           Number of rows in each storyboard fragment, as an integer
 440     columns:        Number of columns in each storyboard fragment, as an integer
 441
 442     Unless mentioned otherwise, the fields should be Unicode strings.
 443
 444     Unless mentioned otherwise, None is equivalent to absence of information.
 445
 446
 447     _type "playlist" indicates multiple videos.
 448     There must be a key "entries", which is a list, an iterable, or a PagedList
 449     object, each element of which is a valid dictionary by this specification.
 450
 451     Additionally, playlists can have "id", "title", and any other relevant
 452     attributes with the same semantics as videos (see above).
 453
 454     It can also have the following optional fields:
 455
 456     playlist_count: The total number of videos in a playlist. If not given,
 457                     YoutubeDL tries to calculate it from "entries"
 458
 459
 460     _type "multi_video" indicates that there are multiple videos that
  461     form a single show, for example multiple acts of an opera or TV episode.
 462     It must have an entries key like a playlist and contain all the keys
 463     required for a video at the same time.
 464
 465
 466     _type "url" indicates that the video must be extracted from another
 467     location, possibly by a different extractor. Its only required key is:
 468     "url" - the next URL to extract.
 469     The key "ie_key" can be set to the class name (minus the trailing "IE",
 470     e.g. "Youtube") if the extractor class is known in advance.
 471     Additionally, the dictionary may have any properties of the resolved entity
 472     known in advance, for example "title" if the title of the referred video is
 473     known ahead of time.
 474
 475
 476     _type "url_transparent" entities have the same specification as "url", but
 477     indicate that the given additional information is more precise than the one
 478     associated with the resolved URL.
 479     This is useful when a site employs a video service that hosts the video and
 480     its technical metadata, but that video service does not embed a useful
 481     title, description etc.
 482
 483
 484     Subclasses of this should also be added to the list of extractors and
 485     should define _VALID_URL as a regexp or a Sequence of regexps, and
 486     re-define the _real_extract() and (optionally) _real_initialize() methods.
 487
 488     Subclasses may also override suitable() if necessary, but ensure the function
 489     signature is preserved and that this function imports everything it needs
 490     (except other extractors), so that lazy_extractors works correctly.
 491
 492     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 493     the HTML of Generic webpages. It may also override _extract_embed_urls
 494     or _extract_from_webpage as necessary. While these are normally classmethods,
 495     _extract_from_webpage is allowed to be an instance method.
 496
 497     _extract_from_webpage may raise self.StopExtraction() to stop further
 498     processing of the webpage and obtain exclusive rights to it. This is useful
 499     when the extractor cannot reliably be matched using just the URL,
 500     e.g. invidious/peertube instances
 501
 502     Embed-only extractors can be defined by setting _VALID_URL = False.
 503
 504     To support username + password (or netrc) login, the extractor must define a
 505     _NETRC_MACHINE and re-define _perform_login(username, password) and
 506     (optionally) _initialize_pre_login() methods. The _perform_login method will
 507     be called between _initialize_pre_login and _real_initialize if credentials
 508     are passed by the user. In cases where it is necessary to have the login
 509     process as part of the extraction rather than initialization, _perform_login
 510     can be left undefined.
 511
 512     _GEO_BYPASS attribute may be set to False in order to disable
 513     geo restriction bypass mechanisms for a particular extractor.
 514     Though it won't disable explicit geo restriction bypass based on
 515     country code provided with geo_bypass_country.
 516
 517     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 518     countries for this extractor. One of these countries will be used by
 519     geo restriction bypass mechanism right away in order to bypass
 520     geo restriction, of course, if the mechanism is not disabled.
 521
 522     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 523     IP blocks in CIDR notation for this extractor. One of these IP blocks
 524     will be used by geo restriction bypass mechanism similarly
 525     to _GEO_COUNTRIES.
 526
 527     The _ENABLED attribute should be set to False for IEs that
 528     are disabled by default and must be explicitly enabled.
 529
 530     The _WORKING attribute should be set to False for broken IEs
 531     in order to warn the users and skip the tests.
 532     """
 533
    # Class-level defaults; __init__ resets _ready and _x_forwarded_for_ip on each new instance
    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms for this extractor
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries for this extractor
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
    # Set to False for IEs that are disabled by default and must be explicitly enabled
    _ENABLED = True
    # Must be defined to support username + password (or netrc) login;
    # supports_login() reports the truthiness of this value
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    # A regexp or a sequence of regexps defined by subclasses;
    # False marks an embed-only extractor (see _match_valid_url)
    _VALID_URL = None
    # Regexes searched for in the HTML of Generic webpages
    # NOTE(review): this list object is shared by every subclass that does not override it
    _EMBED_REGEX = []
 547
 548     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 549         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 550         return {
 551             None: '',
 552             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 553             'password': f'Use {password_hint}',
 554             'cookies': (
 555                 'Use --cookies-from-browser or --cookies for the authentication. '
 556                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 557         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 558
 559     def __init__(self, downloader=None):
 560         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 561         If a downloader is not passed during initialization,
 562         it must be set using "set_downloader()" before "extract()" is called"""
 563         self._ready = False
 564         self._x_forwarded_for_ip = None
 565         self._printed_messages = set()
 566         self.set_downloader(downloader)
 567
 568     @classmethod
 569     def _match_valid_url(cls, url):
 570         if cls._VALID_URL is False:
 571             return None
 572         # This does not use has/getattr intentionally - we want to know whether
 573         # we have cached the regexp for *this* class, whereas getattr would also
 574         # match the superclass
 575         if '_VALID_URL_RE' not in cls.__dict__:
 576             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 577         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 578
 579     @classmethod
 580     def suitable(cls, url):
 581         """Receives a URL and returns True if suitable for this IE."""
 582         # This function must import everything it needs (except other extractors),
 583         # so that lazy_extractors works correctly
 584         return cls._match_valid_url(url) is not None
 585
 586     @classmethod
 587     def _match_id(cls, url):
 588         return cls._match_valid_url(url).group('id')
 589
 590     @classmethod
 591     def get_temp_id(cls, url):
 592         try:
 593             return cls._match_id(url)
 594         except (IndexError, AttributeError):
 595             return None
 596
 597     @classmethod
 598     def working(cls):
 599         """Getter method for _WORKING."""
 600         return cls._WORKING
 601
 602     @classmethod
 603     def supports_login(cls):
 604         return bool(cls._NETRC_MACHINE)
 605
 606     def initialize(self):
 607         """Initializes an instance (authentication, etc)."""
 608         self._printed_messages = set()
 609         self._initialize_geo_bypass({
 610             'countries': self._GEO_COUNTRIES,
 611             'ip_blocks': self._GEO_IP_BLOCKS,
 612         })
 613         if not self._ready:
 614             self._initialize_pre_login()
 615             if self.supports_login():
 616                 username, password = self._get_login_info()
 617                 if username:
 618                     self._perform_login(username, password)
 619             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 620                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 621             self._real_initialize()
 622             self._ready = True
 623
 624     def _initialize_geo_bypass(self, geo_bypass_context):
 625         """
 626         Initialize geo restriction bypass mechanism.
 627
 628         This method is used to initialize geo bypass mechanism based on faking
 629         X-Forwarded-For HTTP header. A random country from provided country list
 630         is selected and a random IP belonging to this country is generated. This
 631         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 632         HTTP requests.
 633
 634         This method will be used for initial geo bypass mechanism initialization
 635         during the instance initialization with _GEO_COUNTRIES and
 636         _GEO_IP_BLOCKS.
 637
 638         You may also manually call it from extractor's code if geo bypass
 639         information is not available beforehand (e.g. obtained during
 640         extraction) or due to some other reason. In this case you should pass
 641         this information in geo bypass context passed as first argument. It may
 642         contain following fields:
 643
 644         countries:  List of geo unrestricted countries (similar
 645                     to _GEO_COUNTRIES)
 646         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 647                     (similar to _GEO_IP_BLOCKS)
 648
 649         """
 650         if not self._x_forwarded_for_ip:
 651
 652             # Geo bypass mechanism is explicitly disabled by user
 653             if not self.get_param('geo_bypass', True):
 654                 return
 655
 656             if not geo_bypass_context:
 657                 geo_bypass_context = {}
 658
 659             # Backward compatibility: previously _initialize_geo_bypass
 660             # expected a list of countries, some 3rd party code may still use
 661             # it this way
 662             if isinstance(geo_bypass_context, (list, tuple)):
 663                 geo_bypass_context = {
 664                     'countries': geo_bypass_context,
 665                 }
 666
 667             # The whole point of geo bypass mechanism is to fake IP
 668             # as X-Forwarded-For HTTP header based on some IP block or
 669             # country code.
 670
 671             # Path 1: bypassing based on IP block in CIDR notation
 672
 673             # Explicit IP block specified by user, use it right away
 674             # regardless of whether extractor is geo bypassable or not
 675             ip_block = self.get_param('geo_bypass_ip_block', None)
 676
 677             # Otherwise use random IP block from geo bypass context but only
 678             # if extractor is known as geo bypassable
 679             if not ip_block:
 680                 ip_blocks = geo_bypass_context.get('ip_blocks')
 681                 if self._GEO_BYPASS and ip_blocks:
 682                     ip_block = random.choice(ip_blocks)
 683
 684             if ip_block:
 685                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 686                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 687                 return
 688
 689             # Path 2: bypassing based on country code
 690
 691             # Explicit country code specified by user, use it right away
 692             # regardless of whether extractor is geo bypassable or not
 693             country = self.get_param('geo_bypass_country', None)
 694
 695             # Otherwise use random country code from geo bypass context but
 696             # only if extractor is known as geo bypassable
 697             if not country:
 698                 countries = geo_bypass_context.get('countries')
 699                 if self._GEO_BYPASS and countries:
 700                     country = random.choice(countries)
 701
 702             if country:
 703                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 704                 self._downloader.write_debug(
 705                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 706
 707     def extract(self, url):
 708         """Extracts URL information and returns it in list of dicts."""
 709         try:
 710             for _ in range(2):
 711                 try:
 712                     self.initialize()
 713                     self.to_screen('Extracting URL: %s' % (
 714                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 715                     ie_result = self._real_extract(url)
 716                     if ie_result is None:
 717                         return None
 718                     if self._x_forwarded_for_ip:
 719                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 720                     subtitles = ie_result.get('subtitles') or {}
 721                     if 'no-live-chat' in self.get_param('compat_opts'):
 722                         for lang in ('live_chat', 'comments', 'danmaku'):
 723                             subtitles.pop(lang, None)
 724                     return ie_result
 725                 except GeoRestrictedError as e:
 726                     if self.__maybe_fake_ip_and_retry(e.countries):
 727                         continue
 728                     raise
 729         except UnsupportedError:
 730             raise
 731         except ExtractorError as e:
 732             e.video_id = e.video_id or self.get_temp_id(url)
 733             e.ie = e.ie or self.IE_NAME,
 734             e.traceback = e.traceback or sys.exc_info()[2]
 735             raise
 736         except IncompleteRead as e:
 737             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 738         except (KeyError, StopIteration) as e:
 739             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 740
 741     def __maybe_fake_ip_and_retry(self, countries):
 742         if (not self.get_param('geo_bypass_country', None)
 743                 and self._GEO_BYPASS
 744                 and self.get_param('geo_bypass', True)
 745                 and not self._x_forwarded_for_ip
 746                 and countries):
 747             country_code = random.choice(countries)
 748             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 749             if self._x_forwarded_for_ip:
 750                 self.report_warning(
 751                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 752                     % (self._x_forwarded_for_ip, country_code.upper()))
 753                 return True
 754         return False
 755
 756     def set_downloader(self, downloader):
 757         """Sets a YoutubeDL instance as the downloader for this IE."""
 758         self._downloader = downloader
 759
 760     @property
 761     def cache(self):
 762         return self._downloader.cache
 763
 764     @property
 765     def cookiejar(self):
 766         return self._downloader.cookiejar
 767
 768     def _initialize_pre_login(self):
 769         """ Initialization before login. Redefine in subclasses."""
 770         pass
 771
 772     def _perform_login(self, username, password):
 773         """ Login with username and password. Redefine in subclasses."""
 774         pass
 775
 776     def _real_initialize(self):
 777         """Real initialization process. Redefine in subclasses."""
 778         pass
 779
 780     def _real_extract(self, url):
 781         """Real extraction process. Redefine in subclasses."""
 782         raise NotImplementedError('This method must be implemented by subclasses')
 783
 784     @classmethod
 785     def ie_key(cls):
 786         """A string for getting the InfoExtractor with get_info_extractor"""
 787         return cls.__name__[:-2]
 788
 789     @classproperty
 790     def IE_NAME(cls):
 791         return cls.__name__[:-2]
 792
 793     @staticmethod
 794     def __can_accept_status_code(err, expected_status):
 795         assert isinstance(err, HTTPError)
 796         if expected_status is None:
 797             return False
 798         elif callable(expected_status):
 799             return expected_status(err.status) is True
 800         else:
 801             return err.status in variadic(expected_status)
 802
 803     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 804         if isinstance(url_or_request, urllib.request.Request):
 805             self._downloader.deprecation_warning(
 806                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 807                 'Use yt_dlp.networking.common.Request instead.')
 808             url_or_request = urllib_req_to_req(url_or_request)
 809         elif not isinstance(url_or_request, Request):
 810             url_or_request = Request(url_or_request)
 811
 812         url_or_request.update(data=data, headers=headers, query=query)
 813         return url_or_request
 814
 815     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 816         """
 817         Return the response handle.
 818
 819         See _download_webpage docstring for arguments specification.
 820         """
 821         if not self._downloader._first_webpage_request:
 822             sleep_interval = self.get_param('sleep_interval_requests') or 0
 823             if sleep_interval > 0:
 824                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 825                 time.sleep(sleep_interval)
 826         else:
 827             self._downloader._first_webpage_request = False
 828
 829         if note is None:
 830             self.report_download_webpage(video_id)
 831         elif note is not False:
 832             if video_id is None:
 833                 self.to_screen(str(note))
 834             else:
 835                 self.to_screen(f'{video_id}: {note}')
 836
 837         # Some sites check X-Forwarded-For HTTP header in order to figure out
 838         # the origin of the client behind proxy. This allows bypassing geo
 839         # restriction by faking this header's value to IP that belongs to some
 840         # geo unrestricted country. We will do so once we encounter any
 841         # geo restriction error.
 842         if self._x_forwarded_for_ip:
 843             headers = (headers or {}).copy()
 844             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 845
 846         try:
 847             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 848         except network_exceptions as err:
 849             if isinstance(err, HTTPError):
 850                 if self.__can_accept_status_code(err, expected_status):
 851                     return err.response
 852
 853             if errnote is False:
 854                 return False
 855             if errnote is None:
 856                 errnote = 'Unable to download webpage'
 857
 858             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 859             if fatal:
 860                 raise ExtractorError(errmsg, cause=err)
 861             else:
 862                 self.report_warning(errmsg)
 863                 return False
 864
 865     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 866                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 867         """
 868         Return a tuple (page content as string, URL handle).
 869
 870         Arguments:
 871         url_or_request -- plain text URL as a string or
 872             a urllib.request.Request object
 873         video_id -- Video/playlist/item identifier (string)
 874
 875         Keyword arguments:
 876         note -- note printed before downloading (string)
 877         errnote -- note printed in case of an error (string)
 878         fatal -- flag denoting whether error should be considered fatal,
 879             i.e. whether it should cause ExtractionError to be raised,
 880             otherwise a warning will be reported and extraction continued
 881         encoding -- encoding for a page content decoding, guessed automatically
 882             when not explicitly specified
 883         data -- POST data (bytes)
 884         headers -- HTTP headers (dict)
 885         query -- URL query (dict)
 886         expected_status -- allows to accept failed HTTP requests (non 2xx
 887             status code) by explicitly specifying a set of accepted status
 888             codes. Can be any of the following entities:
 889                 - an integer type specifying an exact failed status code to
 890                   accept
 891                 - a list or a tuple of integer types specifying a list of
 892                   failed status codes to accept
 893                 - a callable accepting an actual failed status code and
 894                   returning True if it should be accepted
 895             Note that this argument does not affect success status codes (2xx)
 896             which are always accepted.
 897         """
 898
 899         # Strip hashes from the URL (#1038)
 900         if isinstance(url_or_request, str):
 901             url_or_request = url_or_request.partition('#')[0]
 902
 903         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 904         if urlh is False:
 905             assert not fatal
 906             return False
 907         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 908         return (content, urlh)
 909
 910     @staticmethod
 911     def _guess_encoding_from_content(content_type, webpage_bytes):
 912         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 913         if m:
 914             encoding = m.group(1)
 915         else:
 916             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 917                           webpage_bytes[:1024])
 918             if m:
 919                 encoding = m.group(1).decode('ascii')
 920             elif webpage_bytes.startswith(b'\xff\xfe'):
 921                 encoding = 'utf-16'
 922             else:
 923                 encoding = 'utf-8'
 924
 925         return encoding
 926
 927     def __check_blocked(self, content):
 928         first_block = content[:512]
 929         if ('<title>Access to this site is blocked</title>' in content
 930                 and 'Websense' in first_block):
 931             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 932             blocked_iframe = self._html_search_regex(
 933                 r'<iframe src="([^"]+)"', content,
 934                 'Websense information URL', default=None)
 935             if blocked_iframe:
 936                 msg += ' Visit %s for more details' % blocked_iframe
 937             raise ExtractorError(msg, expected=True)
 938         if '<title>The URL you requested has been blocked</title>' in first_block:
 939             msg = (
 940                 'Access to this webpage has been blocked by Indian censorship. '
 941                 'Use a VPN or proxy server (with --proxy) to route around it.')
 942             block_msg = self._html_search_regex(
 943                 r'</h1><p>(.*?)</p>',
 944                 content, 'block message', default=None)
 945             if block_msg:
 946                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 947             raise ExtractorError(msg, expected=True)
 948         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 949                 and 'blocklist.rkn.gov.ru' in content):
 950             raise ExtractorError(
 951                 'Access to this webpage has been blocked by decision of the Russian government. '
 952                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 953                 expected=True)
 954
 955     def _request_dump_filename(self, url, video_id):
 956         basen = f'{video_id}_{url}'
 957         trim_length = self.get_param('trim_file_name') or 240
 958         if len(basen) > trim_length:
 959             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 960             basen = basen[:trim_length - len(h)] + h
 961         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 962         # Working around MAX_PATH limitation on Windows (see
 963         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 964         if compat_os_name == 'nt':
 965             absfilepath = os.path.abspath(filename)
 966             if len(absfilepath) > 259:
 967                 filename = fR'\\?\{absfilepath}'
 968         return filename
 969
 970     def __decode_webpage(self, webpage_bytes, encoding, headers):
 971         if not encoding:
 972             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 973         try:
 974             return webpage_bytes.decode(encoding, 'replace')
 975         except LookupError:
 976             return webpage_bytes.decode('utf-8', 'replace')
 977
 978     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 979         webpage_bytes = urlh.read()
 980         if prefix is not None:
 981             webpage_bytes = prefix + webpage_bytes
 982         if self.get_param('dump_intermediate_pages', False):
 983             self.to_screen('Dumping request to ' + urlh.url)
 984             dump = base64.b64encode(webpage_bytes).decode('ascii')
 985             self._downloader.to_screen(dump)
 986         if self.get_param('write_pages'):
 987             filename = self._request_dump_filename(urlh.url, video_id)
 988             self.to_screen(f'Saving request to {filename}')
 989             with open(filename, 'wb') as outf:
 990                 outf.write(webpage_bytes)
 991
 992         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 993         self.__check_blocked(content)
 994
 995         return content
 996
 997     def __print_error(self, errnote, fatal, video_id, err):
 998         if fatal:
 999             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1000         elif errnote:
1001             self.report_warning(f'{video_id}: {errnote}: {err}')
1002
1003     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1004         if transform_source:
1005             xml_string = transform_source(xml_string)
1006         try:
1007             return compat_etree_fromstring(xml_string.encode('utf-8'))
1008         except xml.etree.ElementTree.ParseError as ve:
1009             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1010
1011     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1012         try:
1013             return json.loads(
1014                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1015         except ValueError as ve:
1016             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1017
1018     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1019         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1020
    def __create_download_methods(name, parser, note, errnote, return_value):
        # Factory called from the class body below (hence no self/cls argument).
        # Builds and returns the pair of methods
        # (`_download_<name>_handle`, `_download_<name>`):
        #   name          -- suffix used to name the generated methods
        #   parser        -- name of the method that parses the downloaded content,
        #                    or None to return the content unparsed
        #   note, errnote -- default note/errnote of the generated methods
        #   return_value  -- description of the result, used in the generated docstrings

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is only forwarded when it is False;
            # any other value is left to the parser's own default
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Returns (parsed content, URL handle), or False if the download failed non-fatally
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Returns the parsed content only, or False if the download failed non-fatally
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the load_pages param, a previously dumped response is read from disk
            # instead; if that file cannot be read, a warning is printed and the
            # request is performed as usual
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    # NB: the headers used for guessing the encoding are the request's ones
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method is called without transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            # (download_handle.__name__ has been set to that name by impersonate() below)
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Give the generated function its public name and a docstring
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1084
    # Generated download methods: each call yields a (`_download_*_handle`, `_download_*`) pair
    # that downloads a page and parses it with the parser method named in the second argument
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # No parser: only the content-returning method is kept, as the private helper behind _download_webpage
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1092
1093     def _download_webpage(
1094             self, url_or_request, video_id, note=None, errnote=None,
1095             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1096         """
1097         Return the data of the page as a string.
1098
1099         Keyword arguments:
1100         tries -- number of tries
1101         timeout -- sleep interval between tries
1102
1103         See _download_webpage_handle docstring for other arguments specification.
1104         """
1105
1106         R''' # NB: These are unused; should they be deprecated?
1107         if tries != 1:
1108             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1109         if timeout is NO_DEFAULT:
1110             timeout = 5
1111         else:
1112             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1113         '''
1114
1115         try_count = 0
1116         while True:
1117             try:
1118                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1119             except IncompleteRead as e:
1120                 try_count += 1
1121                 if try_count >= tries:
1122                     raise e
1123                 self._sleep(timeout, video_id)
1124
1125     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1126         idstr = format_field(video_id, None, '%s: ')
1127         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1128         if only_once:
1129             if f'WARNING: {msg}' in self._printed_messages:
1130                 return
1131             self._printed_messages.add(f'WARNING: {msg}')
1132         self._downloader.report_warning(msg, *args, **kwargs)
1133
1134     def to_screen(self, msg, *args, **kwargs):
1135         """Print msg to screen, prefixing it with '[ie_name]'"""
1136         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1137
1138     def write_debug(self, msg, *args, **kwargs):
1139         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1140
1141     def get_param(self, name, default=None, *args, **kwargs):
1142         if self._downloader:
1143             return self._downloader.params.get(name, default, *args, **kwargs)
1144         return default
1145
1146     def report_drm(self, video_id, partial=NO_DEFAULT):
1147         if partial is not NO_DEFAULT:
1148             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1149         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1150
1151     def report_extraction(self, id_or_name):
1152         """Report information extraction."""
1153         self.to_screen('%s: Extracting information' % id_or_name)
1154
1155     def report_download_webpage(self, video_id):
1156         """Report webpage download."""
1157         self.to_screen('%s: Downloading webpage' % video_id)
1158
1159     def report_age_confirmation(self):
1160         """Report attempt to confirm age."""
1161         self.to_screen('Confirming age')
1162
1163     def report_login(self):
1164         """Report attempt to log in."""
1165         self.to_screen('Logging in')
1166
1167     def raise_login_required(
1168             self, msg='This video is only available for registered users',
1169             metadata_available=False, method=NO_DEFAULT):
1170         if metadata_available and (
1171                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1172             self.report_warning(msg)
1173             return
1174         msg += format_field(self._login_hint(method), None, '. %s')
1175         raise ExtractorError(msg, expected=True)
1176
1177     def raise_geo_restricted(
1178             self, msg='This video is not available from your location due to geo restriction',
1179             countries=None, metadata_available=False):
1180         if metadata_available and (
1181                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1182             self.report_warning(msg)
1183         else:
1184             raise GeoRestrictedError(msg, countries=countries)
1185
1186     def raise_no_formats(self, msg, expected=False, video_id=None):
1187         if expected and (
1188                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1189             self.report_warning(msg, video_id)
1190         elif isinstance(msg, ExtractorError):
1191             raise msg
1192         else:
1193             raise ExtractorError(msg, expected=expected, video_id=video_id)
1194
1195     # Methods for following #608
1196     @staticmethod
1197     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1198         """Returns a URL that points to a page that should be processed"""
1199         if ie is not None:
1200             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1201         if video_id is not None:
1202             kwargs['id'] = video_id
1203         if video_title is not None:
1204             kwargs['title'] = video_title
1205         return {
1206             **kwargs,
1207             '_type': 'url_transparent' if url_transparent else 'url',
1208             'url': url,
1209         }
1210
1211     @classmethod
1212     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1213                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1214         return cls.playlist_result(
1215             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1216             playlist_id, playlist_title, **kwargs)
1217
1218     @staticmethod
1219     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1220         """Returns a playlist"""
1221         if playlist_id:
1222             kwargs['id'] = playlist_id
1223         if playlist_title:
1224             kwargs['title'] = playlist_title
1225         if playlist_description is not None:
1226             kwargs['description'] = playlist_description
1227         return {
1228             **kwargs,
1229             '_type': 'multi_video' if multi_video else 'playlist',
1230             'entries': entries,
1231         }
1232
1233     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1234         """
1235         Perform a regex search on the given string, using a single or a list of
1236         patterns returning the first matching group.
1237         In case of failure return a default value or raise a WARNING or a
1238         RegexNotFoundError, depending on fatal, specifying the field name.
1239         """
1240         if string is None:
1241             mobj = None
1242         elif isinstance(pattern, (str, re.Pattern)):
1243             mobj = re.search(pattern, string, flags)
1244         else:
1245             for p in pattern:
1246                 mobj = re.search(p, string, flags)
1247                 if mobj:
1248                     break
1249
1250         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1251
1252         if mobj:
1253             if group is None:
1254                 # return the first matching group
1255                 return next(g for g in mobj.groups() if g is not None)
1256             elif isinstance(group, (list, tuple)):
1257                 return tuple(mobj.group(g) for g in group)
1258             else:
1259                 return mobj.group(group)
1260         elif default is not NO_DEFAULT:
1261             return default
1262         elif fatal:
1263             raise RegexNotFoundError('Unable to extract %s' % _name)
1264         else:
1265             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1266             return None
1267
1268     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1269                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1270         """Searches string for the JSON object specified by start_pattern"""
1271         # NB: end_pattern is only used to reduce the size of the initial match
1272         if default is NO_DEFAULT:
1273             default, has_default = {}, False
1274         else:
1275             fatal, has_default = False, True
1276
1277         json_string = self._search_regex(
1278             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1279             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1280         if not json_string:
1281             return default
1282
1283         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1284         try:
1285             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1286         except ExtractorError as e:
1287             if fatal:
1288                 raise ExtractorError(
1289                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1290             elif not has_default:
1291                 self.report_warning(
1292                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1293         return default
1294
1295     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1296         """
1297         Like _search_regex, but strips HTML tags and unescapes entities.
1298         """
1299         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1300         if isinstance(res, tuple):
1301             return tuple(map(clean_html, res))
1302         return clean_html(res)
1303
1304     def _get_netrc_login_info(self, netrc_machine=None):
1305         netrc_machine = netrc_machine or self._NETRC_MACHINE
1306
1307         cmd = self.get_param('netrc_cmd')
1308         if cmd:
1309             cmd = cmd.replace('{}', netrc_machine)
1310             self.to_screen(f'Executing command: {cmd}')
1311             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1312             if ret != 0:
1313                 raise OSError(f'Command returned error code {ret}')
1314             info = netrc_from_content(stdout).authenticators(netrc_machine)
1315
1316         elif self.get_param('usenetrc', False):
1317             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1318             if os.path.isdir(netrc_file):
1319                 netrc_file = os.path.join(netrc_file, '.netrc')
1320             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1321
1322         else:
1323             return None, None
1324         if not info:
1325             raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
1326         return info[0], info[2]
1327
1328     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1329         """
1330         Get the login info as (username, password)
1331         First look for the manually specified credentials using username_option
1332         and password_option as keys in params dictionary. If no such credentials
1333         are available try the netrc_cmd if it is defined or look in the
1334         netrc file using the netrc_machine or _NETRC_MACHINE value.
1335         If there's no info available, return (None, None)
1336         """
1337
1338         username = self.get_param(username_option)
1339         if username is not None:
1340             password = self.get_param(password_option)
1341         else:
1342             try:
1343                 username, password = self._get_netrc_login_info(netrc_machine)
1344             except (OSError, netrc.NetrcParseError) as err:
1345                 self.report_warning(f'Failed to parse .netrc: {err}')
1346                 return None, None
1347         return username, password
1348
1349     def _get_tfa_info(self, note='two-factor verification code'):
1350         """
1351         Get the two-factor authentication info
1352         TODO - asking the user will be required for sms/phone verify
1353         currently just uses the command line option
1354         If there's no info available, return None
1355         """
1356
1357         tfa = self.get_param('twofactor')
1358         if tfa is not None:
1359             return tfa
1360
1361         return getpass.getpass('Type %s and press [Return]: ' % note)
1362
1363     # Helper functions for extracting OpenGraph info
1364     @staticmethod
1365     def _og_regexes(prop):
1366         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1367         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1368                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1369         template = r'<meta[^>]+?%s[^>]+?%s'
1370         return [
1371             template % (property_re, content_re),
1372             template % (content_re, property_re),
1373         ]
1374
1375     @staticmethod
1376     def _meta_regex(prop):
1377         return r'''(?isx)<meta
1378                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1379                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1380
1381     def _og_search_property(self, prop, html, name=None, **kargs):
1382         prop = variadic(prop)
1383         if name is None:
1384             name = 'OpenGraph %s' % prop[0]
1385         og_regexes = []
1386         for p in prop:
1387             og_regexes.extend(self._og_regexes(p))
1388         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1389         if escaped is None:
1390             return None
1391         return unescapeHTML(escaped)
1392
1393     def _og_search_thumbnail(self, html, **kargs):
1394         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1395
1396     def _og_search_description(self, html, **kargs):
1397         return self._og_search_property('description', html, fatal=False, **kargs)
1398
1399     def _og_search_title(self, html, *, fatal=False, **kargs):
1400         return self._og_search_property('title', html, fatal=fatal, **kargs)
1401
1402     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1403         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1404         if secure:
1405             regexes = self._og_regexes('video:secure_url') + regexes
1406         return self._html_search_regex(regexes, html, name, **kargs)
1407
1408     def _og_search_url(self, html, **kargs):
1409         return self._og_search_property('url', html, **kargs)
1410
1411     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1412         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1413
1414     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1415         name = variadic(name)
1416         if display_name is None:
1417             display_name = name[0]
1418         return self._html_search_regex(
1419             [self._meta_regex(n) for n in name],
1420             html, display_name, fatal=fatal, group='content', **kwargs)
1421
1422     def _dc_search_uploader(self, html):
1423         return self._html_search_meta('dc.creator', html, 'uploader')
1424
1425     @staticmethod
1426     def _rta_search(html):
1427         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1428         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1429                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1430                      html):
1431             return 18
1432
1433         # And then there are the jokers who advertise that they use RTA, but actually don't.
1434         AGE_LIMIT_MARKERS = [
1435             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1436             r'>[^<]*you acknowledge you are at least (\d+) years old',
1437             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1438         ]
1439
1440         age_limit = 0
1441         for marker in AGE_LIMIT_MARKERS:
1442             mobj = re.search(marker, html)
1443             if mobj:
1444                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1445         return age_limit
1446
1447     def _media_rating_search(self, html):
1448         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1449         rating = self._html_search_meta('rating', html)
1450
1451         if not rating:
1452             return None
1453
1454         RATING_TABLE = {
1455             'safe for kids': 0,
1456             'general': 8,
1457             '14 years': 14,
1458             'mature': 17,
1459             'restricted': 19,
1460         }
1461         return RATING_TABLE.get(rating.lower())
1462
1463     def _family_friendly_search(self, html):
1464         # See http://schema.org/VideoObject
1465         family_friendly = self._html_search_meta(
1466             'isFamilyFriendly', html, default=None)
1467
1468         if not family_friendly:
1469             return None
1470
1471         RATING_TABLE = {
1472             '1': 0,
1473             'true': 0,
1474             '0': 18,
1475             'false': 18,
1476         }
1477         return RATING_TABLE.get(family_friendly.lower())
1478
1479     def _twitter_search_player(self, html):
1480         return self._html_search_meta('twitter:player', html,
1481                                       'twitter card player')
1482
1483     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1484         """Yield all json ld objects in the html"""
1485         if default is not NO_DEFAULT:
1486             fatal = False
1487         for mobj in re.finditer(JSON_LD_RE, html):
1488             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1489             for json_ld in variadic(json_ld_item):
1490                 if isinstance(json_ld, dict):
1491                     yield json_ld
1492
1493     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1494         """Search for a video in any json ld in the html"""
1495         if default is not NO_DEFAULT:
1496             fatal = False
1497         info = self._json_ld(
1498             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1499             video_id, fatal=fatal, expected_type=expected_type)
1500         if info:
1501             return info
1502         if default is not NO_DEFAULT:
1503             return default
1504         elif fatal:
1505             raise RegexNotFoundError('Unable to extract JSON-LD')
1506         else:
1507             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1508             return {}
1509
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Collect metadata from JSON-LD data into an info dict.

        json_ld        Either a JSON string (parsed with self._parse_json using
                       video_id and fatal) or already parsed JSON-LD object(s)
        expected_type  If not None, entries whose '@type' does not contain it
                       are skipped, and the loop over the current list of
                       entries stops after the first entry that was processed

        Returns filter_dict(info), or {} when json_ld is empty.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared result; the nested helpers below all write into this dict
        info = {}

        # Last path segment of an interactionType -> prefix of the '<kind>_count' info field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            """Whether any of expected_types is contained in the '@type' value(s) of e"""
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            """Return the interactionType of e (its '@type' if it is a dict) via str_or_none"""
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            """Fill the '<kind>_count' fields of info from the interactionStatistic of e"""
            interaction_statistic = e.get('interactionStatistic')
            # A single counter may be given directly instead of a list of counters
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up, so full URLs work as well
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # The first counter seen for a given kind wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            """Set info['chapters'] from the 'Clip' parts in the hasPart of e"""
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk over (previous, current, next) triples to fill in missing
            # boundaries from the neighbouring chapters; a dummy chapter ending
            # at 0 precedes the first one. The last chapter has no successor,
            # so it is not visited here and is handled after the loop
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # info['duration'] is set by extract_video_object before this helper is called
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            """Update info with the fields of the VideoObject/AudioObject e"""
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            """Process every dict entry of json_ld, descending into '@graph' containers"""
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                if at_top_level and '@context' not in e:
                    continue
                # A top-level object holding nothing but '@context' and '@graph'
                # is a container; its entries are processed instead
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first element of 'video' / 'subjectOf' is considered here
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type keep collecting from later entries,
                    # otherwise the first matching entry ends this loop
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any other entry (or one handled above) may embed a VideoObject under 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1679
1680     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1681         return self._parse_json(
1682             self._search_regex(
1683                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1684                 webpage, 'next.js data', fatal=fatal, **kw),
1685             video_id, transform_source=transform_source, fatal=fatal)
1686
1687     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1688         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1689         rectx = re.escape(context_name)
1690         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1691         js, arg_keys, arg_vals = self._search_regex(
1692             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1693             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1694             default=NO_DEFAULT if fatal else (None, None, None))
1695         if js is None:
1696             return {}
1697
1698         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1699             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1700
1701         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1702         return traverse_obj(ret, traverse) or {}
1703
1704     @staticmethod
1705     def _hidden_inputs(html):
1706         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1707         hidden_inputs = {}
1708         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1709             attrs = extract_attributes(input)
1710             if not input:
1711                 continue
1712             if attrs.get('type') not in ('hidden', 'submit'):
1713                 continue
1714             name = attrs.get('name') or attrs.get('id')
1715             value = attrs.get('value')
1716             if name and value is not None:
1717                 hidden_inputs[name] = value
1718         return hidden_inputs
1719
1720     def _form_hidden_inputs(self, form_id, html):
1721         form = self._search_regex(
1722             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1723             html, '%s form' % form_id, group='form')
1724         return self._hidden_inputs(form)
1725
1726     @classproperty(cache=True)
1727     def FormatSort(cls):
1728         class FormatSort(FormatSorter):
1729             def __init__(ie, *args, **kwargs):
1730                 super().__init__(ie._downloader, *args, **kwargs)
1731
1732         deprecation_warning(
1733             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1734             'Use yt_dlp.utils.FormatSorter instead')
1735         return FormatSort
1736
1737     def _sort_formats(self, formats, field_preference=[]):
1738         if not field_preference:
1739             self._downloader.deprecation_warning(
1740                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1741             return
1742         self._downloader.deprecation_warning(
1743             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1744             'Return _format_sort_fields in the info_dict instead')
1745         if formats:
1746             formats[0]['__sort_fields'] = field_preference
1747
1748     def _check_formats(self, formats, video_id):
1749         if formats:
1750             formats[:] = filter(
1751                 lambda f: self._is_valid_url(
1752                     f['url'], video_id,
1753                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1754                 formats)
1755
1756     @staticmethod
1757     def _remove_duplicate_formats(formats):
1758         format_urls = set()
1759         unique_formats = []
1760         for f in formats:
1761             if f['url'] not in format_urls:
1762                 format_urls.add(f['url'])
1763                 unique_formats.append(f)
1764         formats[:] = unique_formats
1765
1766     def _is_valid_url(self, url, video_id, item='video', headers={}):
1767         url = self._proto_relative_url(url, scheme='http:')
1768         # For now assume non HTTP(S) URLs always valid
1769         if not (url.startswith('http://') or url.startswith('https://')):
1770             return True
1771         try:
1772             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1773             return True
1774         except ExtractorError as e:
1775             self.to_screen(
1776                 '%s: %s URL is invalid, skipping: %s'
1777                 % (video_id, item, error_to_compat_str(e.cause)))
1778             return False
1779
1780     def http_scheme(self):
1781         """ Either "http:" or "https:", depending on the user's preferences """
1782         return (
1783             'http:'
1784             if self.get_param('prefer_insecure', False)
1785             else 'https:')
1786
1787     def _proto_relative_url(self, url, scheme=None):
1788         scheme = scheme or self.http_scheme()
1789         assert scheme.endswith(':')
1790         return sanitize_url(url, scheme=scheme[:-1])
1791
1792     def _sleep(self, timeout, video_id, msg_template=None):
1793         if msg_template is None:
1794             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1795         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1796         self.to_screen(msg)
1797         time.sleep(timeout)
1798
1799     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1800                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1801                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1802         if self.get_param('ignore_no_formats_error'):
1803             fatal = False
1804
1805         res = self._download_xml_handle(
1806             manifest_url, video_id, 'Downloading f4m manifest',
1807             'Unable to download f4m manifest',
1808             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1809             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1810             transform_source=transform_source,
1811             fatal=fatal, data=data, headers=headers, query=query)
1812         if res is False:
1813             return []
1814
1815         manifest, urlh = res
1816         manifest_url = urlh.url
1817
1818         return self._parse_f4m_formats(
1819             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1820             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1821
1822     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1823                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1824                            fatal=True, m3u8_id=None):
1825         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1826             return []
1827
1828         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1829         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1830         if akamai_pv is not None and ';' in akamai_pv.text:
1831             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1832             if playerVerificationChallenge.strip() != '':
1833                 return []
1834
1835         formats = []
1836         manifest_version = '1.0'
1837         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1838         if not media_nodes:
1839             manifest_version = '2.0'
1840             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1841         # Remove unsupported DRM protected media from final formats
1842         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1843         media_nodes = remove_encrypted_media(media_nodes)
1844         if not media_nodes:
1845             return formats
1846
1847         manifest_base_url = get_base_url(manifest)
1848
1849         bootstrap_info = xpath_element(
1850             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1851             'bootstrap info', default=None)
1852
1853         vcodec = None
1854         mime_type = xpath_text(
1855             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1856             'base URL', default=None)
1857         if mime_type and mime_type.startswith('audio/'):
1858             vcodec = 'none'
1859
1860         for i, media_el in enumerate(media_nodes):
1861             tbr = int_or_none(media_el.attrib.get('bitrate'))
1862             width = int_or_none(media_el.attrib.get('width'))
1863             height = int_or_none(media_el.attrib.get('height'))
1864             format_id = join_nonempty(f4m_id, tbr or i)
1865             # If <bootstrapInfo> is present, the specified f4m is a
1866             # stream-level manifest, and only set-level manifests may refer to
1867             # external resources.  See section 11.4 and section 4 of F4M spec
1868             if bootstrap_info is None:
1869                 media_url = None
1870                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1871                 if manifest_version == '2.0':
1872                     media_url = media_el.attrib.get('href')
1873                 if media_url is None:
1874                     media_url = media_el.attrib.get('url')
1875                 if not media_url:
1876                     continue
1877                 manifest_url = (
1878                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1879                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1880                 # If media_url is itself a f4m manifest do the recursive extraction
1881                 # since bitrates in parent manifest (this one) and media_url manifest
1882                 # may differ leading to inability to resolve the format by requested
1883                 # bitrate in f4m downloader
1884                 ext = determine_ext(manifest_url)
1885                 if ext == 'f4m':
1886                     f4m_formats = self._extract_f4m_formats(
1887                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1888                         transform_source=transform_source, fatal=fatal)
1889                     # Sometimes stream-level manifest contains single media entry that
1890                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1891                     # At the same time parent's media entry in set-level manifest may
1892                     # contain it. We will copy it from parent in such cases.
1893                     if len(f4m_formats) == 1:
1894                         f = f4m_formats[0]
1895                         f.update({
1896                             'tbr': f.get('tbr') or tbr,
1897                             'width': f.get('width') or width,
1898                             'height': f.get('height') or height,
1899                             'format_id': f.get('format_id') if not tbr else format_id,
1900                             'vcodec': vcodec,
1901                         })
1902                     formats.extend(f4m_formats)
1903                     continue
1904                 elif ext == 'm3u8':
1905                     formats.extend(self._extract_m3u8_formats(
1906                         manifest_url, video_id, 'mp4', preference=preference,
1907                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1908                     continue
1909             formats.append({
1910                 'format_id': format_id,
1911                 'url': manifest_url,
1912                 'manifest_url': manifest_url,
1913                 'ext': 'flv' if bootstrap_info is not None else None,
1914                 'protocol': 'f4m',
1915                 'tbr': tbr,
1916                 'width': width,
1917                 'height': height,
1918                 'vcodec': vcodec,
1919                 'preference': preference,
1920                 'quality': quality,
1921             })
1922         return formats
1923
1924     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1925         return {
1926             'format_id': join_nonempty(m3u8_id, 'meta'),
1927             'url': m3u8_url,
1928             'ext': ext,
1929             'protocol': 'm3u8',
1930             'preference': preference - 100 if preference else -100,
1931             'quality': quality,
1932             'resolution': 'multiple',
1933             'format_note': 'Quality selection URL',
1934         }
1935
1936     def _report_ignoring_subs(self, name):
1937         self.report_warning(bug_reports_message(
1938             f'Ignoring subtitle tracks found in the {name} manifest; '
1939             'if any subtitle tracks are missing,'
1940         ), only_once=True)
1941
1942     def _extract_m3u8_formats(self, *args, **kwargs):
1943         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1944         if subs:
1945             self._report_ignoring_subs('HLS')
1946         return fmts
1947
1948     def _extract_m3u8_formats_and_subtitles(
1949             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1950             preference=None, quality=None, m3u8_id=None, note=None,
1951             errnote=None, fatal=True, live=False, data=None, headers={},
1952             query={}):
1953
1954         if self.get_param('ignore_no_formats_error'):
1955             fatal = False
1956
1957         if not m3u8_url:
1958             if errnote is not False:
1959                 errnote = errnote or 'Failed to obtain m3u8 URL'
1960                 if fatal:
1961                     raise ExtractorError(errnote, video_id=video_id)
1962                 self.report_warning(f'{errnote}{bug_reports_message()}')
1963             return [], {}
1964
1965         res = self._download_webpage_handle(
1966             m3u8_url, video_id,
1967             note='Downloading m3u8 information' if note is None else note,
1968             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1969             fatal=fatal, data=data, headers=headers, query=query)
1970
1971         if res is False:
1972             return [], {}
1973
1974         m3u8_doc, urlh = res
1975         m3u8_url = urlh.url
1976
1977         return self._parse_m3u8_formats_and_subtitles(
1978             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1979             preference=preference, quality=quality, m3u8_id=m3u8_id,
1980             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1981             headers=headers, query=query, video_id=video_id)
1982
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse the text of an m3u8 playlist into (formats, subtitles).

        m3u8_doc is the playlist text and m3u8_url, if given, is used to
        resolve relative URIs found in it.

        A document containing '#EXT-X-TARGETDURATION' is treated as a media
        playlist and returned as is (one format per playlist index).
        Otherwise every EXT-X-MEDIA tag and every URI line following an
        EXT-X-STREAM-INF tag is turned into a format, and EXT-X-MEDIA tags
        of type SUBTITLES that carry a URI are collected as subtitles keyed
        by language ('und' when no LANGUAGE is given).

        ext, entry_protocol, preference and quality are copied into the
        produced formats; m3u8_id prefixes the format ids. When live is
        true, stream names/bitrates are left out of the format ids.
        fatal, data, headers and video_id are only used for the additional
        playlist downloads made when the 'hls_split_discontinuity' param is
        set. note, errnote and query are accepted but not used here.
        """
        formats, subtitles = [], {}
        # Determined once for the whole document and copied into every format
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are kept; anything else is joined with m3u8_url
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # Yields one index more than the number of lines starting
                # with '#EXT-X-DISCONTINUITY'; the playlist is downloaded
                # first when only its URL is given
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # Without splitting there is a single, unnumbered entry per playlist
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            # Without a URL the document itself is embedded as a data URI
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset after
        # the URI line belonging to it has been processed
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            # Only VIDEO and AUDIO renditions that have their own URI become formats
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-comment, non-blank line is a variant playlist URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    # Fall back to an extension based on whether the format has video
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    # An additional plain http format is emitted as a copy of
                    # the HLS one, without manifest_url
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2201
2202     def _extract_m3u8_vod_duration(
2203             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2204
2205         m3u8_vod = self._download_webpage(
2206             m3u8_vod_url, video_id,
2207             note='Downloading m3u8 VOD manifest' if note is None else note,
2208             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2209             fatal=False, data=data, headers=headers, query=query)
2210
2211         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2212
2213     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2214         if '#EXT-X-ENDLIST' not in m3u8_vod:
2215             return None
2216
2217         return int(sum(
2218             float(line[len('#EXTINF:'):].split(',')[0])
2219             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2220
2221     def _extract_mpd_vod_duration(
2222             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2223
2224         mpd_doc = self._download_xml(
2225             mpd_url, video_id,
2226             note='Downloading MPD VOD manifest' if note is None else note,
2227             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2228             fatal=False, data=data, headers=headers, query=query)
2229         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2230             return None
2231         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2232
2233     @staticmethod
2234     def _xpath_ns(path, namespace=None):
2235         if not namespace:
2236             return path
2237         out = []
2238         for c in path.split('/'):
2239             if not c or c == '.':
2240                 out.append(c)
2241             else:
2242                 out.append('{%s}%s' % (namespace, c))
2243         return '/'.join(out)
2244
2245     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2246         if self.get_param('ignore_no_formats_error'):
2247             fatal = False
2248
2249         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2250         if res is False:
2251             assert not fatal
2252             return [], {}
2253         smil, urlh = res
2254
2255         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2256                                                       namespace=self._parse_smil_namespace(smil))
2257
2258     def _extract_smil_formats(self, *args, **kwargs):
2259         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2260         if subs:
2261             self._report_ignoring_subs('SMIL')
2262         return fmts
2263
2264     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2265         res = self._download_smil(smil_url, video_id, fatal=fatal)
2266         if res is False:
2267             return {}
2268
2269         smil, urlh = res
2270         smil_url = urlh.url
2271
2272         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2273
2274     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2275         return self._download_xml_handle(
2276             smil_url, video_id, 'Downloading SMIL file',
2277             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2278
2279     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2280         namespace = self._parse_smil_namespace(smil)
2281
2282         formats, subtitles = self._parse_smil_formats_and_subtitles(
2283             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2284
2285         video_id = os.path.splitext(url_basename(smil_url))[0]
2286         title = None
2287         description = None
2288         upload_date = None
2289         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2290             name = meta.attrib.get('name')
2291             content = meta.attrib.get('content')
2292             if not name or not content:
2293                 continue
2294             if not title and name == 'title':
2295                 title = content
2296             elif not description and name in ('description', 'abstract'):
2297                 description = content
2298             elif not upload_date and name == 'date':
2299                 upload_date = unified_strdate(content)
2300
2301         thumbnails = [{
2302             'id': image.get('type'),
2303             'url': image.get('src'),
2304             'width': int_or_none(image.get('width')),
2305             'height': int_or_none(image.get('height')),
2306         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2307
2308         return {
2309             'id': video_id,
2310             'title': title or video_id,
2311             'description': description,
2312             'upload_date': upload_date,
2313             'thumbnails': thumbnails,
2314             'formats': formats,
2315             'subtitles': subtitles,
2316         }
2317
2318     def _parse_smil_namespace(self, smil):
2319         return self._search_regex(
2320             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2321
2322     def _parse_smil_formats(self, *args, **kwargs):
2323         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2324         if subs:
2325             self._report_ignoring_subs('SMIL')
2326         return fmts
2327
2328     def _parse_smil_formats_and_subtitles(
2329             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2330         base = smil_url
2331         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2332             b = meta.get('base') or meta.get('httpBase')
2333             if b:
2334                 base = b
2335                 break
2336
2337         formats, subtitles = [], {}
2338         rtmp_count = 0
2339         http_count = 0
2340         m3u8_count = 0
2341         imgs_count = 0
2342
2343         srcs = set()
2344         media = itertools.chain.from_iterable(
2345             smil.findall(self._xpath_ns(arg, namespace))
2346             for arg in ['.//video', './/audio', './/media'])
2347         for medium in media:
2348             src = medium.get('src')
2349             if not src or src in srcs:
2350                 continue
2351             srcs.add(src)
2352
2353             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2354             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2355             width = int_or_none(medium.get('width'))
2356             height = int_or_none(medium.get('height'))
2357             proto = medium.get('proto')
2358             ext = medium.get('ext')
2359             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2360                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2361             streamer = medium.get('streamer') or base
2362
2363             if proto == 'rtmp' or streamer.startswith('rtmp'):
2364                 rtmp_count += 1
2365                 formats.append({
2366                     'url': streamer,
2367                     'play_path': src,
2368                     'ext': 'flv',
2369                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2370                     'tbr': bitrate,
2371                     'filesize': filesize,
2372                     'width': width,
2373                     'height': height,
2374                 })
2375                 if transform_rtmp_url:
2376                     streamer, src = transform_rtmp_url(streamer, src)
2377                     formats[-1].update({
2378                         'url': streamer,
2379                         'play_path': src,
2380                     })
2381                 continue
2382
2383             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2384             src_url = src_url.strip()
2385
2386             if proto == 'm3u8' or src_ext == 'm3u8':
2387                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2388                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2389                 self._merge_subtitles(m3u8_subs, target=subtitles)
2390                 if len(m3u8_formats) == 1:
2391                     m3u8_count += 1
2392                     m3u8_formats[0].update({
2393                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2394                         'tbr': bitrate,
2395                         'width': width,
2396                         'height': height,
2397                     })
2398                 formats.extend(m3u8_formats)
2399             elif src_ext == 'f4m':
2400                 f4m_url = src_url
2401                 if not f4m_params:
2402                     f4m_params = {
2403                         'hdcore': '3.2.0',
2404                         'plugin': 'flowplayer-3.2.0.1',
2405                     }
2406                 f4m_url += '&' if '?' in f4m_url else '?'
2407                 f4m_url += urllib.parse.urlencode(f4m_params)
2408                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2409             elif src_ext == 'mpd':
2410                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2411                     src_url, video_id, mpd_id='dash', fatal=False)
2412                 formats.extend(mpd_formats)
2413                 self._merge_subtitles(mpd_subs, target=subtitles)
2414             elif re.search(r'\.ism/[Mm]anifest', src_url):
2415                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2416                     src_url, video_id, ism_id='mss', fatal=False)
2417                 formats.extend(ism_formats)
2418                 self._merge_subtitles(ism_subs, target=subtitles)
2419             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2420                 http_count += 1
2421                 formats.append({
2422                     'url': src_url,
2423                     'ext': ext or src_ext or 'flv',
2424                     'format_id': 'http-%d' % (bitrate or http_count),
2425                     'tbr': bitrate,
2426                     'filesize': filesize,
2427                     'width': width,
2428                     'height': height,
2429                 })
2430
2431         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2432             src = medium.get('src')
2433             if not src or src in srcs:
2434                 continue
2435             srcs.add(src)
2436
2437             imgs_count += 1
2438             formats.append({
2439                 'format_id': 'imagestream-%d' % (imgs_count),
2440                 'url': src,
2441                 'ext': mimetype2ext(medium.get('type')),
2442                 'acodec': 'none',
2443                 'vcodec': 'none',
2444                 'width': int_or_none(medium.get('width')),
2445                 'height': int_or_none(medium.get('height')),
2446                 'format_note': 'SMIL storyboards',
2447             })
2448
2449         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2450         self._merge_subtitles(smil_subs, target=subtitles)
2451
2452         return formats, subtitles
2453
2454     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2455         urls = []
2456         subtitles = {}
2457         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2458             src = textstream.get('src')
2459             if not src or src in urls:
2460                 continue
2461             urls.append(src)
2462             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2463             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2464             subtitles.setdefault(lang, []).append({
2465                 'url': src,
2466                 'ext': ext,
2467             })
2468         return subtitles
2469
2470     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2471         res = self._download_xml_handle(
2472             xspf_url, playlist_id, 'Downloading xpsf playlist',
2473             'Unable to download xspf manifest', fatal=fatal)
2474         if res is False:
2475             return []
2476
2477         xspf, urlh = res
2478         xspf_url = urlh.url
2479
2480         return self._parse_xspf(
2481             xspf, playlist_id, xspf_url=xspf_url,
2482             xspf_base_url=base_url(xspf_url))
2483
2484     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2485         NS_MAP = {
2486             'xspf': 'http://xspf.org/ns/0/',
2487             's1': 'http://static.streamone.nl/player/ns/0',
2488         }
2489
2490         entries = []
2491         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2492             title = xpath_text(
2493                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2494             description = xpath_text(
2495                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2496             thumbnail = xpath_text(
2497                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2498             duration = float_or_none(
2499                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2500
2501             formats = []
2502             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2503                 format_url = urljoin(xspf_base_url, location.text)
2504                 if not format_url:
2505                     continue
2506                 formats.append({
2507                     'url': format_url,
2508                     'manifest_url': xspf_url,
2509                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2510                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2511                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2512                 })
2513
2514             entries.append({
2515                 'id': playlist_id,
2516                 'title': title,
2517                 'description': description,
2518                 'thumbnail': thumbnail,
2519                 'duration': duration,
2520                 'formats': formats,
2521             })
2522         return entries
2523
2524     def _extract_mpd_formats(self, *args, **kwargs):
2525         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2526         if subs:
2527             self._report_ignoring_subs('DASH')
2528         return fmts
2529
2530     def _extract_mpd_formats_and_subtitles(
2531             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2532             fatal=True, data=None, headers={}, query={}):
2533
2534         if self.get_param('ignore_no_formats_error'):
2535             fatal = False
2536
2537         res = self._download_xml_handle(
2538             mpd_url, video_id,
2539             note='Downloading MPD manifest' if note is None else note,
2540             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2541             fatal=fatal, data=data, headers=headers, query=query)
2542         if res is False:
2543             return [], {}
2544         mpd_doc, urlh = res
2545         if mpd_doc is None:
2546             return [], {}
2547
2548         # We could have been redirected to a new url when we retrieved our mpd file.
2549         mpd_url = urlh.url
2550         mpd_base_url = base_url(mpd_url)
2551
2552         return self._parse_mpd_formats_and_subtitles(
2553             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2554
2555     def _parse_mpd_formats(self, *args, **kwargs):
2556         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2557         if subs:
2558             self._report_ignoring_subs('DASH')
2559         return fmts
2560
2561     def _parse_mpd_formats_and_subtitles(
2562             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2563         """
2564         Parse formats from MPD manifest.
2565         References:
2566          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2567             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2568          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2569         """
2570         if not self.get_param('dynamic_mpd', True):
2571             if mpd_doc.get('type') == 'dynamic':
2572                 return [], {}
2573
2574         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2575
2576         def _add_ns(path):
2577             return self._xpath_ns(path, namespace)
2578
2579         def is_drm_protected(element):
2580             return element.find(_add_ns('ContentProtection')) is not None
2581
2582         def extract_multisegment_info(element, ms_parent_info):
2583             ms_info = ms_parent_info.copy()
2584
2585             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2586             # common attributes and elements.  We will only extract relevant
2587             # for us.
2588             def extract_common(source):
2589                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2590                 if segment_timeline is not None:
2591                     s_e = segment_timeline.findall(_add_ns('S'))
2592                     if s_e:
2593                         ms_info['total_number'] = 0
2594                         ms_info['s'] = []
2595                         for s in s_e:
2596                             r = int(s.get('r', 0))
2597                             ms_info['total_number'] += 1 + r
2598                             ms_info['s'].append({
2599                                 't': int(s.get('t', 0)),
2600                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2601                                 'd': int(s.attrib['d']),
2602                                 'r': r,
2603                             })
2604                 start_number = source.get('startNumber')
2605                 if start_number:
2606                     ms_info['start_number'] = int(start_number)
2607                 timescale = source.get('timescale')
2608                 if timescale:
2609                     ms_info['timescale'] = int(timescale)
2610                 segment_duration = source.get('duration')
2611                 if segment_duration:
2612                     ms_info['segment_duration'] = float(segment_duration)
2613
2614             def extract_Initialization(source):
2615                 initialization = source.find(_add_ns('Initialization'))
2616                 if initialization is not None:
2617                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2618
2619             segment_list = element.find(_add_ns('SegmentList'))
2620             if segment_list is not None:
2621                 extract_common(segment_list)
2622                 extract_Initialization(segment_list)
2623                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2624                 if segment_urls_e:
2625                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2626             else:
2627                 segment_template = element.find(_add_ns('SegmentTemplate'))
2628                 if segment_template is not None:
2629                     extract_common(segment_template)
2630                     media = segment_template.get('media')
2631                     if media:
2632                         ms_info['media'] = media
2633                     initialization = segment_template.get('initialization')
2634                     if initialization:
2635                         ms_info['initialization'] = initialization
2636                     else:
2637                         extract_Initialization(segment_template)
2638             return ms_info
2639
2640         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2641         formats, subtitles = [], {}
2642         stream_numbers = collections.defaultdict(int)
2643         for period in mpd_doc.findall(_add_ns('Period')):
2644             period_duration = parse_duration(period.get('duration')) or mpd_duration
2645             period_ms_info = extract_multisegment_info(period, {
2646                 'start_number': 1,
2647                 'timescale': 1,
2648             })
2649             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2650                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2651                 for representation in adaptation_set.findall(_add_ns('Representation')):
2652                     representation_attrib = adaptation_set.attrib.copy()
2653                     representation_attrib.update(representation.attrib)
2654                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2655                     mime_type = representation_attrib['mimeType']
2656                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2657
2658                     codec_str = representation_attrib.get('codecs', '')
2659                     # Some kind of binary subtitle found in some youtube livestreams
2660                     if mime_type == 'application/x-rawcc':
2661                         codecs = {'scodec': codec_str}
2662                     else:
2663                         codecs = parse_codecs(codec_str)
2664                     if content_type not in ('video', 'audio', 'text'):
2665                         if mime_type == 'image/jpeg':
2666                             content_type = mime_type
2667                         elif codecs.get('vcodec', 'none') != 'none':
2668                             content_type = 'video'
2669                         elif codecs.get('acodec', 'none') != 'none':
2670                             content_type = 'audio'
2671                         elif codecs.get('scodec', 'none') != 'none':
2672                             content_type = 'text'
2673                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2674                             content_type = 'text'
2675                         else:
2676                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2677                             continue
2678
2679                     base_url = ''
2680                     for element in (representation, adaptation_set, period, mpd_doc):
2681                         base_url_e = element.find(_add_ns('BaseURL'))
2682                         if try_call(lambda: base_url_e.text) is not None:
2683                             base_url = base_url_e.text + base_url
2684                             if re.match(r'^https?://', base_url):
2685                                 break
2686                     if mpd_base_url and base_url.startswith('/'):
2687                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2688                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2689                         if not mpd_base_url.endswith('/'):
2690                             mpd_base_url += '/'
2691                         base_url = mpd_base_url + base_url
2692                     representation_id = representation_attrib.get('id')
2693                     lang = representation_attrib.get('lang')
2694                     url_el = representation.find(_add_ns('BaseURL'))
2695                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2696                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2697                     if representation_id is not None:
2698                         format_id = representation_id
2699                     else:
2700                         format_id = content_type
2701                     if mpd_id:
2702                         format_id = mpd_id + '-' + format_id
2703                     if content_type in ('video', 'audio'):
2704                         f = {
2705                             'format_id': format_id,
2706                             'manifest_url': mpd_url,
2707                             'ext': mimetype2ext(mime_type),
2708                             'width': int_or_none(representation_attrib.get('width')),
2709                             'height': int_or_none(representation_attrib.get('height')),
2710                             'tbr': float_or_none(bandwidth, 1000),
2711                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2712                             'fps': int_or_none(representation_attrib.get('frameRate')),
2713                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2714                             'format_note': 'DASH %s' % content_type,
2715                             'filesize': filesize,
2716                             'container': mimetype2ext(mime_type) + '_dash',
2717                             **codecs
2718                         }
2719                     elif content_type == 'text':
2720                         f = {
2721                             'ext': mimetype2ext(mime_type),
2722                             'manifest_url': mpd_url,
2723                             'filesize': filesize,
2724                         }
2725                     elif content_type == 'image/jpeg':
2726                         # See test case in VikiIE
2727                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2728                         f = {
2729                             'format_id': format_id,
2730                             'ext': 'mhtml',
2731                             'manifest_url': mpd_url,
2732                             'format_note': 'DASH storyboards (jpeg)',
2733                             'acodec': 'none',
2734                             'vcodec': 'none',
2735                         }
2736                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2737                         f['has_drm'] = True
2738                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2739
2740                     def prepare_template(template_name, identifiers):
2741                         tmpl = representation_ms_info[template_name]
2742                         if representation_id is not None:
2743                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2744                         # First of, % characters outside $...$ templates
2745                         # must be escaped by doubling for proper processing
2746                         # by % operator string formatting used further (see
2747                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2748                         t = ''
2749                         in_template = False
2750                         for c in tmpl:
2751                             t += c
2752                             if c == '$':
2753                                 in_template = not in_template
2754                             elif c == '%' and not in_template:
2755                                 t += c
2756                         # Next, $...$ templates are translated to their
2757                         # %(...) counterparts to be used with % operator
2758                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2759                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2760                         t.replace('$$', '$')
2761                         return t
2762
2763                     # @initialization is a regular template like @media one
2764                     # so it should be handled just the same way (see
2765                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2766                     if 'initialization' in representation_ms_info:
2767                         initialization_template = prepare_template(
2768                             'initialization',
2769                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2770                             # $Time$ shall not be included for @initialization thus
2771                             # only $Bandwidth$ remains
2772                             ('Bandwidth', ))
2773                         representation_ms_info['initialization_url'] = initialization_template % {
2774                             'Bandwidth': bandwidth,
2775                         }
2776
2777                     def location_key(location):
2778                         return 'url' if re.match(r'^https?://', location) else 'path'
2779
2780                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2781
2782                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2783                         media_location_key = location_key(media_template)
2784
2785                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2786                         # can't be used at the same time
2787                         if '%(Number' in media_template and 's' not in representation_ms_info:
2788                             segment_duration = None
2789                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2790                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2791                                 representation_ms_info['total_number'] = int(math.ceil(
2792                                     float_or_none(period_duration, segment_duration, default=0)))
2793                             representation_ms_info['fragments'] = [{
2794                                 media_location_key: media_template % {
2795                                     'Number': segment_number,
2796                                     'Bandwidth': bandwidth,
2797                                 },
2798                                 'duration': segment_duration,
2799                             } for segment_number in range(
2800                                 representation_ms_info['start_number'],
2801                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2802                         else:
2803                             # $Number*$ or $Time$ in media template with S list available
2804                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2805                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2806                             representation_ms_info['fragments'] = []
2807                             segment_time = 0
2808                             segment_d = None
2809                             segment_number = representation_ms_info['start_number']
2810
2811                             def add_segment_url():
2812                                 segment_url = media_template % {
2813                                     'Time': segment_time,
2814                                     'Bandwidth': bandwidth,
2815                                     'Number': segment_number,
2816                                 }
2817                                 representation_ms_info['fragments'].append({
2818                                     media_location_key: segment_url,
2819                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2820                                 })
2821
2822                             for num, s in enumerate(representation_ms_info['s']):
2823                                 segment_time = s.get('t') or segment_time
2824                                 segment_d = s['d']
2825                                 add_segment_url()
2826                                 segment_number += 1
2827                                 for r in range(s.get('r', 0)):
2828                                     segment_time += segment_d
2829                                     add_segment_url()
2830                                     segment_number += 1
2831                                 segment_time += segment_d
2832                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2833                         # No media template,
2834                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2835                         # or any YouTube dashsegments video
2836                         fragments = []
2837                         segment_index = 0
2838                         timescale = representation_ms_info['timescale']
2839                         for s in representation_ms_info['s']:
2840                             duration = float_or_none(s['d'], timescale)
2841                             for r in range(s.get('r', 0) + 1):
2842                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2843                                 fragments.append({
2844                                     location_key(segment_uri): segment_uri,
2845                                     'duration': duration,
2846                                 })
2847                                 segment_index += 1
2848                         representation_ms_info['fragments'] = fragments
2849                     elif 'segment_urls' in representation_ms_info:
2850                         # Segment URLs with no SegmentTimeline
2851                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2852                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2853                         fragments = []
2854                         segment_duration = float_or_none(
2855                             representation_ms_info['segment_duration'],
2856                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2857                         for segment_url in representation_ms_info['segment_urls']:
2858                             fragment = {
2859                                 location_key(segment_url): segment_url,
2860                             }
2861                             if segment_duration:
2862                                 fragment['duration'] = segment_duration
2863                             fragments.append(fragment)
2864                         representation_ms_info['fragments'] = fragments
2865                     # If there is a fragments key available then we correctly recognized fragmented media.
2866                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2867                     # assumption is not necessarily correct since we may simply have no support for
2868                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2869                     if 'fragments' in representation_ms_info:
2870                         f.update({
2871                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2872                             'url': mpd_url or base_url,
2873                             'fragment_base_url': base_url,
2874                             'fragments': [],
2875                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2876                         })
2877                         if 'initialization_url' in representation_ms_info:
2878                             initialization_url = representation_ms_info['initialization_url']
2879                             if not f.get('url'):
2880                                 f['url'] = initialization_url
2881                             f['fragments'].append({location_key(initialization_url): initialization_url})
2882                         f['fragments'].extend(representation_ms_info['fragments'])
2883                         if not period_duration:
2884                             period_duration = try_get(
2885                                 representation_ms_info,
2886                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2887                     else:
2888                         # Assuming direct URL to unfragmented media.
2889                         f['url'] = base_url
2890                     if content_type in ('video', 'audio', 'image/jpeg'):
2891                         f['manifest_stream_number'] = stream_numbers[f['url']]
2892                         stream_numbers[f['url']] += 1
2893                         formats.append(f)
2894                     elif content_type == 'text':
2895                         subtitles.setdefault(lang or 'und', []).append(f)
2896
2897         return formats, subtitles
2898
2899     def _extract_ism_formats(self, *args, **kwargs):
2900         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2901         if subs:
2902             self._report_ignoring_subs('ISM')
2903         return fmts
2904
2905     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2906         if self.get_param('ignore_no_formats_error'):
2907             fatal = False
2908
2909         res = self._download_xml_handle(
2910             ism_url, video_id,
2911             note='Downloading ISM manifest' if note is None else note,
2912             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2913             fatal=fatal, data=data, headers=headers, query=query)
2914         if res is False:
2915             return [], {}
2916         ism_doc, urlh = res
2917         if ism_doc is None:
2918             return [], {}
2919
2920         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2921
2922     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2923         """
2924         Parse formats from ISM manifest.
2925         References:
2926          1. [MS-SSTR]: Smooth Streaming Protocol,
2927             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2928         """
2929         if ism_doc.get('IsLive') == 'TRUE':
2930             return [], {}
2931
2932         duration = int(ism_doc.attrib['Duration'])
2933         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2934
2935         formats = []
2936         subtitles = {}
2937         for stream in ism_doc.findall('StreamIndex'):
2938             stream_type = stream.get('Type')
2939             if stream_type not in ('video', 'audio', 'text'):
2940                 continue
2941             url_pattern = stream.attrib['Url']
2942             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2943             stream_name = stream.get('Name')
2944             stream_language = stream.get('Language', 'und')
2945             for track in stream.findall('QualityLevel'):
2946                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2947                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2948                 # TODO: add support for WVC1 and WMAP
2949                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2950                     self.report_warning('%s is not a supported codec' % fourcc)
2951                     continue
2952                 tbr = int(track.attrib['Bitrate']) // 1000
2953                 # [1] does not mention Width and Height attributes. However,
2954                 # they're often present while MaxWidth and MaxHeight are
2955                 # missing, so should be used as fallbacks
2956                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2957                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2958                 sampling_rate = int_or_none(track.get('SamplingRate'))
2959
2960                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2961                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2962
2963                 fragments = []
2964                 fragment_ctx = {
2965                     'time': 0,
2966                 }
2967                 stream_fragments = stream.findall('c')
2968                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2969                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2970                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2971                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2972                     if not fragment_ctx['duration']:
2973                         try:
2974                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2975                         except IndexError:
2976                             next_fragment_time = duration
2977                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2978                     for _ in range(fragment_repeat):
2979                         fragments.append({
2980                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2981                             'duration': fragment_ctx['duration'] / stream_timescale,
2982                         })
2983                         fragment_ctx['time'] += fragment_ctx['duration']
2984
2985                 if stream_type == 'text':
2986                     subtitles.setdefault(stream_language, []).append({
2987                         'ext': 'ismt',
2988                         'protocol': 'ism',
2989                         'url': ism_url,
2990                         'manifest_url': ism_url,
2991                         'fragments': fragments,
2992                         '_download_params': {
2993                             'stream_type': stream_type,
2994                             'duration': duration,
2995                             'timescale': stream_timescale,
2996                             'fourcc': fourcc,
2997                             'language': stream_language,
2998                             'codec_private_data': track.get('CodecPrivateData'),
2999                         }
3000                     })
3001                 elif stream_type in ('video', 'audio'):
3002                     formats.append({
3003                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3004                         'url': ism_url,
3005                         'manifest_url': ism_url,
3006                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3007                         'width': width,
3008                         'height': height,
3009                         'tbr': tbr,
3010                         'asr': sampling_rate,
3011                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3012                         'acodec': 'none' if stream_type == 'video' else fourcc,
3013                         'protocol': 'ism',
3014                         'fragments': fragments,
3015                         'has_drm': ism_doc.find('Protection') is not None,
3016                         'language': stream_language,
3017                         'audio_channels': int_or_none(track.get('Channels')),
3018                         '_download_params': {
3019                             'stream_type': stream_type,
3020                             'duration': duration,
3021                             'timescale': stream_timescale,
3022                             'width': width or 0,
3023                             'height': height or 0,
3024                             'fourcc': fourcc,
3025                             'language': stream_language,
3026                             'codec_private_data': track.get('CodecPrivateData'),
3027                             'sampling_rate': sampling_rate,
3028                             'channels': int_or_none(track.get('Channels', 2)),
3029                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3030                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3031                         },
3032                     })
3033         return formats, subtitles
3034
3035     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3036         def absolute_url(item_url):
3037             return urljoin(base_url, item_url)
3038
3039         def parse_content_type(content_type):
3040             if not content_type:
3041                 return {}
3042             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3043             if ctr:
3044                 mimetype, codecs = ctr.groups()
3045                 f = parse_codecs(codecs)
3046                 f['ext'] = mimetype2ext(mimetype)
3047                 return f
3048             return {}
3049
3050         def _media_formats(src, cur_media_type, type_info=None):
3051             type_info = type_info or {}
3052             full_url = absolute_url(src)
3053             ext = type_info.get('ext') or determine_ext(full_url)
3054             if ext == 'm3u8':
3055                 is_plain_url = False
3056                 formats = self._extract_m3u8_formats(
3057                     full_url, video_id, ext='mp4',
3058                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3059                     preference=preference, quality=quality, fatal=False)
3060             elif ext == 'mpd':
3061                 is_plain_url = False
3062                 formats = self._extract_mpd_formats(
3063                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3064             else:
3065                 is_plain_url = True
3066                 formats = [{
3067                     'url': full_url,
3068                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3069                     'ext': ext,
3070                 }]
3071             return is_plain_url, formats
3072
3073         entries = []
3074         # amp-video and amp-audio are very similar to their HTML5 counterparts
3075         # so we will include them right here (see
3076         # https://www.ampproject.org/docs/reference/components/amp-video)
3077         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3078         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3079         media_tags = [(media_tag, media_tag_name, media_type, '')
3080                       for media_tag, media_tag_name, media_type
3081                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3082         media_tags.extend(re.findall(
3083             # We only allow video|audio followed by a whitespace or '>'.
3084             # Allowing more characters may end up in significant slow down (see
3085             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3086             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3087             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3088         for media_tag, _, media_type, media_content in media_tags:
3089             media_info = {
3090                 'formats': [],
3091                 'subtitles': {},
3092             }
3093             media_attributes = extract_attributes(media_tag)
3094             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3095             if src:
3096                 f = parse_content_type(media_attributes.get('type'))
3097                 _, formats = _media_formats(src, media_type, f)
3098                 media_info['formats'].extend(formats)
3099             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3100             if media_content:
3101                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3102                     s_attr = extract_attributes(source_tag)
3103                     # data-video-src and data-src are non standard but seen
3104                     # several times in the wild
3105                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3106                     if not src:
3107                         continue
3108                     f = parse_content_type(s_attr.get('type'))
3109                     is_plain_url, formats = _media_formats(src, media_type, f)
3110                     if is_plain_url:
3111                         # width, height, res, label and title attributes are
3112                         # all not standard but seen several times in the wild
3113                         labels = [
3114                             s_attr.get(lbl)
3115                             for lbl in ('label', 'title')
3116                             if str_or_none(s_attr.get(lbl))
3117                         ]
3118                         width = int_or_none(s_attr.get('width'))
3119                         height = (int_or_none(s_attr.get('height'))
3120                                   or int_or_none(s_attr.get('res')))
3121                         if not width or not height:
3122                             for lbl in labels:
3123                                 resolution = parse_resolution(lbl)
3124                                 if not resolution:
3125                                     continue
3126                                 width = width or resolution.get('width')
3127                                 height = height or resolution.get('height')
3128                         for lbl in labels:
3129                             tbr = parse_bitrate(lbl)
3130                             if tbr:
3131                                 break
3132                         else:
3133                             tbr = None
3134                         f.update({
3135                             'width': width,
3136                             'height': height,
3137                             'tbr': tbr,
3138                             'format_id': s_attr.get('label') or s_attr.get('title'),
3139                         })
3140                         f.update(formats[0])
3141                         media_info['formats'].append(f)
3142                     else:
3143                         media_info['formats'].extend(formats)
3144                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3145                     track_attributes = extract_attributes(track_tag)
3146                     kind = track_attributes.get('kind')
3147                     if not kind or kind in ('subtitles', 'captions'):
3148                         src = strip_or_none(track_attributes.get('src'))
3149                         if not src:
3150                             continue
3151                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3152                         media_info['subtitles'].setdefault(lang, []).append({
3153                             'url': absolute_url(src),
3154                         })
3155             for f in media_info['formats']:
3156                 f.setdefault('http_headers', {})['Referer'] = base_url
3157             if media_info['formats'] or media_info['subtitles']:
3158                 entries.append(media_info)
3159         return entries
3160
3161     def _extract_akamai_formats(self, *args, **kwargs):
3162         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3163         if subs:
3164             self._report_ignoring_subs('akamai')
3165         return fmts
3166
3167     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3168         signed = 'hdnea=' in manifest_url
3169         if not signed:
3170             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3171             manifest_url = re.sub(
3172                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3173                 '', manifest_url).strip('?')
3174
3175         formats = []
3176         subtitles = {}
3177
3178         hdcore_sign = 'hdcore=3.7.0'
3179         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3180         hds_host = hosts.get('hds')
3181         if hds_host:
3182             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3183         if 'hdcore=' not in f4m_url:
3184             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3185         f4m_formats = self._extract_f4m_formats(
3186             f4m_url, video_id, f4m_id='hds', fatal=False)
3187         for entry in f4m_formats:
3188             entry.update({'extra_param_to_segment_url': hdcore_sign})
3189         formats.extend(f4m_formats)
3190
3191         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3192         hls_host = hosts.get('hls')
3193         if hls_host:
3194             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3195         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3196             m3u8_url, video_id, 'mp4', 'm3u8_native',
3197             m3u8_id='hls', fatal=False)
3198         formats.extend(m3u8_formats)
3199         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3200
3201         http_host = hosts.get('http')
3202         if http_host and m3u8_formats and not signed:
3203             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3204             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3205             qualities_length = len(qualities)
3206             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3207                 i = 0
3208                 for f in m3u8_formats:
3209                     if f['vcodec'] != 'none':
3210                         for protocol in ('http', 'https'):
3211                             http_f = f.copy()
3212                             del http_f['manifest_url']
3213                             http_url = re.sub(
3214                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3215                             http_f.update({
3216                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3217                                 'url': http_url,
3218                                 'protocol': protocol,
3219                             })
3220                             formats.append(http_f)
3221                         i += 1
3222
3223         return formats, subtitles
3224
3225     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3226         query = urllib.parse.urlparse(url).query
3227         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3228         mobj = re.search(
3229             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3230         url_base = mobj.group('url')
3231         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3232         formats = []
3233
3234         def manifest_url(manifest):
3235             m_url = f'{http_base_url}/{manifest}'
3236             if query:
3237                 m_url += '?%s' % query
3238             return m_url
3239
3240         if 'm3u8' not in skip_protocols:
3241             formats.extend(self._extract_m3u8_formats(
3242                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3243                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3244         if 'f4m' not in skip_protocols:
3245             formats.extend(self._extract_f4m_formats(
3246                 manifest_url('manifest.f4m'),
3247                 video_id, f4m_id='hds', fatal=False))
3248         if 'dash' not in skip_protocols:
3249             formats.extend(self._extract_mpd_formats(
3250                 manifest_url('manifest.mpd'),
3251                 video_id, mpd_id='dash', fatal=False))
3252         if re.search(r'(?:/smil:|\.smil)', url_base):
3253             if 'smil' not in skip_protocols:
3254                 rtmp_formats = self._extract_smil_formats(
3255                     manifest_url('jwplayer.smil'),
3256                     video_id, fatal=False)
3257                 for rtmp_format in rtmp_formats:
3258                     rtsp_format = rtmp_format.copy()
3259                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3260                     del rtsp_format['play_path']
3261                     del rtsp_format['ext']
3262                     rtsp_format.update({
3263                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3264                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3265                         'protocol': 'rtsp',
3266                     })
3267                     formats.extend([rtmp_format, rtsp_format])
3268         else:
3269             for protocol in ('rtmp', 'rtsp'):
3270                 if protocol not in skip_protocols:
3271                     formats.append({
3272                         'url': f'{protocol}:{url_base}',
3273                         'format_id': protocol,
3274                         'protocol': protocol,
3275                     })
3276         return formats
3277
3278     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3279         mobj = re.search(
3280             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3281             webpage)
3282         if mobj:
3283             try:
3284                 jwplayer_data = self._parse_json(mobj.group('options'),
3285                                                  video_id=video_id,
3286                                                  transform_source=transform_source)
3287             except ExtractorError:
3288                 pass
3289             else:
3290                 if isinstance(jwplayer_data, dict):
3291                     return jwplayer_data
3292
3293     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3294         jwplayer_data = self._find_jwplayer_data(
3295             webpage, video_id, transform_source=js_to_json)
3296         return self._parse_jwplayer_data(
3297             jwplayer_data, video_id, *args, **kwargs)
3298
3299     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3300                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3301         entries = []
3302         if not isinstance(jwplayer_data, dict):
3303             return entries
3304
3305         playlist_items = jwplayer_data.get('playlist')
3306         # JWPlayer backward compatibility: single playlist item/flattened playlists
3307         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3308         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3309         if not isinstance(playlist_items, list):
3310             playlist_items = (playlist_items or jwplayer_data, )
3311
3312         for video_data in playlist_items:
3313             if not isinstance(video_data, dict):
3314                 continue
3315             # JWPlayer backward compatibility: flattened sources
3316             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3317             if 'sources' not in video_data:
3318                 video_data['sources'] = [video_data]
3319
3320             this_video_id = video_id or video_data['mediaid']
3321
3322             formats = self._parse_jwplayer_formats(
3323                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3324                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3325
3326             subtitles = {}
3327             tracks = video_data.get('tracks')
3328             if tracks and isinstance(tracks, list):
3329                 for track in tracks:
3330                     if not isinstance(track, dict):
3331                         continue
3332                     track_kind = track.get('kind')
3333                     if not track_kind or not isinstance(track_kind, str):
3334                         continue
3335                     if track_kind.lower() not in ('captions', 'subtitles'):
3336                         continue
3337                     track_url = urljoin(base_url, track.get('file'))
3338                     if not track_url:
3339                         continue
3340                     subtitles.setdefault(track.get('label') or 'en', []).append({
3341                         'url': self._proto_relative_url(track_url)
3342                     })
3343
3344             entry = {
3345                 'id': this_video_id,
3346                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3347                 'description': clean_html(video_data.get('description')),
3348                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3349                 'timestamp': int_or_none(video_data.get('pubdate')),
3350                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3351                 'subtitles': subtitles,
3352                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3353                 'genre': clean_html(video_data.get('genre')),
3354                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3355                 'season_number': int_or_none(video_data.get('season')),
3356                 'episode_number': int_or_none(video_data.get('episode')),
3357                 'release_year': int_or_none(video_data.get('releasedate')),
3358                 'age_limit': int_or_none(video_data.get('age_restriction')),
3359             }
3360             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3361             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3362                 entry.update({
3363                     '_type': 'url_transparent',
3364                     'url': formats[0]['url'],
3365                 })
3366             else:
3367                 entry['formats'] = formats
3368             entries.append(entry)
3369         if len(entries) == 1:
3370             return entries[0]
3371         else:
3372             return self.playlist_result(entries)
3373
3374     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3375                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3376         urls = set()
3377         formats = []
3378         for source in jwplayer_sources_data:
3379             if not isinstance(source, dict):
3380                 continue
3381             source_url = urljoin(
3382                 base_url, self._proto_relative_url(source.get('file')))
3383             if not source_url or source_url in urls:
3384                 continue
3385             urls.add(source_url)
3386             source_type = source.get('type') or ''
3387             ext = mimetype2ext(source_type) or determine_ext(source_url)
3388             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3389                 formats.extend(self._extract_m3u8_formats(
3390                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3391                     m3u8_id=m3u8_id, fatal=False))
3392             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3393                 formats.extend(self._extract_mpd_formats(
3394                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3395             elif ext == 'smil':
3396                 formats.extend(self._extract_smil_formats(
3397                     source_url, video_id, fatal=False))
3398             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3399             elif source_type.startswith('audio') or ext in (
3400                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3401                 formats.append({
3402                     'url': source_url,
3403                     'vcodec': 'none',
3404                     'ext': ext,
3405                 })
3406             else:
3407                 format_id = str_or_none(source.get('label'))
3408                 height = int_or_none(source.get('height'))
3409                 if height is None and format_id:
3410                     # Often no height is provided but there is a label in
3411                     # format like "1080p", "720p SD", or 1080.
3412                     height = parse_resolution(format_id).get('height')
3413                 a_format = {
3414                     'url': source_url,
3415                     'width': int_or_none(source.get('width')),
3416                     'height': height,
3417                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3418                     'filesize': int_or_none(source.get('filesize')),
3419                     'ext': ext,
3420                     'format_id': format_id
3421                 }
3422                 if source_url.startswith('rtmp'):
3423                     a_format['ext'] = 'flv'
3424                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3425                     # of jwplayer.flash.swf
3426                     rtmp_url_parts = re.split(
3427                         r'((?:mp4|mp3|flv):)', source_url, 1)
3428                     if len(rtmp_url_parts) == 3:
3429                         rtmp_url, prefix, play_path = rtmp_url_parts
3430                         a_format.update({
3431                             'url': rtmp_url,
3432                             'play_path': prefix + play_path,
3433                         })
3434                     if rtmp_params:
3435                         a_format.update(rtmp_params)
3436                 formats.append(a_format)
3437         return formats
3438
3439     def _live_title(self, name):
3440         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3441         return name
3442
3443     def _int(self, v, name, fatal=False, **kwargs):
3444         res = int_or_none(v, **kwargs)
3445         if res is None:
3446             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3447             if fatal:
3448                 raise ExtractorError(msg)
3449             else:
3450                 self.report_warning(msg)
3451         return res
3452
3453     def _float(self, v, name, fatal=False, **kwargs):
3454         res = float_or_none(v, **kwargs)
3455         if res is None:
3456             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3457             if fatal:
3458                 raise ExtractorError(msg)
3459             else:
3460                 self.report_warning(msg)
3461         return res
3462
3463     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3464                     path='/', secure=False, discard=False, rest={}, **kwargs):
3465         cookie = http.cookiejar.Cookie(
3466             0, name, value, port, port is not None, domain, True,
3467             domain.startswith('.'), path, True, secure, expire_time,
3468             discard, None, None, rest)
3469         self.cookiejar.set_cookie(cookie)
3470
3471     def _get_cookies(self, url):
3472         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3473         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3474
3475     def _apply_first_set_cookie_header(self, url_handle, cookie):
3476         """
3477         Apply first Set-Cookie header instead of the last. Experimental.
3478
3479         Some sites (e.g. [1-3]) may serve two cookies under the same name
3480         in Set-Cookie header and expect the first (old) one to be set rather
3481         than second (new). However, as of RFC6265 the newer one cookie
3482         should be set into cookie store what actually happens.
3483         We will workaround this issue by resetting the cookie to
3484         the first one manually.
3485         1. https://new.vk.com/
3486         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3487         3. https://learning.oreilly.com/
3488         """
3489         for header, cookies in url_handle.headers.items():
3490             if header.lower() != 'set-cookie':
3491                 continue
3492             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3493             cookie_value = re.search(
3494                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3495             if cookie_value:
3496                 value, domain = cookie_value.groups()
3497                 self._set_cookie(domain, cookie, value)
3498                 break
3499
3500     @classmethod
3501     def get_testcases(cls, include_onlymatching=False):
3502         # Do not look in super classes
3503         t = vars(cls).get('_TEST')
3504         if t:
3505             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3506             tests = [t]
3507         else:
3508             tests = vars(cls).get('_TESTS', [])
3509         for t in tests:
3510             if not include_onlymatching and t.get('only_matching', False):
3511                 continue
3512             t['name'] = cls.ie_key()
3513             yield t
3514         if getattr(cls, '__wrapped__', None):
3515             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3516
3517     @classmethod
3518     def get_webpage_testcases(cls):
3519         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3520         for t in tests:
3521             t['name'] = cls.ie_key()
3522             yield t
3523         if getattr(cls, '__wrapped__', None):
3524             yield from cls.__wrapped__.get_webpage_testcases()
3525
3526     @classproperty(cache=True)
3527     def age_limit(cls):
3528         """Get age limit from the testcases"""
3529         return max(traverse_obj(
3530             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3531             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3532
3533     @classproperty(cache=True)
3534     def _RETURN_TYPE(cls):
3535         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3536         tests = tuple(cls.get_testcases(include_onlymatching=False))
3537         if not tests:
3538             return None
3539         elif not any(k.startswith('playlist') for test in tests for k in test):
3540             return 'video'
3541         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3542             return 'playlist'
3543         return 'any'
3544
3545     @classmethod
3546     def is_single_video(cls, url):
3547         """Returns whether the URL is of a single video, None if unknown"""
3548         if cls.suitable(url):
3549             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3550
3551     @classmethod
3552     def is_suitable(cls, age_limit):
3553         """Test whether the extractor is generally suitable for the given age limit"""
3554         return not age_restricted(cls.age_limit, age_limit)
3555
3556     @classmethod
3557     def description(cls, *, markdown=True, search_examples=None):
3558         """Description of the extractor"""
3559         desc = ''
3560         if cls._NETRC_MACHINE:
3561             if markdown:
3562                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3563             else:
3564                 desc += f' [{cls._NETRC_MACHINE}]'
3565         if cls.IE_DESC is False:
3566             desc += ' [HIDDEN]'
3567         elif cls.IE_DESC:
3568             desc += f' {cls.IE_DESC}'
3569         if cls.SEARCH_KEY:
3570             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3571             if search_examples:
3572                 _COUNTS = ('', '5', '10', 'all')
3573                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3574         if not cls.working():
3575             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3576
3577         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3578         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3579         return f'{name}:{desc}' if desc else name
3580
3581     def extract_subtitles(self, *args, **kwargs):
3582         if (self.get_param('writesubtitles', False)
3583                 or self.get_param('listsubtitles')):
3584             return self._get_subtitles(*args, **kwargs)
3585         return {}
3586
3587     def _get_subtitles(self, *args, **kwargs):
3588         raise NotImplementedError('This method must be implemented by subclasses')
3589
    class CommentsDisabled(Exception):
        """Raise in _get_comments if comments are disabled for the video

        The extractor function returned by extract_comments catches it and
        reports both 'comments' and 'comment_count' as None."""
3592
3593     def extract_comments(self, *args, **kwargs):
3594         if not self.get_param('getcomments'):
3595             return None
3596         generator = self._get_comments(*args, **kwargs)
3597
3598         def extractor():
3599             comments = []
3600             interrupted = True
3601             try:
3602                 while True:
3603                     comments.append(next(generator))
3604             except StopIteration:
3605                 interrupted = False
3606             except KeyboardInterrupt:
3607                 self.to_screen('Interrupted by user')
3608             except self.CommentsDisabled:
3609                 return {'comments': None, 'comment_count': None}
3610             except Exception as e:
3611                 if self.get_param('ignoreerrors') is not True:
3612                     raise
3613                 self._downloader.report_error(e)
3614             comment_count = len(comments)
3615             self.to_screen(f'Extracted {comment_count} comments')
3616             return {
3617                 'comments': comments,
3618                 'comment_count': None if interrupted else comment_count
3619             }
3620         return extractor
3621
3622     def _get_comments(self, *args, **kwargs):
3623         raise NotImplementedError('This method must be implemented by subclasses')
3624
3625     @staticmethod
3626     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3627         """ Merge subtitle items for one language. Items with duplicated URLs/data
3628         will be dropped. """
3629         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3630         ret = list(subtitle_list1)
3631         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3632         return ret
3633
3634     @classmethod
3635     def _merge_subtitles(cls, *dicts, target=None):
3636         """ Merge subtitle dictionaries, language by language. """
3637         if target is None:
3638             target = {}
3639         for d in dicts:
3640             for lang, subs in d.items():
3641                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3642         return target
3643
3644     def extract_automatic_captions(self, *args, **kwargs):
3645         if (self.get_param('writeautomaticsub', False)
3646                 or self.get_param('listsubtitles')):
3647             return self._get_automatic_captions(*args, **kwargs)
3648         return {}
3649
3650     def _get_automatic_captions(self, *args, **kwargs):
3651         raise NotImplementedError('This method must be implemented by subclasses')
3652
3653     @functools.cached_property
3654     def _cookies_passed(self):
3655         """Whether cookies have been passed to YoutubeDL"""
3656         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3657
3658     def mark_watched(self, *args, **kwargs):
3659         if not self.get_param('mark_watched', False):
3660             return
3661         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3662             self._mark_watched(*args, **kwargs)
3663
3664     def _mark_watched(self, *args, **kwargs):
3665         raise NotImplementedError('This method must be implemented by subclasses')
3666
3667     def geo_verification_headers(self):
3668         headers = {}
3669         geo_verification_proxy = self.get_param('geo_verification_proxy')
3670         if geo_verification_proxy:
3671             headers['Ytdl-request-proxy'] = geo_verification_proxy
3672         return headers
3673
3674     @staticmethod
3675     def _generic_id(url):
3676         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3677
3678     def _generic_title(self, url='', webpage='', *, default=None):
3679         return (self._og_search_title(webpage, default=None)
3680                 or self._html_extract_title(webpage, default=None)
3681                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3682                 or default)
3683
3684     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3685         if not duration:
3686             return
3687         chapter_list = [{
3688             'start_time': start_function(chapter),
3689             'title': title_function(chapter),
3690         } for chapter in chapter_list or []]
3691         if strict:
3692             warn = self.report_warning
3693         else:
3694             warn = self.write_debug
3695             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3696
3697         chapters = [{'start_time': 0}]
3698         for idx, chapter in enumerate(chapter_list):
3699             if chapter['start_time'] is None:
3700                 warn(f'Incomplete chapter {idx}')
3701             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3702                 chapters.append(chapter)
3703             elif chapter not in chapters:
3704                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3705                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3706                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3707         return chapters[1:]
3708
3709     def _extract_chapters_from_description(self, description, duration):
3710         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3711         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3712         return self._extract_chapters_helper(
3713             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3714             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3715             duration=duration, strict=False) or self._extract_chapters_helper(
3716             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3717             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3718             duration=duration, strict=False)
3719
3720     @staticmethod
3721     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3722         all_known = all(map(
3723             lambda x: x is not None,
3724             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3725         return (
3726             'private' if is_private
3727             else 'premium_only' if needs_premium
3728             else 'subscriber_only' if needs_subscription
3729             else 'needs_auth' if needs_auth
3730             else 'unlisted' if is_unlisted
3731             else 'public' if all_known
3732             else None)
3733
3734     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3735         '''
3736         @returns            A list of values for the extractor argument given by "key"
3737                             or "default" if no such key is present
3738         @param default      The default value to return when the key is not present (default: [])
3739         @param casesense    When false, the values are converted to lower case
3740         '''
3741         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3742         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3743         if val is None:
3744             return [] if default is NO_DEFAULT else default
3745         return list(val) if casesense else [x.lower() for x in val]
3746
3747     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3748         if not playlist_id or not video_id:
3749             return not video_id
3750
3751         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3752         if no_playlist is not None:
3753             return not no_playlist
3754
3755         video_id = '' if video_id is True else f' {video_id}'
3756         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3757         if self.get_param('noplaylist'):
3758             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3759             return False
3760         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3761         return True
3762
3763     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3764         RetryManager.report_retry(
3765             err, _count or int(fatal), _retries,
3766             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3767             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3768
3769     def RetryManager(self, **kwargs):
3770         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3771
3772     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3773         display_id = traverse_obj(info_dict, 'display_id', 'id')
3774         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3775         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3776             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3777
3778     @classmethod
3779     def extract_from_webpage(cls, ydl, url, webpage):
3780         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3781               else ydl.get_info_extractor(cls.ie_key()))
3782         for info in ie._extract_from_webpage(url, webpage) or []:
3783             # url = None since we do not want to set (webpage/original)_url
3784             ydl.add_default_extra_info(info, ie, None)
3785             yield info
3786
3787     @classmethod
3788     def _extract_from_webpage(cls, url, webpage):
3789         for embed_url in orderedSet(
3790                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3791             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3792
3793     @classmethod
3794     def _extract_embed_urls(cls, url, webpage):
3795         """@returns all the embed urls on the webpage"""
3796         if '_EMBED_URL_RE' not in cls.__dict__:
3797             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3798             for idx, regex in enumerate(cls._EMBED_REGEX):
3799                 assert regex.count('(?P<url>') == 1, \
3800                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3801             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3802
3803         for regex in cls._EMBED_URL_RE:
3804             for mobj in regex.finditer(webpage):
3805                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3806                 if cls._VALID_URL is False or cls.suitable(embed_url):
3807                     yield embed_url
3808
    class StopExtraction(Exception):
        # NOTE(review): nothing in this part of the file raises or catches this;
        # presumably a control-flow signal for extractors - confirm against its users
        pass
3811
3812     @classmethod
3813     def _extract_url(cls, webpage):  # TODO: Remove
3814         """Only for compatibility with some older extractors"""
3815         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3816
    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """
        Subclass hook; when "plugin_name" is given, register cls as an override
        of the class that follows it in the MRO.

        @param plugin_name  When falsy, nothing is done besides delegating to
                            the parent hook
        @param kwargs       Forwarded to super().__init_subclass__()
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class right after cls in the MRO is the one being overridden
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            # ie_key is taken from the overridden class; IE_NAME gets the plugin name appended
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the __wrapped__ chain to the first class without a truthy __wrapped__
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind that class's name in its defining module to cls and record the override
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)
3830
3831
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Queries have the form "<_SEARCH_KEY><prefix>:<query>", where <prefix> is
    empty (only the first result), a positive integer N (the first N results)
    or "all" (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS.
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split "query" into prefix and search terms and fetch the requested number of results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        count = int(prefix)
        if count <= 0:
            raise ExtractorError(f'invalid download number {count} for query "{query}"')
        if count > self._MAX_RESULTS:
            # Cap the request at what the site can deliver
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
            count = self._MAX_RESULTS
        return self._get_n_results(query, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice() needs None rather than infinity to mean "no limit"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        reason = 'This method must be implemented by subclasses'
        raise NotImplementedError(reason)

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3875
3876
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose _real_extract() always raises UnsupportedError for the given URL"""
    _VALID_URL = '.*'  # this pattern matches any URL
    # NOTE(review): presumably keeps this extractor out of the default set - confirm
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        raise UnsupportedError(url)
3884
3885
# Maps an overridden extractor class to the plugin classes registered for it
# by InfoExtractor.__init_subclass__, in registration order
_PLUGIN_OVERRIDES = collections.defaultdict(list)