yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.error
  21 import urllib.parse
  22 import urllib.request
  23 import xml.etree.ElementTree
  24
  25 from ..compat import functools  # isort: split
  26 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  27 from ..cookies import LenientSimpleCookie
  28 from ..downloader.f4m import get_base_url, remove_encrypted_media
  29 from ..utils import (
  30     IDENTITY,
  31     JSON_LD_RE,
  32     NO_DEFAULT,
  33     ExtractorError,
  34     FormatSorter,
  35     GeoRestrictedError,
  36     GeoUtils,
  37     HEADRequest,
  38     LenientJSONDecoder,
  39     Popen,
  40     RegexNotFoundError,
  41     RetryManager,
  42     UnsupportedError,
  43     age_restricted,
  44     base_url,
  45     bug_reports_message,
  46     classproperty,
  47     clean_html,
  48     deprecation_warning,
  49     determine_ext,
  50     dict_get,
  51     encode_data_uri,
  52     error_to_compat_str,
  53     extract_attributes,
  54     filter_dict,
  55     fix_xml_ampersands,
  56     float_or_none,
  57     format_field,
  58     int_or_none,
  59     join_nonempty,
  60     js_to_json,
  61     mimetype2ext,
  62     netrc_from_content,
  63     network_exceptions,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     sanitize_filename,
  72     sanitize_url,
  73     sanitized_Request,
  74     smuggle_url,
  75     str_or_none,
  76     str_to_int,
  77     strip_or_none,
  78     traverse_obj,
  79     truncate_string,
  80     try_call,
  81     try_get,
  82     unescapeHTML,
  83     unified_strdate,
  84     unified_timestamp,
  85     update_Request,
  86     update_url_query,
  87     url_basename,
  88     url_or_none,
  89     urlhandle_detect_ext,
  90     urljoin,
  91     variadic,
  92     xpath_element,
  93     xpath_text,
  94     xpath_with_ns,
  95 )
  96
  97
  98 class InfoExtractor:
  99     """Information Extractor class.
 100
 101     Information extractors are the classes that, given a URL, extract
 102     information about the video (or videos) the URL refers to. This
 103     information includes the real video URL, the video title, author and
 104     others. The information is stored in a dictionary which is then
 105     passed to the YoutubeDL. The YoutubeDL processes this
 106     information possibly downloading the video to the file system, among
 107     other possible outcomes.
 108
 109     The type field determines the type of the result.
 110     By far the most common value (and the default if _type is missing) is
 111     "video", which indicates a single video.
 112
 113     For a video, the dictionaries must include the following fields:
 114
 115     id:             Video identifier.
 116     title:          Video title, unescaped. Set to an empty string if video has
 117                     no title as opposed to "None" which signifies that the
 118                     extractor failed to obtain a title
 119
 120     Additionally, it must contain either a formats entry or a url one:
 121
 122     formats:        A list of dictionaries for each format available, ordered
 123                     from worst to best quality.
 124
 125                     Potential fields:
 126                     * url        The mandatory URL representing the media:
 127                                    for plain file media - HTTP URL of this file,
 128                                    for RTMP - RTMP URL,
 129                                    for HLS - URL of the M3U8 media playlist,
 130                                    for HDS - URL of the F4M manifest,
 131                                    for DASH
 132                                      - HTTP URL to plain file media (in case of
 133                                        unfragmented media)
 134                                      - URL of the MPD manifest or base URL
 135                                        representing the media if MPD manifest
 136                                        is parsed from a string (in case of
 137                                        fragmented media)
 138                                    for MSS - URL of the ISM manifest.
 139                     * request_data  Data to send in POST request to the URL
 140                     * manifest_url
 141                                  The URL of the manifest file in case of
 142                                  fragmented media:
 143                                    for HLS - URL of the M3U8 master playlist,
 144                                    for HDS - URL of the F4M manifest,
 145                                    for DASH - URL of the MPD manifest,
 146                                    for MSS - URL of the ISM manifest.
 147                     * manifest_stream_number  (For internal use only)
 148                                  The index of the stream in the manifest file
 149                     * ext        Will be calculated from URL if missing
 150                     * format     A human-readable description of the format
 151                                  ("mp4 container with h264/opus").
 152                                  Calculated from the format_id, width, height,
 153                                  and format_note fields if missing.
 154                     * format_id  A short description of the format
 155                                  ("mp4_h264_opus" or "19").
 156                                 Technically optional, but strongly recommended.
 157                     * format_note Additional info about the format
 158                                  ("3D" or "DASH video")
 159                     * width      Width of the video, if known
 160                     * height     Height of the video, if known
 161                     * aspect_ratio  Aspect ratio of the video, if known
 162                                  Automatically calculated from width and height
 163                     * resolution Textual description of width and height
 164                                  Automatically calculated from width and height
 165                     * dynamic_range The dynamic range of the video. One of:
 166                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 167                     * tbr        Average bitrate of audio and video in KBit/s
 168                     * abr        Average audio bitrate in KBit/s
 169                     * acodec     Name of the audio codec in use
 170                     * asr        Audio sampling rate in Hertz
 171                     * audio_channels  Number of audio channels
 172                     * vbr        Average video bitrate in KBit/s
 173                     * fps        Frame rate
 174                     * vcodec     Name of the video codec in use
 175                     * container  Name of the container format
 176                     * filesize   The number of bytes, if known in advance
 177                     * filesize_approx  An estimate for the number of bytes
 178                     * player_url SWF Player URL (used for rtmpdump).
 179                     * protocol   The protocol that will be used for the actual
 180                                  download, lower-case. One of "http", "https" or
 181                                  one of the protocols defined in downloader.PROTOCOL_MAP
 182                     * fragment_base_url
 183                                  Base URL for fragments. Each fragment's path
 184                                  value (if present) will be relative to
 185                                  this URL.
 186                     * fragments  A list of fragments of a fragmented media.
 187                                  Each fragment entry must contain either an url
 188                                  or a path. If an url is present it should be
 189                                  considered by a client. Otherwise both path and
 190                                  fragment_base_url must be present. Here is
 191                                  the list of all potential fields:
 192                                  * "url" - fragment's URL
 193                                  * "path" - fragment's path relative to
 194                                             fragment_base_url
 195                                  * "duration" (optional, int or float)
 196                                  * "filesize" (optional, int)
 197                     * is_from_start  Is a live format that can be downloaded
 198                                 from the start. Boolean
 199                     * preference Order number of this format. If this field is
 200                                  present and not None, the formats get sorted
 201                                  by this field, regardless of all other values.
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                                  < -1000 to hide the format (if there is
 205                                     another one which is strictly better)
 206                     * language   Language code, e.g. "de" or "en-US".
 207                     * language_preference  Is this in the language mentioned in
 208                                  the URL?
 209                                  10 if it's what the URL is about,
 210                                  -1 for default (don't know),
 211                                  -10 otherwise, other values reserved for now.
 212                     * quality    Order number of the video quality of this
 213                                  format, irrespective of the file format.
 214                                  -1 for default (order by other properties),
 215                                  -2 or smaller for less than default.
 216                     * source_preference  Order number for this video source
 217                                   (quality takes higher priority)
 218                                  -1 for default (order by other properties),
 219                                  -2 or smaller for less than default.
 220                     * http_headers  A dictionary of additional HTTP headers
 221                                  to add to the request.
 222                     * stretched_ratio  If given and not 1, indicates that the
 223                                  video's pixels are not square.
 224                                  width : height ratio as float.
 225                     * no_resume  The server does not support resuming the
 226                                  (HTTP or RTMP) download. Boolean.
 227                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 228                     * extra_param_to_segment_url  A query string to append to each
 229                                  fragment's URL, or to update each existing query string
 230                                  with. Only applied by the native HLS/DASH downloaders.
 231                     * hls_aes    A dictionary of HLS AES-128 decryption information
 232                                  used by the native HLS downloader to override the
 233                                  values in the media playlist when an '#EXT-X-KEY' tag
 234                                  is present in the playlist:
 235                                  * uri  The URI from which the key will be downloaded
 236                                  * key  The key (as hex) used to decrypt fragments.
 237                                         If `key` is given, any key URI will be ignored
 238                                  * iv   The IV (as hex) used to decrypt fragments
 239                     * downloader_options  A dictionary of downloader options
 240                                  (For internal use only)
 241                                  * http_chunk_size Chunk size for HTTP downloads
 242                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 243                     RTMP formats can also have the additional fields: page_url,
 244                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 245                     rtmp_protocol, rtmp_real_time
 246
 247     url:            Final video URL.
 248     ext:            Video filename extension.
 249     format:         The video format, defaults to ext (used for --get-format)
 250     player_url:     SWF Player URL (used for rtmpdump).
 251
 252     The following fields are optional:
 253
 254     direct:         True if a direct video file was given (must only be set by GenericIE)
 255     alt_title:      A secondary title of the video.
 256     display_id:     An alternative identifier for the video, not necessarily
 257                     unique, but available before title. Typically, id is
 258                     something like "4234987", title "Dancing naked mole rats",
 259                     and display_id "dancing-naked-mole-rats"
 260     thumbnails:     A list of dictionaries, with the following entries:
 261                         * "id" (optional, string) - Thumbnail format ID
 262                         * "url"
 263                         * "preference" (optional, int) - quality of the image
 264                         * "width" (optional, int)
 265                         * "height" (optional, int)
 266                         * "resolution" (optional, string "{width}x{height}",
 267                                         deprecated)
 268                         * "filesize" (optional, int)
 269                         * "http_headers" (dict) - HTTP headers for the request
 270     thumbnail:      Full URL to a video thumbnail image.
 271     description:    Full video description.
 272     uploader:       Full name of the video uploader.
 273     license:        License name the video is licensed under.
 274     creator:        The creator of the video.
 275     timestamp:      UNIX timestamp of the moment the video was uploaded
 276     upload_date:    Video upload date in UTC (YYYYMMDD).
 277                     If not explicitly set, calculated from timestamp
 278     release_timestamp: UNIX timestamp of the moment the video was released.
 279                     If it is not clear whether to use timestamp or this, use the former
 280     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 281                     If not explicitly set, calculated from release_timestamp
 282     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 283     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 284                     If not explicitly set, calculated from modified_timestamp
 285     uploader_id:    Nickname or id of the video uploader.
 286     uploader_url:   Full URL to a personal webpage of the video uploader.
 287     channel:        Full name of the channel the video is uploaded on.
 288                     Note that channel fields may or may not repeat uploader
 289                     fields. This depends on a particular extractor.
 290     channel_id:     Id of the channel.
 291     channel_url:    Full URL to a channel webpage.
 292     channel_follower_count: Number of followers of the channel.
 293     channel_is_verified: Whether the channel is verified on the platform.
 294     location:       Physical location where the video was filmed.
 295     subtitles:      The available subtitles as a dictionary in the format
 296                     {tag: subformats}. "tag" is usually a language code, and
 297                     "subformats" is a list sorted from lower to higher
 298                     preference, each element is a dictionary with the "ext"
 299                     entry and one of:
 300                         * "data": The subtitles file contents
 301                         * "url": A URL pointing to the subtitles file
 302                     It can optionally also have:
 303                         * "name": Name or description of the subtitles
 304                         * "http_headers": A dictionary of additional HTTP headers
 305                                   to add to the request.
 306                     "ext" will be calculated from URL if missing
 307     automatic_captions: Like 'subtitles'; contains automatically generated
 308                     captions instead of normal subtitles
 309     duration:       Length of the video in seconds, as an integer or float.
 310     view_count:     How many users have watched the video on the platform.
 311     concurrent_view_count: How many users are currently watching the video on the platform.
 312     like_count:     Number of positive ratings of the video
 313     dislike_count:  Number of negative ratings of the video
 314     repost_count:   Number of reposts of the video
 315     average_rating: Average rating given by users, the scale used depends on the webpage
 316     comment_count:  Number of comments on the video
 317     comments:       A list of comments, each with one or more of the following
 318                     properties (all but one of text or html optional):
 319                         * "author" - human-readable name of the comment author
 320                         * "author_id" - user ID of the comment author
 321                         * "author_thumbnail" - The thumbnail of the comment author
 322                         * "author_url" - The url to the comment author's page
 323                         * "author_is_verified" - Whether the author is verified
 324                                                  on the platform
 325                         * "author_is_uploader" - Whether the comment is made by
 326                                                  the video uploader
 327                         * "id" - Comment ID
 328                         * "html" - Comment as HTML
 329                         * "text" - Plain text of the comment
 330                         * "timestamp" - UNIX timestamp of comment
 331                         * "parent" - ID of the comment this one is replying to.
 332                                      Set to "root" to indicate that this is a
 333                                      comment to the original video.
 334                         * "like_count" - Number of positive ratings of the comment
 335                         * "dislike_count" - Number of negative ratings of the comment
 336                         * "is_favorited" - Whether the comment is marked as
 337                                            favorite by the video uploader
 338                         * "is_pinned" - Whether the comment is pinned to
 339                                         the top of the comments
 340     age_limit:      Age restriction for the video, as an integer (years)
 341     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 342                     should allow to get the same result again. (It will be set
 343                     by YoutubeDL if it's missing)
 344     categories:     A list of categories that the video falls in, for example
 345                     ["Sports", "Berlin"]
 346     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 347     cast:           A list of the video cast
 348     is_live:        True, False, or None (=unknown). Whether this video is a
 349                     live stream that goes on instead of a fixed-length video.
 350     was_live:       True, False, or None (=unknown). Whether this video was
 351                     originally a live stream.
 352     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 353                     or 'post_live' (was live, but VOD is not yet processed)
 354                     If absent, automatically set from is_live, was_live
 355     start_time:     Time in seconds where the reproduction should start, as
 356                     specified in the URL.
 357     end_time:       Time in seconds where the reproduction should end, as
 358                     specified in the URL.
 359     chapters:       A list of dictionaries, with the following entries:
 360                         * "start_time" - The start time of the chapter in seconds
 361                         * "end_time" - The end time of the chapter in seconds
 362                         * "title" (optional, string)
 363     heatmap:        A list of dictionaries, with the following entries:
 364                         * "start_time" - The start time of the data point in seconds
 365                         * "end_time" - The end time of the data point in seconds
 366                         * "value" - The normalized value of the data point (float between 0 and 1)
 367     playable_in_embed: Whether this video is allowed to play in embedded
 368                     players on other sites. Can be True (=always allowed),
 369                     False (=never allowed), None (=unknown), or a string
 370                     specifying the criteria for embedability; e.g. 'whitelist'
 371     availability:   Under what condition the video is available. One of
 372                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 373                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 374                     to set it
 375     _old_archive_ids: A list of old archive ids needed for backward compatibility
 376     _format_sort_fields: A list of fields to use for sorting formats
 377     __post_extractor: A function to be called just before the metadata is
 378                     written to either disk, logger or console. The function
 379                     must return a dict which will be added to the info_dict.
 380                     This is useful for additional information that is
 381                     time-consuming to extract. Note that the fields thus
 382                     extracted will not be available to output template and
 383                     match_filter. So, only "comments" and "comment_count" are
 384                     currently allowed to be extracted via this method.
 385
 386     The following fields should only be used when the video belongs to some logical
 387     chapter or section:
 388
 389     chapter:        Name or title of the chapter the video belongs to.
 390     chapter_number: Number of the chapter the video belongs to, as an integer.
 391     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 392
 393     The following fields should only be used when the video is an episode of some
 394     series, programme or podcast:
 395
 396     series:         Title of the series or programme the video episode belongs to.
 397     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 398     season:         Title of the season the video episode belongs to.
 399     season_number:  Number of the season the video episode belongs to, as an integer.
 400     season_id:      Id of the season the video episode belongs to, as a unicode string.
 401     episode:        Title of the video episode. Unlike mandatory video title field,
 402                     this field should denote the exact title of the video episode
 403                     without any kind of decoration.
 404     episode_number: Number of the video episode within a season, as an integer.
 405     episode_id:     Id of the video episode, as a unicode string.
 406
 407     The following fields should only be used when the media is a track or a part of
 408     a music album:
 409
 410     track:          Title of the track.
 411     track_number:   Number of the track within an album or a disc, as an integer.
 412     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 413                     as a unicode string.
 414     artist:         Artist(s) of the track.
 415     genre:          Genre(s) of the track.
 416     album:          Title of the album the track belongs to.
 417     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 418     album_artist:   List of all artists that appeared on the album (e.g.
 419                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 420                     and compilations).
 421     disc_number:    Number of the disc or other physical medium the track belongs to,
 422                     as an integer.
 423     release_year:   Year (YYYY) when the album was released.
 424     composer:       Composer of the piece
 425
 426     The following fields should only be set for clips that should be cut from the original video:
 427
 428     section_start:  Start time of the section in seconds
 429     section_end:    End time of the section in seconds
 430
 431     The following fields should only be set for storyboards:
 432     rows:           Number of rows in each storyboard fragment, as an integer
 433     columns:        Number of columns in each storyboard fragment, as an integer
 434
 435     Unless mentioned otherwise, the fields should be Unicode strings.
 436
 437     Unless mentioned otherwise, None is equivalent to absence of information.
 438
 439
 440     _type "playlist" indicates multiple videos.
 441     There must be a key "entries", which is a list, an iterable, or a PagedList
 442     object, each element of which is a valid dictionary by this specification.
 443
 444     Additionally, playlists can have "id", "title", and any other relevant
 445     attributes with the same semantics as videos (see above).
 446
 447     It can also have the following optional fields:
 448
 449     playlist_count: The total number of videos in a playlist. If not given,
 450                     YoutubeDL tries to calculate it from "entries"
 451
 452
 453     _type "multi_video" indicates that there are multiple videos that
 454     form a single show, for example multiple acts of an opera or TV episode.
 455     It must have an entries key like a playlist and contain all the keys
 456     required for a video at the same time.
 457
 458
 459     _type "url" indicates that the video must be extracted from another
 460     location, possibly by a different extractor. Its only required key is:
 461     "url" - the next URL to extract.
 462     The key "ie_key" can be set to the class name (minus the trailing "IE",
 463     e.g. "Youtube") if the extractor class is known in advance.
 464     Additionally, the dictionary may have any properties of the resolved entity
 465     known in advance, for example "title" if the title of the referred video is
 466     known ahead of time.
 467
 468
 469     _type "url_transparent" entities have the same specification as "url", but
 470     indicate that the given additional information is more precise than the one
 471     associated with the resolved URL.
 472     This is useful when a site employs a video service that hosts the video and
 473     its technical metadata, but that video service does not embed a useful
 474     title, description etc.
 475
 476
 477     Subclasses of this should also be added to the list of extractors and
 478     should define a _VALID_URL regexp and re-define the _real_extract() and
 479     (optionally) _real_initialize() methods.
 480
 481     Subclasses may also override suitable() if necessary, but ensure the function
 482     signature is preserved and that this function imports everything it needs
 483     (except other extractors), so that lazy_extractors works correctly.
 484
 485     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 486     the HTML of Generic webpages. It may also override _extract_embed_urls
 487     or _extract_from_webpage as necessary. While these are normally classmethods,
 488     _extract_from_webpage is allowed to be an instance method.
 489
 490     _extract_from_webpage may raise self.StopExtraction() to stop further
 491     processing of the webpage and obtain exclusive rights to it. This is useful
 492     when the extractor cannot reliably be matched using just the URL,
 493     e.g. invidious/peertube instances
 494
 495     Embed-only extractors can be defined by setting _VALID_URL = False.
 496
 497     To support username + password (or netrc) login, the extractor must define a
 498     _NETRC_MACHINE and re-define _perform_login(username, password) and
 499     (optionally) _initialize_pre_login() methods. The _perform_login method will
 500     be called between _initialize_pre_login and _real_initialize if credentials
 501     are passed by the user. In cases where it is necessary to have the login
 502     process as part of the extraction rather than initialization, _perform_login
 503     can be left undefined.
 504
 505     _GEO_BYPASS attribute may be set to False in order to disable
 506     geo restriction bypass mechanisms for a particular extractor.
 507     Though it won't disable explicit geo restriction bypass based on
 508     country code provided with geo_bypass_country.
 509
 510     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 511     countries for this extractor. One of these countries will be used by
 512     geo restriction bypass mechanism right away in order to bypass
 513     geo restriction, of course, if the mechanism is not disabled.
 514
 515     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 516     IP blocks in CIDR notation for this extractor. One of these IP blocks
 517     will be used by geo restriction bypass mechanism similarly
 518     to _GEO_COUNTRIES.
 519
 520     The _ENABLED attribute should be set to False for IEs that
 521     are disabled by default and must be explicitly enabled.
 522
 523     The _WORKING attribute should be set to False for broken IEs
 524     in order to warn the users and skip the tests.
 525     """
 526
    # Class-level defaults shared by all extractors; subclasses override the
    # ones they need.

    # Checked by initialize() ("if not self._ready"); __init__ resets it to
    # False on every new instance.
    _ready = False
    # NOTE(review): presumably populated by set_downloader() with the
    # YoutubeDL instance -- confirm against set_downloader().
    _downloader = None
    # Reset to None in __init__.
    # NOTE(review): presumably the spoofed IP used for geo bypass -- confirm.
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms for an
    # extractor (an explicit geo_bypass_country is still honoured).
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries; passed to
    # _initialize_geo_bypass() by initialize().
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation;
    # passed to _initialize_geo_bypass() by initialize().
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests.
    _WORKING = True
    # Set to False for IEs that are disabled by default and must be explicitly
    # enabled.
    _ENABLED = True
    # Must be defined to support username + password (or netrc) login;
    # supports_login() is simply bool(_NETRC_MACHINE).
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the IE -- confirm.
    IE_DESC = None
    # NOTE(review): presumably the prefix used by search extractors -- confirm.
    SEARCH_KEY = None
    # Regexp compiled and matched by _match_valid_url(); set to False for
    # embed-only extractors.
    _VALID_URL = None
    # Regexes that will be searched for in the HTML of Generic webpages.
    # NOTE: this list object is shared by every class that does not override it.
    _EMBED_REGEX = []
 540
 541     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 542         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 543         return {
 544             None: '',
 545             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 546             'password': f'Use {password_hint}',
 547             'cookies': (
 548                 'Use --cookies-from-browser or --cookies for the authentication. '
 549                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 550         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 551
 552     def __init__(self, downloader=None):
 553         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 554         If a downloader is not passed during initialization,
 555         it must be set using "set_downloader()" before "extract()" is called"""
 556         self._ready = False
 557         self._x_forwarded_for_ip = None
 558         self._printed_messages = set()
 559         self.set_downloader(downloader)
 560
 561     @classmethod
 562     def _match_valid_url(cls, url):
 563         if cls._VALID_URL is False:
 564             return None
 565         # This does not use has/getattr intentionally - we want to know whether
 566         # we have cached the regexp for *this* class, whereas getattr would also
 567         # match the superclass
 568         if '_VALID_URL_RE' not in cls.__dict__:
 569             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 570         return cls._VALID_URL_RE.match(url)
 571
 572     @classmethod
 573     def suitable(cls, url):
 574         """Receives a URL and returns True if suitable for this IE."""
 575         # This function must import everything it needs (except other extractors),
 576         # so that lazy_extractors works correctly
 577         return cls._match_valid_url(url) is not None
 578
 579     @classmethod
 580     def _match_id(cls, url):
 581         return cls._match_valid_url(url).group('id')
 582
 583     @classmethod
 584     def get_temp_id(cls, url):
 585         try:
 586             return cls._match_id(url)
 587         except (IndexError, AttributeError):
 588             return None
 589
 590     @classmethod
 591     def working(cls):
 592         """Getter method for _WORKING."""
 593         return cls._WORKING
 594
 595     @classmethod
 596     def supports_login(cls):
 597         return bool(cls._NETRC_MACHINE)
 598
 599     def initialize(self):
 600         """Initializes an instance (authentication, etc)."""
 601         self._printed_messages = set()
 602         self._initialize_geo_bypass({
 603             'countries': self._GEO_COUNTRIES,
 604             'ip_blocks': self._GEO_IP_BLOCKS,
 605         })
 606         if not self._ready:
 607             self._initialize_pre_login()
 608             if self.supports_login():
 609                 username, password = self._get_login_info()
 610                 if username:
 611                     self._perform_login(username, password)
 612             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 613                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 614             self._real_initialize()
 615             self._ready = True
 616
 617     def _initialize_geo_bypass(self, geo_bypass_context):
 618         """
 619         Initialize geo restriction bypass mechanism.
 620
 621         This method is used to initialize geo bypass mechanism based on faking
 622         X-Forwarded-For HTTP header. A random country from provided country list
 623         is selected and a random IP belonging to this country is generated. This
 624         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 625         HTTP requests.
 626
 627         This method will be used for initial geo bypass mechanism initialization
 628         during the instance initialization with _GEO_COUNTRIES and
 629         _GEO_IP_BLOCKS.
 630
 631         You may also manually call it from extractor's code if geo bypass
 632         information is not available beforehand (e.g. obtained during
 633         extraction) or due to some other reason. In this case you should pass
 634         this information in geo bypass context passed as first argument. It may
 635         contain following fields:
 636
 637         countries:  List of geo unrestricted countries (similar
 638                     to _GEO_COUNTRIES)
 639         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 640                     (similar to _GEO_IP_BLOCKS)
 641
 642         """
 643         if not self._x_forwarded_for_ip:
 644
 645             # Geo bypass mechanism is explicitly disabled by user
 646             if not self.get_param('geo_bypass', True):
 647                 return
 648
 649             if not geo_bypass_context:
 650                 geo_bypass_context = {}
 651
 652             # Backward compatibility: previously _initialize_geo_bypass
 653             # expected a list of countries, some 3rd party code may still use
 654             # it this way
 655             if isinstance(geo_bypass_context, (list, tuple)):
 656                 geo_bypass_context = {
 657                     'countries': geo_bypass_context,
 658                 }
 659
 660             # The whole point of geo bypass mechanism is to fake IP
 661             # as X-Forwarded-For HTTP header based on some IP block or
 662             # country code.
 663
 664             # Path 1: bypassing based on IP block in CIDR notation
 665
 666             # Explicit IP block specified by user, use it right away
 667             # regardless of whether extractor is geo bypassable or not
 668             ip_block = self.get_param('geo_bypass_ip_block', None)
 669
 670             # Otherwise use random IP block from geo bypass context but only
 671             # if extractor is known as geo bypassable
 672             if not ip_block:
 673                 ip_blocks = geo_bypass_context.get('ip_blocks')
 674                 if self._GEO_BYPASS and ip_blocks:
 675                     ip_block = random.choice(ip_blocks)
 676
 677             if ip_block:
 678                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 679                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 680                 return
 681
 682             # Path 2: bypassing based on country code
 683
 684             # Explicit country code specified by user, use it right away
 685             # regardless of whether extractor is geo bypassable or not
 686             country = self.get_param('geo_bypass_country', None)
 687
 688             # Otherwise use random country code from geo bypass context but
 689             # only if extractor is known as geo bypassable
 690             if not country:
 691                 countries = geo_bypass_context.get('countries')
 692                 if self._GEO_BYPASS and countries:
 693                     country = random.choice(countries)
 694
 695             if country:
 696                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 697                 self._downloader.write_debug(
 698                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 699
 700     def extract(self, url):
 701         """Extracts URL information and returns it in list of dicts."""
 702         try:
 703             for _ in range(2):
 704                 try:
 705                     self.initialize()
 706                     self.to_screen('Extracting URL: %s' % (
 707                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 708                     ie_result = self._real_extract(url)
 709                     if ie_result is None:
 710                         return None
 711                     if self._x_forwarded_for_ip:
 712                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 713                     subtitles = ie_result.get('subtitles') or {}
 714                     if 'no-live-chat' in self.get_param('compat_opts'):
 715                         for lang in ('live_chat', 'comments', 'danmaku'):
 716                             subtitles.pop(lang, None)
 717                     return ie_result
 718                 except GeoRestrictedError as e:
 719                     if self.__maybe_fake_ip_and_retry(e.countries):
 720                         continue
 721                     raise
 722         except UnsupportedError:
 723             raise
 724         except ExtractorError as e:
 725             e.video_id = e.video_id or self.get_temp_id(url),
 726             e.ie = e.ie or self.IE_NAME,
 727             e.traceback = e.traceback or sys.exc_info()[2]
 728             raise
 729         except http.client.IncompleteRead as e:
 730             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 731         except (KeyError, StopIteration) as e:
 732             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 733
 734     def __maybe_fake_ip_and_retry(self, countries):
 735         if (not self.get_param('geo_bypass_country', None)
 736                 and self._GEO_BYPASS
 737                 and self.get_param('geo_bypass', True)
 738                 and not self._x_forwarded_for_ip
 739                 and countries):
 740             country_code = random.choice(countries)
 741             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 742             if self._x_forwarded_for_ip:
 743                 self.report_warning(
 744                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 745                     % (self._x_forwarded_for_ip, country_code.upper()))
 746                 return True
 747         return False
 748
 749     def set_downloader(self, downloader):
 750         """Sets a YoutubeDL instance as the downloader for this IE."""
 751         self._downloader = downloader
 752
 753     @property
 754     def cache(self):
 755         return self._downloader.cache
 756
 757     @property
 758     def cookiejar(self):
 759         return self._downloader.cookiejar
 760
 761     def _initialize_pre_login(self):
 762         """ Initialization before login. Redefine in subclasses."""
 763         pass
 764
 765     def _perform_login(self, username, password):
 766         """ Login with username and password. Redefine in subclasses."""
 767         pass
 768
 769     def _real_initialize(self):
 770         """Real initialization process. Redefine in subclasses."""
 771         pass
 772
 773     def _real_extract(self, url):
 774         """Real extraction process. Redefine in subclasses."""
 775         raise NotImplementedError('This method must be implemented by subclasses')
 776
 777     @classmethod
 778     def ie_key(cls):
 779         """A string for getting the InfoExtractor with get_info_extractor"""
 780         return cls.__name__[:-2]
 781
 782     @classproperty
 783     def IE_NAME(cls):
 784         return cls.__name__[:-2]
 785
 786     @staticmethod
 787     def __can_accept_status_code(err, expected_status):
 788         assert isinstance(err, urllib.error.HTTPError)
 789         if expected_status is None:
 790             return False
 791         elif callable(expected_status):
 792             return expected_status(err.code) is True
 793         else:
 794             return err.code in variadic(expected_status)
 795
 796     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 797         if isinstance(url_or_request, urllib.request.Request):
 798             return update_Request(url_or_request, data=data, headers=headers, query=query)
 799         if query:
 800             url_or_request = update_url_query(url_or_request, query)
 801         return sanitized_Request(url_or_request, data, headers or {})
 802
 803     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 804         """
 805         Return the response handle.
 806
 807         See _download_webpage docstring for arguments specification.
 808         """
 809         if not self._downloader._first_webpage_request:
 810             sleep_interval = self.get_param('sleep_interval_requests') or 0
 811             if sleep_interval > 0:
 812                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 813                 time.sleep(sleep_interval)
 814         else:
 815             self._downloader._first_webpage_request = False
 816
 817         if note is None:
 818             self.report_download_webpage(video_id)
 819         elif note is not False:
 820             if video_id is None:
 821                 self.to_screen(str(note))
 822             else:
 823                 self.to_screen(f'{video_id}: {note}')
 824
 825         # Some sites check X-Forwarded-For HTTP header in order to figure out
 826         # the origin of the client behind proxy. This allows bypassing geo
 827         # restriction by faking this header's value to IP that belongs to some
 828         # geo unrestricted country. We will do so once we encounter any
 829         # geo restriction error.
 830         if self._x_forwarded_for_ip:
 831             headers = (headers or {}).copy()
 832             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 833
 834         try:
 835             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 836         except network_exceptions as err:
 837             if isinstance(err, urllib.error.HTTPError):
 838                 if self.__can_accept_status_code(err, expected_status):
 839                     # Retain reference to error to prevent file object from
 840                     # being closed before it can be read. Works around the
 841                     # effects of <https://bugs.python.org/issue15002>
 842                     # introduced in Python 3.4.1.
 843                     err.fp._error = err
 844                     return err.fp
 845
 846             if errnote is False:
 847                 return False
 848             if errnote is None:
 849                 errnote = 'Unable to download webpage'
 850
 851             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 852             if fatal:
 853                 raise ExtractorError(errmsg, cause=err)
 854             else:
 855                 self.report_warning(errmsg)
 856                 return False
 857
 858     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 859                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 860         """
 861         Return a tuple (page content as string, URL handle).
 862
 863         Arguments:
 864         url_or_request -- plain text URL as a string or
 865             a urllib.request.Request object
 866         video_id -- Video/playlist/item identifier (string)
 867
 868         Keyword arguments:
 869         note -- note printed before downloading (string)
 870         errnote -- note printed in case of an error (string)
 871         fatal -- flag denoting whether error should be considered fatal,
 872             i.e. whether it should cause ExtractionError to be raised,
 873             otherwise a warning will be reported and extraction continued
 874         encoding -- encoding for a page content decoding, guessed automatically
 875             when not explicitly specified
 876         data -- POST data (bytes)
 877         headers -- HTTP headers (dict)
 878         query -- URL query (dict)
 879         expected_status -- allows to accept failed HTTP requests (non 2xx
 880             status code) by explicitly specifying a set of accepted status
 881             codes. Can be any of the following entities:
 882                 - an integer type specifying an exact failed status code to
 883                   accept
 884                 - a list or a tuple of integer types specifying a list of
 885                   failed status codes to accept
 886                 - a callable accepting an actual failed status code and
 887                   returning True if it should be accepted
 888             Note that this argument does not affect success status codes (2xx)
 889             which are always accepted.
 890         """
 891
 892         # Strip hashes from the URL (#1038)
 893         if isinstance(url_or_request, str):
 894             url_or_request = url_or_request.partition('#')[0]
 895
 896         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 897         if urlh is False:
 898             assert not fatal
 899             return False
 900         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 901         return (content, urlh)
 902
 903     @staticmethod
 904     def _guess_encoding_from_content(content_type, webpage_bytes):
 905         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 906         if m:
 907             encoding = m.group(1)
 908         else:
 909             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 910                           webpage_bytes[:1024])
 911             if m:
 912                 encoding = m.group(1).decode('ascii')
 913             elif webpage_bytes.startswith(b'\xff\xfe'):
 914                 encoding = 'utf-16'
 915             else:
 916                 encoding = 'utf-8'
 917
 918         return encoding
 919
 920     def __check_blocked(self, content):
 921         first_block = content[:512]
 922         if ('<title>Access to this site is blocked</title>' in content
 923                 and 'Websense' in first_block):
 924             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 925             blocked_iframe = self._html_search_regex(
 926                 r'<iframe src="([^"]+)"', content,
 927                 'Websense information URL', default=None)
 928             if blocked_iframe:
 929                 msg += ' Visit %s for more details' % blocked_iframe
 930             raise ExtractorError(msg, expected=True)
 931         if '<title>The URL you requested has been blocked</title>' in first_block:
 932             msg = (
 933                 'Access to this webpage has been blocked by Indian censorship. '
 934                 'Use a VPN or proxy server (with --proxy) to route around it.')
 935             block_msg = self._html_search_regex(
 936                 r'</h1><p>(.*?)</p>',
 937                 content, 'block message', default=None)
 938             if block_msg:
 939                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 940             raise ExtractorError(msg, expected=True)
 941         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 942                 and 'blocklist.rkn.gov.ru' in content):
 943             raise ExtractorError(
 944                 'Access to this webpage has been blocked by decision of the Russian government. '
 945                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 946                 expected=True)
 947
 948     def _request_dump_filename(self, url, video_id):
 949         basen = f'{video_id}_{url}'
 950         trim_length = self.get_param('trim_file_name') or 240
 951         if len(basen) > trim_length:
 952             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 953             basen = basen[:trim_length - len(h)] + h
 954         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 955         # Working around MAX_PATH limitation on Windows (see
 956         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 957         if compat_os_name == 'nt':
 958             absfilepath = os.path.abspath(filename)
 959             if len(absfilepath) > 259:
 960                 filename = fR'\\?\{absfilepath}'
 961         return filename
 962
 963     def __decode_webpage(self, webpage_bytes, encoding, headers):
 964         if not encoding:
 965             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 966         try:
 967             return webpage_bytes.decode(encoding, 'replace')
 968         except LookupError:
 969             return webpage_bytes.decode('utf-8', 'replace')
 970
 971     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 972         webpage_bytes = urlh.read()
 973         if prefix is not None:
 974             webpage_bytes = prefix + webpage_bytes
 975         if self.get_param('dump_intermediate_pages', False):
 976             self.to_screen('Dumping request to ' + urlh.geturl())
 977             dump = base64.b64encode(webpage_bytes).decode('ascii')
 978             self._downloader.to_screen(dump)
 979         if self.get_param('write_pages'):
 980             filename = self._request_dump_filename(urlh.geturl(), video_id)
 981             self.to_screen(f'Saving request to {filename}')
 982             with open(filename, 'wb') as outf:
 983                 outf.write(webpage_bytes)
 984
 985         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 986         self.__check_blocked(content)
 987
 988         return content
 989
 990     def __print_error(self, errnote, fatal, video_id, err):
 991         if fatal:
 992             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 993         elif errnote:
 994             self.report_warning(f'{video_id}: {errnote}: {err}')
 995
 996     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 997         if transform_source:
 998             xml_string = transform_source(xml_string)
 999         try:
1000             return compat_etree_fromstring(xml_string.encode('utf-8'))
1001         except xml.etree.ElementTree.ParseError as ve:
1002             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1003
1004     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1005         try:
1006             return json.loads(
1007                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1008         except ValueError as ve:
1009             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1010
1011     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1012         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1013
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build the pair of methods (_download_<name>_handle, _download_<name>).

        This is a plain function called while the class body is executed,
        it is not a method (there is no self/cls parameter).

        @param name             Suffix used for the generated method names
        @param parser           Name of the method used for parsing the
                                downloaded content, looked up on the instance
                                at call time. None returns the content as is
        @param note             Default note of the generated methods
        @param errnote          Default errnote of the generated methods
        @param return_value     Description of the parsed value, only used
                                in the generated docstrings
        @returns                (download_handle, download_content)
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is keyword-only here and only handed to the parser when
            # it is False; otherwise the parser's own default errnote applies
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Downloads the page and returns (parsed content, URL handle),
        # or False if the download failed non-fatally
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Same as download_handle, but returns only the parsed content
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param the content is first looked for in
            # the dump file of this request
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # No usable dump; fall through to the regular download below
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser transform_source is not forwarded
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Renames func and gives it a docstring. Note that download_content
        # reads download_handle.__name__ at call time, i.e. the new name
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1077
    # Generated download methods. Each call yields a (handle variant,
    # content-only variant) pair for the parser method given by name
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # Parser-less variant: only the content-only method ([1]) is kept, under a
    # private (name-mangled) name; it is wrapped by _download_webpage
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1085
1086     def _download_webpage(
1087             self, url_or_request, video_id, note=None, errnote=None,
1088             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1089         """
1090         Return the data of the page as a string.
1091
1092         Keyword arguments:
1093         tries -- number of tries
1094         timeout -- sleep interval between tries
1095
1096         See _download_webpage_handle docstring for other arguments specification.
1097         """
1098
1099         R''' # NB: These are unused; should they be deprecated?
1100         if tries != 1:
1101             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1102         if timeout is NO_DEFAULT:
1103             timeout = 5
1104         else:
1105             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1106         '''
1107
1108         try_count = 0
1109         while True:
1110             try:
1111                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1112             except http.client.IncompleteRead as e:
1113                 try_count += 1
1114                 if try_count >= tries:
1115                     raise e
1116                 self._sleep(timeout, video_id)
1117
1118     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1119         idstr = format_field(video_id, None, '%s: ')
1120         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1121         if only_once:
1122             if f'WARNING: {msg}' in self._printed_messages:
1123                 return
1124             self._printed_messages.add(f'WARNING: {msg}')
1125         self._downloader.report_warning(msg, *args, **kwargs)
1126
1127     def to_screen(self, msg, *args, **kwargs):
1128         """Print msg to screen, prefixing it with '[ie_name]'"""
1129         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1130
1131     def write_debug(self, msg, *args, **kwargs):
1132         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1133
1134     def get_param(self, name, default=None, *args, **kwargs):
1135         if self._downloader:
1136             return self._downloader.params.get(name, default, *args, **kwargs)
1137         return default
1138
1139     def report_drm(self, video_id, partial=NO_DEFAULT):
1140         if partial is not NO_DEFAULT:
1141             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1142         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1143
1144     def report_extraction(self, id_or_name):
1145         """Report information extraction."""
1146         self.to_screen('%s: Extracting information' % id_or_name)
1147
1148     def report_download_webpage(self, video_id):
1149         """Report webpage download."""
1150         self.to_screen('%s: Downloading webpage' % video_id)
1151
1152     def report_age_confirmation(self):
1153         """Report attempt to confirm age."""
1154         self.to_screen('Confirming age')
1155
1156     def report_login(self):
1157         """Report attempt to log in."""
1158         self.to_screen('Logging in')
1159
1160     def raise_login_required(
1161             self, msg='This video is only available for registered users',
1162             metadata_available=False, method=NO_DEFAULT):
1163         if metadata_available and (
1164                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1165             self.report_warning(msg)
1166             return
1167         msg += format_field(self._login_hint(method), None, '. %s')
1168         raise ExtractorError(msg, expected=True)
1169
1170     def raise_geo_restricted(
1171             self, msg='This video is not available from your location due to geo restriction',
1172             countries=None, metadata_available=False):
1173         if metadata_available and (
1174                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1175             self.report_warning(msg)
1176         else:
1177             raise GeoRestrictedError(msg, countries=countries)
1178
1179     def raise_no_formats(self, msg, expected=False, video_id=None):
1180         if expected and (
1181                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1182             self.report_warning(msg, video_id)
1183         elif isinstance(msg, ExtractorError):
1184             raise msg
1185         else:
1186             raise ExtractorError(msg, expected=expected, video_id=video_id)
1187
1188     # Methods for following #608
1189     @staticmethod
1190     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1191         """Returns a URL that points to a page that should be processed"""
1192         if ie is not None:
1193             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1194         if video_id is not None:
1195             kwargs['id'] = video_id
1196         if video_title is not None:
1197             kwargs['title'] = video_title
1198         return {
1199             **kwargs,
1200             '_type': 'url_transparent' if url_transparent else 'url',
1201             'url': url,
1202         }
1203
1204     @classmethod
1205     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1206                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1207         return cls.playlist_result(
1208             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1209             playlist_id, playlist_title, **kwargs)
1210
1211     @staticmethod
1212     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1213         """Returns a playlist"""
1214         if playlist_id:
1215             kwargs['id'] = playlist_id
1216         if playlist_title:
1217             kwargs['title'] = playlist_title
1218         if playlist_description is not None:
1219             kwargs['description'] = playlist_description
1220         return {
1221             **kwargs,
1222             '_type': 'multi_video' if multi_video else 'playlist',
1223             'entries': entries,
1224         }
1225
1226     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1227         """
1228         Perform a regex search on the given string, using a single or a list of
1229         patterns returning the first matching group.
1230         In case of failure return a default value or raise a WARNING or a
1231         RegexNotFoundError, depending on fatal, specifying the field name.
1232         """
1233         if string is None:
1234             mobj = None
1235         elif isinstance(pattern, (str, re.Pattern)):
1236             mobj = re.search(pattern, string, flags)
1237         else:
1238             for p in pattern:
1239                 mobj = re.search(p, string, flags)
1240                 if mobj:
1241                     break
1242
1243         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1244
1245         if mobj:
1246             if group is None:
1247                 # return the first matching group
1248                 return next(g for g in mobj.groups() if g is not None)
1249             elif isinstance(group, (list, tuple)):
1250                 return tuple(mobj.group(g) for g in group)
1251             else:
1252                 return mobj.group(group)
1253         elif default is not NO_DEFAULT:
1254             return default
1255         elif fatal:
1256             raise RegexNotFoundError('Unable to extract %s' % _name)
1257         else:
1258             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1259             return None
1260
1261     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1262                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1263         """Searches string for the JSON object specified by start_pattern"""
1264         # NB: end_pattern is only used to reduce the size of the initial match
1265         if default is NO_DEFAULT:
1266             default, has_default = {}, False
1267         else:
1268             fatal, has_default = False, True
1269
1270         json_string = self._search_regex(
1271             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1272             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1273         if not json_string:
1274             return default
1275
1276         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1277         try:
1278             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1279         except ExtractorError as e:
1280             if fatal:
1281                 raise ExtractorError(
1282                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1283             elif not has_default:
1284                 self.report_warning(
1285                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1286         return default
1287
1288     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1289         """
1290         Like _search_regex, but strips HTML tags and unescapes entities.
1291         """
1292         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1293         if isinstance(res, tuple):
1294             return tuple(map(clean_html, res))
1295         return clean_html(res)
1296
1297     def _get_netrc_login_info(self, netrc_machine=None):
1298         netrc_machine = netrc_machine or self._NETRC_MACHINE
1299
1300         cmd = self.get_param('netrc_cmd')
1301         if cmd:
1302             cmd = cmd.replace('{}', netrc_machine)
1303             self.to_screen(f'Executing command: {cmd}')
1304             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1305             if ret != 0:
1306                 raise OSError(f'Command returned error code {ret}')
1307             info = netrc_from_content(stdout).authenticators(netrc_machine)
1308
1309         elif self.get_param('usenetrc', False):
1310             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1311             if os.path.isdir(netrc_file):
1312                 netrc_file = os.path.join(netrc_file, '.netrc')
1313             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1314
1315         else:
1316             return None, None
1317         if not info:
1318             raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
1319         return info[0], info[2]
1320
1321     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1322         """
1323         Get the login info as (username, password)
1324         First look for the manually specified credentials using username_option
1325         and password_option as keys in params dictionary. If no such credentials
1326         are available try the netrc_cmd if it is defined or look in the
1327         netrc file using the netrc_machine or _NETRC_MACHINE value.
1328         If there's no info available, return (None, None)
1329         """
1330
1331         username = self.get_param(username_option)
1332         if username is not None:
1333             password = self.get_param(password_option)
1334         else:
1335             try:
1336                 username, password = self._get_netrc_login_info(netrc_machine)
1337             except (OSError, netrc.NetrcParseError) as err:
1338                 self.report_warning(f'Failed to parse .netrc: {err}')
1339                 return None, None
1340         return username, password
1341
1342     def _get_tfa_info(self, note='two-factor verification code'):
1343         """
1344         Get the two-factor authentication info
1345         TODO - asking the user will be required for sms/phone verify
1346         currently just uses the command line option
1347         If there's no info available, return None
1348         """
1349
1350         tfa = self.get_param('twofactor')
1351         if tfa is not None:
1352             return tfa
1353
1354         return getpass.getpass('Type %s and press [Return]: ' % note)
1355
1356     # Helper functions for extracting OpenGraph info
1357     @staticmethod
1358     def _og_regexes(prop):
1359         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1360         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1361                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1362         template = r'<meta[^>]+?%s[^>]+?%s'
1363         return [
1364             template % (property_re, content_re),
1365             template % (content_re, property_re),
1366         ]
1367
1368     @staticmethod
1369     def _meta_regex(prop):
1370         return r'''(?isx)<meta
1371                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1372                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1373
1374     def _og_search_property(self, prop, html, name=None, **kargs):
1375         prop = variadic(prop)
1376         if name is None:
1377             name = 'OpenGraph %s' % prop[0]
1378         og_regexes = []
1379         for p in prop:
1380             og_regexes.extend(self._og_regexes(p))
1381         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1382         if escaped is None:
1383             return None
1384         return unescapeHTML(escaped)
1385
1386     def _og_search_thumbnail(self, html, **kargs):
1387         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1388
1389     def _og_search_description(self, html, **kargs):
1390         return self._og_search_property('description', html, fatal=False, **kargs)
1391
1392     def _og_search_title(self, html, *, fatal=False, **kargs):
1393         return self._og_search_property('title', html, fatal=fatal, **kargs)
1394
1395     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1396         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1397         if secure:
1398             regexes = self._og_regexes('video:secure_url') + regexes
1399         return self._html_search_regex(regexes, html, name, **kargs)
1400
1401     def _og_search_url(self, html, **kargs):
1402         return self._og_search_property('url', html, **kargs)
1403
1404     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1405         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1406
1407     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1408         name = variadic(name)
1409         if display_name is None:
1410             display_name = name[0]
1411         return self._html_search_regex(
1412             [self._meta_regex(n) for n in name],
1413             html, display_name, fatal=fatal, group='content', **kwargs)
1414
1415     def _dc_search_uploader(self, html):
1416         return self._html_search_meta('dc.creator', html, 'uploader')
1417
1418     @staticmethod
1419     def _rta_search(html):
1420         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1421         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1422                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1423                      html):
1424             return 18
1425
1426         # And then there are the jokers who advertise that they use RTA, but actually don't.
1427         AGE_LIMIT_MARKERS = [
1428             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1429             r'>[^<]*you acknowledge you are at least (\d+) years old',
1430             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1431         ]
1432
1433         age_limit = 0
1434         for marker in AGE_LIMIT_MARKERS:
1435             mobj = re.search(marker, html)
1436             if mobj:
1437                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1438         return age_limit
1439
1440     def _media_rating_search(self, html):
1441         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1442         rating = self._html_search_meta('rating', html)
1443
1444         if not rating:
1445             return None
1446
1447         RATING_TABLE = {
1448             'safe for kids': 0,
1449             'general': 8,
1450             '14 years': 14,
1451             'mature': 17,
1452             'restricted': 19,
1453         }
1454         return RATING_TABLE.get(rating.lower())
1455
1456     def _family_friendly_search(self, html):
1457         # See http://schema.org/VideoObject
1458         family_friendly = self._html_search_meta(
1459             'isFamilyFriendly', html, default=None)
1460
1461         if not family_friendly:
1462             return None
1463
1464         RATING_TABLE = {
1465             '1': 0,
1466             'true': 0,
1467             '0': 18,
1468             'false': 18,
1469         }
1470         return RATING_TABLE.get(family_friendly.lower())
1471
1472     def _twitter_search_player(self, html):
1473         return self._html_search_meta('twitter:player', html,
1474                                       'twitter card player')
1475
1476     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1477         """Yield all json ld objects in the html"""
1478         if default is not NO_DEFAULT:
1479             fatal = False
1480         for mobj in re.finditer(JSON_LD_RE, html):
1481             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1482             for json_ld in variadic(json_ld_item):
1483                 if isinstance(json_ld, dict):
1484                     yield json_ld
1485
1486     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1487         """Search for a video in any json ld in the html"""
1488         if default is not NO_DEFAULT:
1489             fatal = False
1490         info = self._json_ld(
1491             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1492             video_id, fatal=fatal, expected_type=expected_type)
1493         if info:
1494             return info
1495         if default is not NO_DEFAULT:
1496             return default
1497         elif fatal:
1498             raise RegexNotFoundError('Unable to extract JSON-LD')
1499         else:
1500             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1501             return {}
1502
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Collect video metadata from JSON-LD data into an info dict.

        json_ld may be a JSON string (parsed with self._parse_json) or already
        decoded data: a single object or a collection of objects.
        When expected_type is given, only entries whose '@type' contains it are
        processed and the traversal stops after the first such entry.
        Returns the collected fields passed through filter_dict, or an empty
        dict when json_ld is empty.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator: the nested helpers below all write into this dict
        info = {}

        # Maps the last path component of an interactionType to the prefix of
        # the corresponding '<prefix>_count' info field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # '@type' may hold a single value or a collection of values
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # interactionType is either given directly or as an object with '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fills info['<kind>_count'] from InteractionCounter entries;
            # a count that is already set is never overwritten
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Builds info['chapters'] from the 'Clip' entries of hasPart
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # zip() stops at the shortest input (chapters[1:]), so this loop covers
            # every chapter except the last one; missing times are borrowed from
            # the neighbouring chapters
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # Relies on info['duration'] having been set by extract_video_object,
                # which calls this helper after updating info
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copies the fields of a VideoObject/AudioObject entry into info
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level entries must carry an '@context'
                if at_top_level and '@context' not in e:
                    continue
                # A pure '@graph' container: descend into its members, which are
                # no longer required to have their own '@context'
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first element of 'video' / 'subjectOf' is considered here
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type keep merging further entries;
                    # with one, the first matching entry ends the traversal
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any other processed entry may still embed a single VideoObject
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        # NOTE(review): filter_dict presumably drops the None-valued fields - confirm in utils
        return filter_dict(info)
1672
1673     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1674         return self._parse_json(
1675             self._search_regex(
1676                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1677                 webpage, 'next.js data', fatal=fatal, **kw),
1678             video_id, transform_source=transform_source, fatal=fatal)
1679
1680     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1681         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1682         rectx = re.escape(context_name)
1683         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1684         js, arg_keys, arg_vals = self._search_regex(
1685             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1686             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1687             default=NO_DEFAULT if fatal else (None, None, None))
1688         if js is None:
1689             return {}
1690
1691         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1692             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1693
1694         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1695         return traverse_obj(ret, traverse) or {}
1696
1697     @staticmethod
1698     def _hidden_inputs(html):
1699         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1700         hidden_inputs = {}
1701         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1702             attrs = extract_attributes(input)
1703             if not input:
1704                 continue
1705             if attrs.get('type') not in ('hidden', 'submit'):
1706                 continue
1707             name = attrs.get('name') or attrs.get('id')
1708             value = attrs.get('value')
1709             if name and value is not None:
1710                 hidden_inputs[name] = value
1711         return hidden_inputs
1712
1713     def _form_hidden_inputs(self, form_id, html):
1714         form = self._search_regex(
1715             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1716             html, '%s form' % form_id, group='form')
1717         return self._hidden_inputs(form)
1718
    @classproperty(cache=True)
    def FormatSort(cls):
        """
        Deprecated accessor returning a FormatSorter subclass.

        A deprecation warning pointing to yt_dlp.utils.FormatSorter is emitted
        when this body runs.
        NOTE(review): cache=True presumably means the body (and so the warning)
        runs only once - confirm against classproperty.
        """
        class FormatSort(FormatSorter):
            # NOTE(review): `ie` is the first parameter of __init__, i.e. the new
            # FormatSort instance itself; `ie._downloader` must therefore be
            # resolvable on that instance - confirm how callers instantiate this
            def __init__(ie, *args, **kwargs):
                super().__init__(ie._downloader, *args, **kwargs)

        deprecation_warning(
            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
            'Use yt_dlp.utils.FormatSorter instead')
        return FormatSort
1729
1730     def _sort_formats(self, formats, field_preference=[]):
1731         if not field_preference:
1732             self._downloader.deprecation_warning(
1733                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1734             return
1735         self._downloader.deprecation_warning(
1736             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1737             'Return _format_sort_fields in the info_dict instead')
1738         if formats:
1739             formats[0]['__sort_fields'] = field_preference
1740
1741     def _check_formats(self, formats, video_id):
1742         if formats:
1743             formats[:] = filter(
1744                 lambda f: self._is_valid_url(
1745                     f['url'], video_id,
1746                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1747                 formats)
1748
1749     @staticmethod
1750     def _remove_duplicate_formats(formats):
1751         format_urls = set()
1752         unique_formats = []
1753         for f in formats:
1754             if f['url'] not in format_urls:
1755                 format_urls.add(f['url'])
1756                 unique_formats.append(f)
1757         formats[:] = unique_formats
1758
1759     def _is_valid_url(self, url, video_id, item='video', headers={}):
1760         url = self._proto_relative_url(url, scheme='http:')
1761         # For now assume non HTTP(S) URLs always valid
1762         if not (url.startswith('http://') or url.startswith('https://')):
1763             return True
1764         try:
1765             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1766             return True
1767         except ExtractorError as e:
1768             self.to_screen(
1769                 '%s: %s URL is invalid, skipping: %s'
1770                 % (video_id, item, error_to_compat_str(e.cause)))
1771             return False
1772
1773     def http_scheme(self):
1774         """ Either "http:" or "https:", depending on the user's preferences """
1775         return (
1776             'http:'
1777             if self.get_param('prefer_insecure', False)
1778             else 'https:')
1779
1780     def _proto_relative_url(self, url, scheme=None):
1781         scheme = scheme or self.http_scheme()
1782         assert scheme.endswith(':')
1783         return sanitize_url(url, scheme=scheme[:-1])
1784
1785     def _sleep(self, timeout, video_id, msg_template=None):
1786         if msg_template is None:
1787             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1788         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1789         self.to_screen(msg)
1790         time.sleep(timeout)
1791
1792     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1793                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1794                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1795         if self.get_param('ignore_no_formats_error'):
1796             fatal = False
1797
1798         res = self._download_xml_handle(
1799             manifest_url, video_id, 'Downloading f4m manifest',
1800             'Unable to download f4m manifest',
1801             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1802             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1803             transform_source=transform_source,
1804             fatal=fatal, data=data, headers=headers, query=query)
1805         if res is False:
1806             return []
1807
1808         manifest, urlh = res
1809         manifest_url = urlh.geturl()
1810
1811         return self._parse_f4m_formats(
1812             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1813             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1814
1815     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1816                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1817                            fatal=True, m3u8_id=None):
1818         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1819             return []
1820
1821         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1822         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1823         if akamai_pv is not None and ';' in akamai_pv.text:
1824             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1825             if playerVerificationChallenge.strip() != '':
1826                 return []
1827
1828         formats = []
1829         manifest_version = '1.0'
1830         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1831         if not media_nodes:
1832             manifest_version = '2.0'
1833             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1834         # Remove unsupported DRM protected media from final formats
1835         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1836         media_nodes = remove_encrypted_media(media_nodes)
1837         if not media_nodes:
1838             return formats
1839
1840         manifest_base_url = get_base_url(manifest)
1841
1842         bootstrap_info = xpath_element(
1843             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1844             'bootstrap info', default=None)
1845
1846         vcodec = None
1847         mime_type = xpath_text(
1848             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1849             'base URL', default=None)
1850         if mime_type and mime_type.startswith('audio/'):
1851             vcodec = 'none'
1852
1853         for i, media_el in enumerate(media_nodes):
1854             tbr = int_or_none(media_el.attrib.get('bitrate'))
1855             width = int_or_none(media_el.attrib.get('width'))
1856             height = int_or_none(media_el.attrib.get('height'))
1857             format_id = join_nonempty(f4m_id, tbr or i)
1858             # If <bootstrapInfo> is present, the specified f4m is a
1859             # stream-level manifest, and only set-level manifests may refer to
1860             # external resources.  See section 11.4 and section 4 of F4M spec
1861             if bootstrap_info is None:
1862                 media_url = None
1863                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1864                 if manifest_version == '2.0':
1865                     media_url = media_el.attrib.get('href')
1866                 if media_url is None:
1867                     media_url = media_el.attrib.get('url')
1868                 if not media_url:
1869                     continue
1870                 manifest_url = (
1871                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1872                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1873                 # If media_url is itself a f4m manifest do the recursive extraction
1874                 # since bitrates in parent manifest (this one) and media_url manifest
1875                 # may differ leading to inability to resolve the format by requested
1876                 # bitrate in f4m downloader
1877                 ext = determine_ext(manifest_url)
1878                 if ext == 'f4m':
1879                     f4m_formats = self._extract_f4m_formats(
1880                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1881                         transform_source=transform_source, fatal=fatal)
1882                     # Sometimes stream-level manifest contains single media entry that
1883                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1884                     # At the same time parent's media entry in set-level manifest may
1885                     # contain it. We will copy it from parent in such cases.
1886                     if len(f4m_formats) == 1:
1887                         f = f4m_formats[0]
1888                         f.update({
1889                             'tbr': f.get('tbr') or tbr,
1890                             'width': f.get('width') or width,
1891                             'height': f.get('height') or height,
1892                             'format_id': f.get('format_id') if not tbr else format_id,
1893                             'vcodec': vcodec,
1894                         })
1895                     formats.extend(f4m_formats)
1896                     continue
1897                 elif ext == 'm3u8':
1898                     formats.extend(self._extract_m3u8_formats(
1899                         manifest_url, video_id, 'mp4', preference=preference,
1900                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1901                     continue
1902             formats.append({
1903                 'format_id': format_id,
1904                 'url': manifest_url,
1905                 'manifest_url': manifest_url,
1906                 'ext': 'flv' if bootstrap_info is not None else None,
1907                 'protocol': 'f4m',
1908                 'tbr': tbr,
1909                 'width': width,
1910                 'height': height,
1911                 'vcodec': vcodec,
1912                 'preference': preference,
1913                 'quality': quality,
1914             })
1915         return formats
1916
1917     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1918         return {
1919             'format_id': join_nonempty(m3u8_id, 'meta'),
1920             'url': m3u8_url,
1921             'ext': ext,
1922             'protocol': 'm3u8',
1923             'preference': preference - 100 if preference else -100,
1924             'quality': quality,
1925             'resolution': 'multiple',
1926             'format_note': 'Quality selection URL',
1927         }
1928
1929     def _report_ignoring_subs(self, name):
1930         self.report_warning(bug_reports_message(
1931             f'Ignoring subtitle tracks found in the {name} manifest; '
1932             'if any subtitle tracks are missing,'
1933         ), only_once=True)
1934
1935     def _extract_m3u8_formats(self, *args, **kwargs):
1936         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1937         if subs:
1938             self._report_ignoring_subs('HLS')
1939         return fmts
1940
1941     def _extract_m3u8_formats_and_subtitles(
1942             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1943             preference=None, quality=None, m3u8_id=None, note=None,
1944             errnote=None, fatal=True, live=False, data=None, headers={},
1945             query={}):
1946
1947         if self.get_param('ignore_no_formats_error'):
1948             fatal = False
1949
1950         if not m3u8_url:
1951             if errnote is not False:
1952                 errnote = errnote or 'Failed to obtain m3u8 URL'
1953                 if fatal:
1954                     raise ExtractorError(errnote, video_id=video_id)
1955                 self.report_warning(f'{errnote}{bug_reports_message()}')
1956             return [], {}
1957
1958         res = self._download_webpage_handle(
1959             m3u8_url, video_id,
1960             note='Downloading m3u8 information' if note is None else note,
1961             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1962             fatal=fatal, data=data, headers=headers, query=query)
1963
1964         if res is False:
1965             return [], {}
1966
1967         m3u8_doc, urlh = res
1968         m3u8_url = urlh.geturl()
1969
1970         return self._parse_m3u8_formats_and_subtitles(
1971             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1972             preference=preference, quality=quality, m3u8_id=m3u8_id,
1973             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1974             headers=headers, query=query, video_id=video_id)
1975
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """
        Parse the text of an HLS playlist into formats and subtitles

        A document containing #EXT-X-TARGETDURATION is treated as a media
        playlist and returned as is, as one format per playlist index.
        Anything else is treated as a master playlist: a format is built for
        every EXT-X-MEDIA tag of TYPE VIDEO or AUDIO that has a URI, and for
        every URI line (a non-blank line not starting with "#") using the
        attributes of the EXT-X-STREAM-INF tag preceding it, if any.
        EXT-X-MEDIA tags of TYPE SUBTITLES that have a URI become subtitles.

        @param m3u8_doc         Text of the playlist
        @param m3u8_url         URL of the playlist. Relative URIs are joined
                                with it and it is stored as 'manifest_url'.
                                When empty, a media playlist is embedded in
                                the format as a data URI
        @param ext              'ext' of the formats. Formats built from URI
                                lines fall back to 'm4a' (when 'vcodec' is
                                'none') or 'mp4' if this is empty
        @param entry_protocol   'protocol' of the formats
        @param preference       'preference' of the formats
        @param quality          'quality' of the formats
        @param m3u8_id          Prefix of the generated format IDs
        @param live             When true, neither the stream name nor the
                                bitrate is made part of the format ID
        @param fatal, data, headers, video_id
                                Only used for downloading the individual
                                playlists, which is done when the
                                "hls_split_discontinuity" param is set
        @param note, errnote, query
                                Accepted, but not referenced in this method
        @returns                (formats, subtitles) tuple; subtitles maps
                                language to a list of subtitle dicts
        """
        formats, subtitles = [], {}

        # NB: this is the match object (or None), and is stored as such in 'has_drm'
        has_drm = re.search('|'.join([
            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
        ]), m3u8_doc)

        def format_url(url):
            # http(s) URLs are used as is; everything else is joined with the playlist URL
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # One index for the start of the playlist plus one per line
                # starting with #EXT-X-DISCONTINUITY. The playlist is
                # downloaded from manifest_url if its text is not given
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # No splitting: a single format without an index
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # GROUP-ID -> list of the parsed EXT-X-MEDIA attributes of that group
        groups = {}
        # Attributes of the latest EXT-X-STREAM-INF tag; reset after each URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Registers the rendition in its group and adds the subtitle
            # or audio/video format it describes, if any
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # URI line: build the format(s) of this variant
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Additional http format: a copy of the HLS format
                        # pointing to the progressive URI
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2198
2199     def _extract_m3u8_vod_duration(
2200             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2201
2202         m3u8_vod = self._download_webpage(
2203             m3u8_vod_url, video_id,
2204             note='Downloading m3u8 VOD manifest' if note is None else note,
2205             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2206             fatal=False, data=data, headers=headers, query=query)
2207
2208         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2209
2210     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2211         if '#EXT-X-ENDLIST' not in m3u8_vod:
2212             return None
2213
2214         return int(sum(
2215             float(line[len('#EXTINF:'):].split(',')[0])
2216             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2217
2218     def _extract_mpd_vod_duration(
2219             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2220
2221         mpd_doc = self._download_xml(
2222             mpd_url, video_id,
2223             note='Downloading MPD VOD manifest' if note is None else note,
2224             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2225             fatal=False, data=data, headers=headers, query=query) or {}
2226         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2227
2228     @staticmethod
2229     def _xpath_ns(path, namespace=None):
2230         if not namespace:
2231             return path
2232         out = []
2233         for c in path.split('/'):
2234             if not c or c == '.':
2235                 out.append(c)
2236             else:
2237                 out.append('{%s}%s' % (namespace, c))
2238         return '/'.join(out)
2239
2240     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2241         if self.get_param('ignore_no_formats_error'):
2242             fatal = False
2243
2244         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2245         if res is False:
2246             assert not fatal
2247             return [], {}
2248
2249         smil, urlh = res
2250         smil_url = urlh.geturl()
2251
2252         namespace = self._parse_smil_namespace(smil)
2253
2254         fmts = self._parse_smil_formats(
2255             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2256         subs = self._parse_smil_subtitles(
2257             smil, namespace=namespace)
2258
2259         return fmts, subs
2260
2261     def _extract_smil_formats(self, *args, **kwargs):
2262         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2263         if subs:
2264             self._report_ignoring_subs('SMIL')
2265         return fmts
2266
2267     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2268         res = self._download_smil(smil_url, video_id, fatal=fatal)
2269         if res is False:
2270             return {}
2271
2272         smil, urlh = res
2273         smil_url = urlh.geturl()
2274
2275         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2276
2277     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2278         return self._download_xml_handle(
2279             smil_url, video_id, 'Downloading SMIL file',
2280             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2281
2282     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2283         namespace = self._parse_smil_namespace(smil)
2284
2285         formats = self._parse_smil_formats(
2286             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2287         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2288
2289         video_id = os.path.splitext(url_basename(smil_url))[0]
2290         title = None
2291         description = None
2292         upload_date = None
2293         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2294             name = meta.attrib.get('name')
2295             content = meta.attrib.get('content')
2296             if not name or not content:
2297                 continue
2298             if not title and name == 'title':
2299                 title = content
2300             elif not description and name in ('description', 'abstract'):
2301                 description = content
2302             elif not upload_date and name == 'date':
2303                 upload_date = unified_strdate(content)
2304
2305         thumbnails = [{
2306             'id': image.get('type'),
2307             'url': image.get('src'),
2308             'width': int_or_none(image.get('width')),
2309             'height': int_or_none(image.get('height')),
2310         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2311
2312         return {
2313             'id': video_id,
2314             'title': title or video_id,
2315             'description': description,
2316             'upload_date': upload_date,
2317             'thumbnails': thumbnails,
2318             'formats': formats,
2319             'subtitles': subtitles,
2320         }
2321
2322     def _parse_smil_namespace(self, smil):
2323         return self._search_regex(
2324             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2325
2326     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2327         base = smil_url
2328         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2329             b = meta.get('base') or meta.get('httpBase')
2330             if b:
2331                 base = b
2332                 break
2333
2334         formats = []
2335         rtmp_count = 0
2336         http_count = 0
2337         m3u8_count = 0
2338         imgs_count = 0
2339
2340         srcs = set()
2341         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2342         for medium in media:
2343             src = medium.get('src')
2344             if not src or src in srcs:
2345                 continue
2346             srcs.add(src)
2347
2348             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2349             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2350             width = int_or_none(medium.get('width'))
2351             height = int_or_none(medium.get('height'))
2352             proto = medium.get('proto')
2353             ext = medium.get('ext')
2354             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2355                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2356             streamer = medium.get('streamer') or base
2357
2358             if proto == 'rtmp' or streamer.startswith('rtmp'):
2359                 rtmp_count += 1
2360                 formats.append({
2361                     'url': streamer,
2362                     'play_path': src,
2363                     'ext': 'flv',
2364                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2365                     'tbr': bitrate,
2366                     'filesize': filesize,
2367                     'width': width,
2368                     'height': height,
2369                 })
2370                 if transform_rtmp_url:
2371                     streamer, src = transform_rtmp_url(streamer, src)
2372                     formats[-1].update({
2373                         'url': streamer,
2374                         'play_path': src,
2375                     })
2376                 continue
2377
2378             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2379             src_url = src_url.strip()
2380
2381             if proto == 'm3u8' or src_ext == 'm3u8':
2382                 m3u8_formats = self._extract_m3u8_formats(
2383                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2384                 if len(m3u8_formats) == 1:
2385                     m3u8_count += 1
2386                     m3u8_formats[0].update({
2387                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2388                         'tbr': bitrate,
2389                         'width': width,
2390                         'height': height,
2391                     })
2392                 formats.extend(m3u8_formats)
2393             elif src_ext == 'f4m':
2394                 f4m_url = src_url
2395                 if not f4m_params:
2396                     f4m_params = {
2397                         'hdcore': '3.2.0',
2398                         'plugin': 'flowplayer-3.2.0.1',
2399                     }
2400                 f4m_url += '&' if '?' in f4m_url else '?'
2401                 f4m_url += urllib.parse.urlencode(f4m_params)
2402                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2403             elif src_ext == 'mpd':
2404                 formats.extend(self._extract_mpd_formats(
2405                     src_url, video_id, mpd_id='dash', fatal=False))
2406             elif re.search(r'\.ism/[Mm]anifest', src_url):
2407                 formats.extend(self._extract_ism_formats(
2408                     src_url, video_id, ism_id='mss', fatal=False))
2409             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2410                 http_count += 1
2411                 formats.append({
2412                     'url': src_url,
2413                     'ext': ext or src_ext or 'flv',
2414                     'format_id': 'http-%d' % (bitrate or http_count),
2415                     'tbr': bitrate,
2416                     'filesize': filesize,
2417                     'width': width,
2418                     'height': height,
2419                 })
2420
2421         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2422             src = medium.get('src')
2423             if not src or src in srcs:
2424                 continue
2425             srcs.add(src)
2426
2427             imgs_count += 1
2428             formats.append({
2429                 'format_id': 'imagestream-%d' % (imgs_count),
2430                 'url': src,
2431                 'ext': mimetype2ext(medium.get('type')),
2432                 'acodec': 'none',
2433                 'vcodec': 'none',
2434                 'width': int_or_none(medium.get('width')),
2435                 'height': int_or_none(medium.get('height')),
2436                 'format_note': 'SMIL storyboards',
2437             })
2438
2439         return formats
2440
2441     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2442         urls = []
2443         subtitles = {}
2444         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2445             src = textstream.get('src')
2446             if not src or src in urls:
2447                 continue
2448             urls.append(src)
2449             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2450             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2451             subtitles.setdefault(lang, []).append({
2452                 'url': src,
2453                 'ext': ext,
2454             })
2455         return subtitles
2456
2457     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2458         res = self._download_xml_handle(
2459             xspf_url, playlist_id, 'Downloading xpsf playlist',
2460             'Unable to download xspf manifest', fatal=fatal)
2461         if res is False:
2462             return []
2463
2464         xspf, urlh = res
2465         xspf_url = urlh.geturl()
2466
2467         return self._parse_xspf(
2468             xspf, playlist_id, xspf_url=xspf_url,
2469             xspf_base_url=base_url(xspf_url))
2470
2471     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2472         NS_MAP = {
2473             'xspf': 'http://xspf.org/ns/0/',
2474             's1': 'http://static.streamone.nl/player/ns/0',
2475         }
2476
2477         entries = []
2478         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2479             title = xpath_text(
2480                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2481             description = xpath_text(
2482                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2483             thumbnail = xpath_text(
2484                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2485             duration = float_or_none(
2486                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2487
2488             formats = []
2489             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2490                 format_url = urljoin(xspf_base_url, location.text)
2491                 if not format_url:
2492                     continue
2493                 formats.append({
2494                     'url': format_url,
2495                     'manifest_url': xspf_url,
2496                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2497                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2498                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2499                 })
2500
2501             entries.append({
2502                 'id': playlist_id,
2503                 'title': title,
2504                 'description': description,
2505                 'thumbnail': thumbnail,
2506                 'duration': duration,
2507                 'formats': formats,
2508             })
2509         return entries
2510
2511     def _extract_mpd_formats(self, *args, **kwargs):
2512         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2513         if subs:
2514             self._report_ignoring_subs('DASH')
2515         return fmts
2516
2517     def _extract_mpd_formats_and_subtitles(
2518             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2519             fatal=True, data=None, headers={}, query={}):
2520
2521         if self.get_param('ignore_no_formats_error'):
2522             fatal = False
2523
2524         res = self._download_xml_handle(
2525             mpd_url, video_id,
2526             note='Downloading MPD manifest' if note is None else note,
2527             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2528             fatal=fatal, data=data, headers=headers, query=query)
2529         if res is False:
2530             return [], {}
2531         mpd_doc, urlh = res
2532         if mpd_doc is None:
2533             return [], {}
2534
2535         # We could have been redirected to a new url when we retrieved our mpd file.
2536         mpd_url = urlh.geturl()
2537         mpd_base_url = base_url(mpd_url)
2538
2539         return self._parse_mpd_formats_and_subtitles(
2540             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2541
2542     def _parse_mpd_formats(self, *args, **kwargs):
2543         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2544         if subs:
2545             self._report_ignoring_subs('DASH')
2546         return fmts
2547
2548     def _parse_mpd_formats_and_subtitles(
2549             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2550         """
2551         Parse formats from MPD manifest.
2552         References:
2553          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2554             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2555          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2556         """
2557         if not self.get_param('dynamic_mpd', True):
2558             if mpd_doc.get('type') == 'dynamic':
2559                 return [], {}
2560
2561         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2562
2563         def _add_ns(path):
2564             return self._xpath_ns(path, namespace)
2565
2566         def is_drm_protected(element):
2567             return element.find(_add_ns('ContentProtection')) is not None
2568
2569         def extract_multisegment_info(element, ms_parent_info):
2570             ms_info = ms_parent_info.copy()
2571
2572             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2573             # common attributes and elements.  We will only extract relevant
2574             # for us.
2575             def extract_common(source):
2576                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2577                 if segment_timeline is not None:
2578                     s_e = segment_timeline.findall(_add_ns('S'))
2579                     if s_e:
2580                         ms_info['total_number'] = 0
2581                         ms_info['s'] = []
2582                         for s in s_e:
2583                             r = int(s.get('r', 0))
2584                             ms_info['total_number'] += 1 + r
2585                             ms_info['s'].append({
2586                                 't': int(s.get('t', 0)),
2587                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2588                                 'd': int(s.attrib['d']),
2589                                 'r': r,
2590                             })
2591                 start_number = source.get('startNumber')
2592                 if start_number:
2593                     ms_info['start_number'] = int(start_number)
2594                 timescale = source.get('timescale')
2595                 if timescale:
2596                     ms_info['timescale'] = int(timescale)
2597                 segment_duration = source.get('duration')
2598                 if segment_duration:
2599                     ms_info['segment_duration'] = float(segment_duration)
2600
2601             def extract_Initialization(source):
2602                 initialization = source.find(_add_ns('Initialization'))
2603                 if initialization is not None:
2604                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2605
2606             segment_list = element.find(_add_ns('SegmentList'))
2607             if segment_list is not None:
2608                 extract_common(segment_list)
2609                 extract_Initialization(segment_list)
2610                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2611                 if segment_urls_e:
2612                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2613             else:
2614                 segment_template = element.find(_add_ns('SegmentTemplate'))
2615                 if segment_template is not None:
2616                     extract_common(segment_template)
2617                     media = segment_template.get('media')
2618                     if media:
2619                         ms_info['media'] = media
2620                     initialization = segment_template.get('initialization')
2621                     if initialization:
2622                         ms_info['initialization'] = initialization
2623                     else:
2624                         extract_Initialization(segment_template)
2625             return ms_info
2626
2627         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2628         formats, subtitles = [], {}
2629         stream_numbers = collections.defaultdict(int)
2630         for period in mpd_doc.findall(_add_ns('Period')):
2631             period_duration = parse_duration(period.get('duration')) or mpd_duration
2632             period_ms_info = extract_multisegment_info(period, {
2633                 'start_number': 1,
2634                 'timescale': 1,
2635             })
2636             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2637                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2638                 for representation in adaptation_set.findall(_add_ns('Representation')):
2639                     representation_attrib = adaptation_set.attrib.copy()
2640                     representation_attrib.update(representation.attrib)
2641                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2642                     mime_type = representation_attrib['mimeType']
2643                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2644
2645                     codec_str = representation_attrib.get('codecs', '')
2646                     # Some kind of binary subtitle found in some youtube livestreams
2647                     if mime_type == 'application/x-rawcc':
2648                         codecs = {'scodec': codec_str}
2649                     else:
2650                         codecs = parse_codecs(codec_str)
2651                     if content_type not in ('video', 'audio', 'text'):
2652                         if mime_type == 'image/jpeg':
2653                             content_type = mime_type
2654                         elif codecs.get('vcodec', 'none') != 'none':
2655                             content_type = 'video'
2656                         elif codecs.get('acodec', 'none') != 'none':
2657                             content_type = 'audio'
2658                         elif codecs.get('scodec', 'none') != 'none':
2659                             content_type = 'text'
2660                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2661                             content_type = 'text'
2662                         else:
2663                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2664                             continue
2665
2666                     base_url = ''
2667                     for element in (representation, adaptation_set, period, mpd_doc):
2668                         base_url_e = element.find(_add_ns('BaseURL'))
2669                         if try_call(lambda: base_url_e.text) is not None:
2670                             base_url = base_url_e.text + base_url
2671                             if re.match(r'^https?://', base_url):
2672                                 break
2673                     if mpd_base_url and base_url.startswith('/'):
2674                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2675                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2676                         if not mpd_base_url.endswith('/'):
2677                             mpd_base_url += '/'
2678                         base_url = mpd_base_url + base_url
2679                     representation_id = representation_attrib.get('id')
2680                     lang = representation_attrib.get('lang')
2681                     url_el = representation.find(_add_ns('BaseURL'))
2682                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2683                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2684                     if representation_id is not None:
2685                         format_id = representation_id
2686                     else:
2687                         format_id = content_type
2688                     if mpd_id:
2689                         format_id = mpd_id + '-' + format_id
2690                     if content_type in ('video', 'audio'):
2691                         f = {
2692                             'format_id': format_id,
2693                             'manifest_url': mpd_url,
2694                             'ext': mimetype2ext(mime_type),
2695                             'width': int_or_none(representation_attrib.get('width')),
2696                             'height': int_or_none(representation_attrib.get('height')),
2697                             'tbr': float_or_none(bandwidth, 1000),
2698                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2699                             'fps': int_or_none(representation_attrib.get('frameRate')),
2700                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2701                             'format_note': 'DASH %s' % content_type,
2702                             'filesize': filesize,
2703                             'container': mimetype2ext(mime_type) + '_dash',
2704                             **codecs
2705                         }
2706                     elif content_type == 'text':
2707                         f = {
2708                             'ext': mimetype2ext(mime_type),
2709                             'manifest_url': mpd_url,
2710                             'filesize': filesize,
2711                         }
2712                     elif content_type == 'image/jpeg':
2713                         # See test case in VikiIE
2714                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2715                         f = {
2716                             'format_id': format_id,
2717                             'ext': 'mhtml',
2718                             'manifest_url': mpd_url,
2719                             'format_note': 'DASH storyboards (jpeg)',
2720                             'acodec': 'none',
2721                             'vcodec': 'none',
2722                         }
2723                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2724                         f['has_drm'] = True
2725                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2726
2727                     def prepare_template(template_name, identifiers):
2728                         tmpl = representation_ms_info[template_name]
2729                         if representation_id is not None:
2730                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2731                         # First of, % characters outside $...$ templates
2732                         # must be escaped by doubling for proper processing
2733                         # by % operator string formatting used further (see
2734                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2735                         t = ''
2736                         in_template = False
2737                         for c in tmpl:
2738                             t += c
2739                             if c == '$':
2740                                 in_template = not in_template
2741                             elif c == '%' and not in_template:
2742                                 t += c
2743                         # Next, $...$ templates are translated to their
2744                         # %(...) counterparts to be used with % operator
2745                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2746                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2747                         t.replace('$$', '$')
2748                         return t
2749
2750                     # @initialization is a regular template like @media one
2751                     # so it should be handled just the same way (see
2752                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2753                     if 'initialization' in representation_ms_info:
2754                         initialization_template = prepare_template(
2755                             'initialization',
2756                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2757                             # $Time$ shall not be included for @initialization thus
2758                             # only $Bandwidth$ remains
2759                             ('Bandwidth', ))
2760                         representation_ms_info['initialization_url'] = initialization_template % {
2761                             'Bandwidth': bandwidth,
2762                         }
2763
2764                     def location_key(location):
2765                         return 'url' if re.match(r'^https?://', location) else 'path'
2766
2767                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2768
2769                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2770                         media_location_key = location_key(media_template)
2771
2772                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2773                         # can't be used at the same time
2774                         if '%(Number' in media_template and 's' not in representation_ms_info:
2775                             segment_duration = None
2776                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2777                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2778                                 representation_ms_info['total_number'] = int(math.ceil(
2779                                     float_or_none(period_duration, segment_duration, default=0)))
2780                             representation_ms_info['fragments'] = [{
2781                                 media_location_key: media_template % {
2782                                     'Number': segment_number,
2783                                     'Bandwidth': bandwidth,
2784                                 },
2785                                 'duration': segment_duration,
2786                             } for segment_number in range(
2787                                 representation_ms_info['start_number'],
2788                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2789                         else:
2790                             # $Number*$ or $Time$ in media template with S list available
2791                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2792                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2793                             representation_ms_info['fragments'] = []
2794                             segment_time = 0
2795                             segment_d = None
2796                             segment_number = representation_ms_info['start_number']
2797
2798                             def add_segment_url():
2799                                 segment_url = media_template % {
2800                                     'Time': segment_time,
2801                                     'Bandwidth': bandwidth,
2802                                     'Number': segment_number,
2803                                 }
2804                                 representation_ms_info['fragments'].append({
2805                                     media_location_key: segment_url,
2806                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2807                                 })
2808
2809                             for num, s in enumerate(representation_ms_info['s']):
2810                                 segment_time = s.get('t') or segment_time
2811                                 segment_d = s['d']
2812                                 add_segment_url()
2813                                 segment_number += 1
2814                                 for r in range(s.get('r', 0)):
2815                                     segment_time += segment_d
2816                                     add_segment_url()
2817                                     segment_number += 1
2818                                 segment_time += segment_d
2819                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2820                         # No media template,
2821                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2822                         # or any YouTube dashsegments video
2823                         fragments = []
2824                         segment_index = 0
2825                         timescale = representation_ms_info['timescale']
2826                         for s in representation_ms_info['s']:
2827                             duration = float_or_none(s['d'], timescale)
2828                             for r in range(s.get('r', 0) + 1):
2829                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2830                                 fragments.append({
2831                                     location_key(segment_uri): segment_uri,
2832                                     'duration': duration,
2833                                 })
2834                                 segment_index += 1
2835                         representation_ms_info['fragments'] = fragments
2836                     elif 'segment_urls' in representation_ms_info:
2837                         # Segment URLs with no SegmentTimeline
2838                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2839                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2840                         fragments = []
2841                         segment_duration = float_or_none(
2842                             representation_ms_info['segment_duration'],
2843                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2844                         for segment_url in representation_ms_info['segment_urls']:
2845                             fragment = {
2846                                 location_key(segment_url): segment_url,
2847                             }
2848                             if segment_duration:
2849                                 fragment['duration'] = segment_duration
2850                             fragments.append(fragment)
2851                         representation_ms_info['fragments'] = fragments
2852                     # If there is a fragments key available then we correctly recognized fragmented media.
2853                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2854                     # assumption is not necessarily correct since we may simply have no support for
2855                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2856                     if 'fragments' in representation_ms_info:
2857                         f.update({
2858                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2859                             'url': mpd_url or base_url,
2860                             'fragment_base_url': base_url,
2861                             'fragments': [],
2862                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2863                         })
2864                         if 'initialization_url' in representation_ms_info:
2865                             initialization_url = representation_ms_info['initialization_url']
2866                             if not f.get('url'):
2867                                 f['url'] = initialization_url
2868                             f['fragments'].append({location_key(initialization_url): initialization_url})
2869                         f['fragments'].extend(representation_ms_info['fragments'])
2870                         if not period_duration:
2871                             period_duration = try_get(
2872                                 representation_ms_info,
2873                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2874                     else:
2875                         # Assuming direct URL to unfragmented media.
2876                         f['url'] = base_url
2877                     if content_type in ('video', 'audio', 'image/jpeg'):
2878                         f['manifest_stream_number'] = stream_numbers[f['url']]
2879                         stream_numbers[f['url']] += 1
2880                         formats.append(f)
2881                     elif content_type == 'text':
2882                         subtitles.setdefault(lang or 'und', []).append(f)
2883
2884         return formats, subtitles
2885
2886     def _extract_ism_formats(self, *args, **kwargs):
2887         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2888         if subs:
2889             self._report_ignoring_subs('ISM')
2890         return fmts
2891
2892     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2893         if self.get_param('ignore_no_formats_error'):
2894             fatal = False
2895
2896         res = self._download_xml_handle(
2897             ism_url, video_id,
2898             note='Downloading ISM manifest' if note is None else note,
2899             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2900             fatal=fatal, data=data, headers=headers, query=query)
2901         if res is False:
2902             return [], {}
2903         ism_doc, urlh = res
2904         if ism_doc is None:
2905             return [], {}
2906
2907         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2908
2909     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2910         """
2911         Parse formats from ISM manifest.
2912         References:
2913          1. [MS-SSTR]: Smooth Streaming Protocol,
2914             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2915         """
2916         if ism_doc.get('IsLive') == 'TRUE':
2917             return [], {}
2918
2919         duration = int(ism_doc.attrib['Duration'])
2920         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2921
2922         formats = []
2923         subtitles = {}
2924         for stream in ism_doc.findall('StreamIndex'):
2925             stream_type = stream.get('Type')
2926             if stream_type not in ('video', 'audio', 'text'):
2927                 continue
2928             url_pattern = stream.attrib['Url']
2929             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2930             stream_name = stream.get('Name')
2931             stream_language = stream.get('Language', 'und')
2932             for track in stream.findall('QualityLevel'):
2933                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2934                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2935                 # TODO: add support for WVC1 and WMAP
2936                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2937                     self.report_warning('%s is not a supported codec' % fourcc)
2938                     continue
2939                 tbr = int(track.attrib['Bitrate']) // 1000
2940                 # [1] does not mention Width and Height attributes. However,
2941                 # they're often present while MaxWidth and MaxHeight are
2942                 # missing, so should be used as fallbacks
2943                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2944                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2945                 sampling_rate = int_or_none(track.get('SamplingRate'))
2946
2947                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2948                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2949
2950                 fragments = []
2951                 fragment_ctx = {
2952                     'time': 0,
2953                 }
2954                 stream_fragments = stream.findall('c')
2955                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2956                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2957                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2958                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2959                     if not fragment_ctx['duration']:
2960                         try:
2961                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2962                         except IndexError:
2963                             next_fragment_time = duration
2964                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2965                     for _ in range(fragment_repeat):
2966                         fragments.append({
2967                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2968                             'duration': fragment_ctx['duration'] / stream_timescale,
2969                         })
2970                         fragment_ctx['time'] += fragment_ctx['duration']
2971
2972                 if stream_type == 'text':
2973                     subtitles.setdefault(stream_language, []).append({
2974                         'ext': 'ismt',
2975                         'protocol': 'ism',
2976                         'url': ism_url,
2977                         'manifest_url': ism_url,
2978                         'fragments': fragments,
2979                         '_download_params': {
2980                             'stream_type': stream_type,
2981                             'duration': duration,
2982                             'timescale': stream_timescale,
2983                             'fourcc': fourcc,
2984                             'language': stream_language,
2985                             'codec_private_data': track.get('CodecPrivateData'),
2986                         }
2987                     })
2988                 elif stream_type in ('video', 'audio'):
2989                     formats.append({
2990                         'format_id': join_nonempty(ism_id, stream_name, tbr),
2991                         'url': ism_url,
2992                         'manifest_url': ism_url,
2993                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2994                         'width': width,
2995                         'height': height,
2996                         'tbr': tbr,
2997                         'asr': sampling_rate,
2998                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2999                         'acodec': 'none' if stream_type == 'video' else fourcc,
3000                         'protocol': 'ism',
3001                         'fragments': fragments,
3002                         'has_drm': ism_doc.find('Protection') is not None,
3003                         'language': stream_language,
3004                         'audio_channels': int_or_none(track.get('Channels')),
3005                         '_download_params': {
3006                             'stream_type': stream_type,
3007                             'duration': duration,
3008                             'timescale': stream_timescale,
3009                             'width': width or 0,
3010                             'height': height or 0,
3011                             'fourcc': fourcc,
3012                             'language': stream_language,
3013                             'codec_private_data': track.get('CodecPrivateData'),
3014                             'sampling_rate': sampling_rate,
3015                             'channels': int_or_none(track.get('Channels', 2)),
3016                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3017                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3018                         },
3019                     })
3020         return formats, subtitles
3021
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from the HTML5-style media tags of a webpage.

        Every <video>/<audio> tag (including the amp-* and dl8-* variants)
        found in the webpage yields one entry built from the tag's own source
        attribute, its nested <source> tags and its nested <track> tags.

        Arguments:
        base_url            -- URL that relative media, poster and track URLs
                               are resolved against; also set as the Referer
                               header of every extracted format
        webpage             -- HTML text to scan
        video_id            -- id passed on to the manifest extractors
        m3u8_id             -- passed on to _extract_m3u8_formats
        m3u8_entry_protocol -- passed on to _extract_m3u8_formats as
                               entry_protocol
        mpd_id              -- passed on to _extract_mpd_formats
        preference, quality -- passed on to _extract_m3u8_formats

        Returns a list of dicts with the keys 'formats', 'subtitles' and
        'thumbnail'. Tags yielding neither formats nor subtitles are left out.
        """
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn the value of a "type" attribute (MIME type with an optional
            # codecs parameter) into a partial format dict
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats). Manifest URLs (m3u8/mpd) are
            # expanded by the matching extractor; anything else becomes a
            # single direct format and is reported as a plain URL
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no inner content, hence the ''
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # Source given directly on the media tag itself
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fill in missing dimensions from the label/title text
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the first label that parses as a bitrate;
                        # tbr stays None when none of them does
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The url/vcodec/ext of the plain format override
                        # whatever was derived from the type attribute
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated like subtitles
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3147
3148     def _extract_akamai_formats(self, *args, **kwargs):
3149         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3150         if subs:
3151             self._report_ignoring_subs('akamai')
3152         return fmts
3153
3154     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3155         signed = 'hdnea=' in manifest_url
3156         if not signed:
3157             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3158             manifest_url = re.sub(
3159                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3160                 '', manifest_url).strip('?')
3161
3162         formats = []
3163         subtitles = {}
3164
3165         hdcore_sign = 'hdcore=3.7.0'
3166         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3167         hds_host = hosts.get('hds')
3168         if hds_host:
3169             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3170         if 'hdcore=' not in f4m_url:
3171             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3172         f4m_formats = self._extract_f4m_formats(
3173             f4m_url, video_id, f4m_id='hds', fatal=False)
3174         for entry in f4m_formats:
3175             entry.update({'extra_param_to_segment_url': hdcore_sign})
3176         formats.extend(f4m_formats)
3177
3178         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3179         hls_host = hosts.get('hls')
3180         if hls_host:
3181             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3182         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3183             m3u8_url, video_id, 'mp4', 'm3u8_native',
3184             m3u8_id='hls', fatal=False)
3185         formats.extend(m3u8_formats)
3186         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3187
3188         http_host = hosts.get('http')
3189         if http_host and m3u8_formats and not signed:
3190             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3191             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3192             qualities_length = len(qualities)
3193             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3194                 i = 0
3195                 for f in m3u8_formats:
3196                     if f['vcodec'] != 'none':
3197                         for protocol in ('http', 'https'):
3198                             http_f = f.copy()
3199                             del http_f['manifest_url']
3200                             http_url = re.sub(
3201                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3202                             http_f.update({
3203                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3204                                 'url': http_url,
3205                                 'protocol': protocol,
3206                             })
3207                             formats.append(http_f)
3208                         i += 1
3209
3210         return formats, subtitles
3211
3212     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3213         query = urllib.parse.urlparse(url).query
3214         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3215         mobj = re.search(
3216             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3217         url_base = mobj.group('url')
3218         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3219         formats = []
3220
3221         def manifest_url(manifest):
3222             m_url = f'{http_base_url}/{manifest}'
3223             if query:
3224                 m_url += '?%s' % query
3225             return m_url
3226
3227         if 'm3u8' not in skip_protocols:
3228             formats.extend(self._extract_m3u8_formats(
3229                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3230                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3231         if 'f4m' not in skip_protocols:
3232             formats.extend(self._extract_f4m_formats(
3233                 manifest_url('manifest.f4m'),
3234                 video_id, f4m_id='hds', fatal=False))
3235         if 'dash' not in skip_protocols:
3236             formats.extend(self._extract_mpd_formats(
3237                 manifest_url('manifest.mpd'),
3238                 video_id, mpd_id='dash', fatal=False))
3239         if re.search(r'(?:/smil:|\.smil)', url_base):
3240             if 'smil' not in skip_protocols:
3241                 rtmp_formats = self._extract_smil_formats(
3242                     manifest_url('jwplayer.smil'),
3243                     video_id, fatal=False)
3244                 for rtmp_format in rtmp_formats:
3245                     rtsp_format = rtmp_format.copy()
3246                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3247                     del rtsp_format['play_path']
3248                     del rtsp_format['ext']
3249                     rtsp_format.update({
3250                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3251                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3252                         'protocol': 'rtsp',
3253                     })
3254                     formats.extend([rtmp_format, rtsp_format])
3255         else:
3256             for protocol in ('rtmp', 'rtsp'):
3257                 if protocol not in skip_protocols:
3258                     formats.append({
3259                         'url': f'{protocol}:{url_base}',
3260                         'format_id': protocol,
3261                         'protocol': protocol,
3262                     })
3263         return formats
3264
3265     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3266         mobj = re.search(
3267             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3268             webpage)
3269         if mobj:
3270             try:
3271                 jwplayer_data = self._parse_json(mobj.group('options'),
3272                                                  video_id=video_id,
3273                                                  transform_source=transform_source)
3274             except ExtractorError:
3275                 pass
3276             else:
3277                 if isinstance(jwplayer_data, dict):
3278                     return jwplayer_data
3279
3280     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3281         jwplayer_data = self._find_jwplayer_data(
3282             webpage, video_id, transform_source=js_to_json)
3283         return self._parse_jwplayer_data(
3284             jwplayer_data, video_id, *args, **kwargs)
3285
3286     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3287                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3288         entries = []
3289         if not isinstance(jwplayer_data, dict):
3290             return entries
3291
3292         playlist_items = jwplayer_data.get('playlist')
3293         # JWPlayer backward compatibility: single playlist item/flattened playlists
3294         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3295         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3296         if not isinstance(playlist_items, list):
3297             playlist_items = (playlist_items or jwplayer_data, )
3298
3299         for video_data in playlist_items:
3300             if not isinstance(video_data, dict):
3301                 continue
3302             # JWPlayer backward compatibility: flattened sources
3303             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3304             if 'sources' not in video_data:
3305                 video_data['sources'] = [video_data]
3306
3307             this_video_id = video_id or video_data['mediaid']
3308
3309             formats = self._parse_jwplayer_formats(
3310                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3311                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3312
3313             subtitles = {}
3314             tracks = video_data.get('tracks')
3315             if tracks and isinstance(tracks, list):
3316                 for track in tracks:
3317                     if not isinstance(track, dict):
3318                         continue
3319                     track_kind = track.get('kind')
3320                     if not track_kind or not isinstance(track_kind, str):
3321                         continue
3322                     if track_kind.lower() not in ('captions', 'subtitles'):
3323                         continue
3324                     track_url = urljoin(base_url, track.get('file'))
3325                     if not track_url:
3326                         continue
3327                     subtitles.setdefault(track.get('label') or 'en', []).append({
3328                         'url': self._proto_relative_url(track_url)
3329                     })
3330
3331             entry = {
3332                 'id': this_video_id,
3333                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3334                 'description': clean_html(video_data.get('description')),
3335                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3336                 'timestamp': int_or_none(video_data.get('pubdate')),
3337                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3338                 'subtitles': subtitles,
3339                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3340                 'genre': clean_html(video_data.get('genre')),
3341                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3342                 'season_number': int_or_none(video_data.get('season')),
3343                 'episode_number': int_or_none(video_data.get('episode')),
3344                 'release_year': int_or_none(video_data.get('releasedate')),
3345                 'age_limit': int_or_none(video_data.get('age_restriction')),
3346             }
3347             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3348             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3349                 entry.update({
3350                     '_type': 'url_transparent',
3351                     'url': formats[0]['url'],
3352                 })
3353             else:
3354                 entry['formats'] = formats
3355             entries.append(entry)
3356         if len(entries) == 1:
3357             return entries[0]
3358         else:
3359             return self.playlist_result(entries)
3360
3361     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3362                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3363         urls = set()
3364         formats = []
3365         for source in jwplayer_sources_data:
3366             if not isinstance(source, dict):
3367                 continue
3368             source_url = urljoin(
3369                 base_url, self._proto_relative_url(source.get('file')))
3370             if not source_url or source_url in urls:
3371                 continue
3372             urls.add(source_url)
3373             source_type = source.get('type') or ''
3374             ext = mimetype2ext(source_type) or determine_ext(source_url)
3375             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3376                 formats.extend(self._extract_m3u8_formats(
3377                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3378                     m3u8_id=m3u8_id, fatal=False))
3379             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3380                 formats.extend(self._extract_mpd_formats(
3381                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3382             elif ext == 'smil':
3383                 formats.extend(self._extract_smil_formats(
3384                     source_url, video_id, fatal=False))
3385             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3386             elif source_type.startswith('audio') or ext in (
3387                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3388                 formats.append({
3389                     'url': source_url,
3390                     'vcodec': 'none',
3391                     'ext': ext,
3392                 })
3393             else:
3394                 format_id = str_or_none(source.get('label'))
3395                 height = int_or_none(source.get('height'))
3396                 if height is None and format_id:
3397                     # Often no height is provided but there is a label in
3398                     # format like "1080p", "720p SD", or 1080.
3399                     height = parse_resolution(format_id).get('height')
3400                 a_format = {
3401                     'url': source_url,
3402                     'width': int_or_none(source.get('width')),
3403                     'height': height,
3404                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3405                     'filesize': int_or_none(source.get('filesize')),
3406                     'ext': ext,
3407                     'format_id': format_id
3408                 }
3409                 if source_url.startswith('rtmp'):
3410                     a_format['ext'] = 'flv'
3411                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3412                     # of jwplayer.flash.swf
3413                     rtmp_url_parts = re.split(
3414                         r'((?:mp4|mp3|flv):)', source_url, 1)
3415                     if len(rtmp_url_parts) == 3:
3416                         rtmp_url, prefix, play_path = rtmp_url_parts
3417                         a_format.update({
3418                             'url': rtmp_url,
3419                             'play_path': prefix + play_path,
3420                         })
3421                     if rtmp_params:
3422                         a_format.update(rtmp_params)
3423                 formats.append(a_format)
3424         return formats
3425
3426     def _live_title(self, name):
3427         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3428         return name
3429
3430     def _int(self, v, name, fatal=False, **kwargs):
3431         res = int_or_none(v, **kwargs)
3432         if res is None:
3433             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3434             if fatal:
3435                 raise ExtractorError(msg)
3436             else:
3437                 self.report_warning(msg)
3438         return res
3439
3440     def _float(self, v, name, fatal=False, **kwargs):
3441         res = float_or_none(v, **kwargs)
3442         if res is None:
3443             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3444             if fatal:
3445                 raise ExtractorError(msg)
3446             else:
3447                 self.report_warning(msg)
3448         return res
3449
3450     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3451                     path='/', secure=False, discard=False, rest={}, **kwargs):
3452         cookie = http.cookiejar.Cookie(
3453             0, name, value, port, port is not None, domain, True,
3454             domain.startswith('.'), path, True, secure, expire_time,
3455             discard, None, None, rest)
3456         self.cookiejar.set_cookie(cookie)
3457
3458     def _get_cookies(self, url):
3459         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3460         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3461
3462     def _apply_first_set_cookie_header(self, url_handle, cookie):
3463         """
3464         Apply first Set-Cookie header instead of the last. Experimental.
3465
3466         Some sites (e.g. [1-3]) may serve two cookies under the same name
3467         in Set-Cookie header and expect the first (old) one to be set rather
3468         than second (new). However, as of RFC6265 the newer one cookie
3469         should be set into cookie store what actually happens.
3470         We will workaround this issue by resetting the cookie to
3471         the first one manually.
3472         1. https://new.vk.com/
3473         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3474         3. https://learning.oreilly.com/
3475         """
3476         for header, cookies in url_handle.headers.items():
3477             if header.lower() != 'set-cookie':
3478                 continue
3479             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3480             cookie_value = re.search(
3481                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3482             if cookie_value:
3483                 value, domain = cookie_value.groups()
3484                 self._set_cookie(domain, cookie, value)
3485                 break
3486
3487     @classmethod
3488     def get_testcases(cls, include_onlymatching=False):
3489         # Do not look in super classes
3490         t = vars(cls).get('_TEST')
3491         if t:
3492             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3493             tests = [t]
3494         else:
3495             tests = vars(cls).get('_TESTS', [])
3496         for t in tests:
3497             if not include_onlymatching and t.get('only_matching', False):
3498                 continue
3499             t['name'] = cls.ie_key()
3500             yield t
3501         if getattr(cls, '__wrapped__', None):
3502             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3503
3504     @classmethod
3505     def get_webpage_testcases(cls):
3506         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3507         for t in tests:
3508             t['name'] = cls.ie_key()
3509             yield t
3510         if getattr(cls, '__wrapped__', None):
3511             yield from cls.__wrapped__.get_webpage_testcases()
3512
3513     @classproperty(cache=True)
3514     def age_limit(cls):
3515         """Get age limit from the testcases"""
3516         return max(traverse_obj(
3517             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3518             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3519
3520     @classproperty(cache=True)
3521     def _RETURN_TYPE(cls):
3522         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3523         tests = tuple(cls.get_testcases(include_onlymatching=False))
3524         if not tests:
3525             return None
3526         elif not any(k.startswith('playlist') for test in tests for k in test):
3527             return 'video'
3528         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3529             return 'playlist'
3530         return 'any'
3531
3532     @classmethod
3533     def is_single_video(cls, url):
3534         """Returns whether the URL is of a single video, None if unknown"""
3535         if cls.suitable(url):
3536             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3537
3538     @classmethod
3539     def is_suitable(cls, age_limit):
3540         """Test whether the extractor is generally suitable for the given age limit"""
3541         return not age_restricted(cls.age_limit, age_limit)
3542
3543     @classmethod
3544     def description(cls, *, markdown=True, search_examples=None):
3545         """Description of the extractor"""
3546         desc = ''
3547         if cls._NETRC_MACHINE:
3548             if markdown:
3549                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3550             else:
3551                 desc += f' [{cls._NETRC_MACHINE}]'
3552         if cls.IE_DESC is False:
3553             desc += ' [HIDDEN]'
3554         elif cls.IE_DESC:
3555             desc += f' {cls.IE_DESC}'
3556         if cls.SEARCH_KEY:
3557             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3558             if search_examples:
3559                 _COUNTS = ('', '5', '10', 'all')
3560                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3561         if not cls.working():
3562             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3563
3564         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3565         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3566         return f'{name}:{desc}' if desc else name
3567
3568     def extract_subtitles(self, *args, **kwargs):
3569         if (self.get_param('writesubtitles', False)
3570                 or self.get_param('listsubtitles')):
3571             return self._get_subtitles(*args, **kwargs)
3572         return {}
3573
3574     def _get_subtitles(self, *args, **kwargs):
3575         raise NotImplementedError('This method must be implemented by subclasses')
3576
    class CommentsDisabled(Exception):
        """
        Raise in _get_comments if comments are disabled for the video.

        extract_comments catches it and reports both the comments and the
        comment count as None.
        """
3579
3580     def extract_comments(self, *args, **kwargs):
3581         if not self.get_param('getcomments'):
3582             return None
3583         generator = self._get_comments(*args, **kwargs)
3584
3585         def extractor():
3586             comments = []
3587             interrupted = True
3588             try:
3589                 while True:
3590                     comments.append(next(generator))
3591             except StopIteration:
3592                 interrupted = False
3593             except KeyboardInterrupt:
3594                 self.to_screen('Interrupted by user')
3595             except self.CommentsDisabled:
3596                 return {'comments': None, 'comment_count': None}
3597             except Exception as e:
3598                 if self.get_param('ignoreerrors') is not True:
3599                     raise
3600                 self._downloader.report_error(e)
3601             comment_count = len(comments)
3602             self.to_screen(f'Extracted {comment_count} comments')
3603             return {
3604                 'comments': comments,
3605                 'comment_count': None if interrupted else comment_count
3606             }
3607         return extractor
3608
3609     def _get_comments(self, *args, **kwargs):
3610         raise NotImplementedError('This method must be implemented by subclasses')
3611
3612     @staticmethod
3613     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3614         """ Merge subtitle items for one language. Items with duplicated URLs/data
3615         will be dropped. """
3616         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3617         ret = list(subtitle_list1)
3618         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3619         return ret
3620
3621     @classmethod
3622     def _merge_subtitles(cls, *dicts, target=None):
3623         """ Merge subtitle dictionaries, language by language. """
3624         if target is None:
3625             target = {}
3626         for d in dicts:
3627             for lang, subs in d.items():
3628                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3629         return target
3630
3631     def extract_automatic_captions(self, *args, **kwargs):
3632         if (self.get_param('writeautomaticsub', False)
3633                 or self.get_param('listsubtitles')):
3634             return self._get_automatic_captions(*args, **kwargs)
3635         return {}
3636
    def _get_automatic_captions(self, *args, **kwargs):
        """Produce the automatic captions; this base implementation always raises.

        Called by extract_automatic_captions() with the arguments it received.
        """
        raise NotImplementedError('This method must be implemented by subclasses')
3639
3640     @functools.cached_property
3641     def _cookies_passed(self):
3642         """Whether cookies have been passed to YoutubeDL"""
3643         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3644
3645     def mark_watched(self, *args, **kwargs):
3646         if not self.get_param('mark_watched', False):
3647             return
3648         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3649             self._mark_watched(*args, **kwargs)
3650
    def _mark_watched(self, *args, **kwargs):
        """Perform the actual mark-as-watched step; this base implementation always raises.

        Called by mark_watched() with the arguments it received.
        """
        raise NotImplementedError('This method must be implemented by subclasses')
3653
3654     def geo_verification_headers(self):
3655         headers = {}
3656         geo_verification_proxy = self.get_param('geo_verification_proxy')
3657         if geo_verification_proxy:
3658             headers['Ytdl-request-proxy'] = geo_verification_proxy
3659         return headers
3660
3661     @staticmethod
3662     def _generic_id(url):
3663         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3664
3665     def _generic_title(self, url='', webpage='', *, default=None):
3666         return (self._og_search_title(webpage, default=None)
3667                 or self._html_extract_title(webpage, default=None)
3668                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3669                 or default)
3670
3671     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3672         if not duration:
3673             return
3674         chapter_list = [{
3675             'start_time': start_function(chapter),
3676             'title': title_function(chapter),
3677         } for chapter in chapter_list or []]
3678         if strict:
3679             warn = self.report_warning
3680         else:
3681             warn = self.write_debug
3682             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3683
3684         chapters = [{'start_time': 0}]
3685         for idx, chapter in enumerate(chapter_list):
3686             if chapter['start_time'] is None:
3687                 warn(f'Incomplete chapter {idx}')
3688             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3689                 chapters.append(chapter)
3690             elif chapter not in chapters:
3691                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3692                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3693                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3694         return chapters[1:]
3695
3696     def _extract_chapters_from_description(self, description, duration):
3697         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3698         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3699         return self._extract_chapters_helper(
3700             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3701             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3702             duration=duration, strict=False) or self._extract_chapters_helper(
3703             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3704             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3705             duration=duration, strict=False)
3706
3707     @staticmethod
3708     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3709         all_known = all(map(
3710             lambda x: x is not None,
3711             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3712         return (
3713             'private' if is_private
3714             else 'premium_only' if needs_premium
3715             else 'subscriber_only' if needs_subscription
3716             else 'needs_auth' if needs_auth
3717             else 'unlisted' if is_unlisted
3718             else 'public' if all_known
3719             else None)
3720
3721     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3722         '''
3723         @returns            A list of values for the extractor argument given by "key"
3724                             or "default" if no such key is present
3725         @param default      The default value to return when the key is not present (default: [])
3726         @param casesense    When false, the values are converted to lower case
3727         '''
3728         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3729         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3730         if val is None:
3731             return [] if default is NO_DEFAULT else default
3732         return list(val) if casesense else [x.lower() for x in val]
3733
3734     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3735         if not playlist_id or not video_id:
3736             return not video_id
3737
3738         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3739         if no_playlist is not None:
3740             return not no_playlist
3741
3742         video_id = '' if video_id is True else f' {video_id}'
3743         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3744         if self.get_param('noplaylist'):
3745             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3746             return False
3747         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3748         return True
3749
3750     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3751         RetryManager.report_retry(
3752             err, _count or int(fatal), _retries,
3753             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3754             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3755
3756     def RetryManager(self, **kwargs):
3757         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3758
3759     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3760         display_id = traverse_obj(info_dict, 'display_id', 'id')
3761         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3762         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3763             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3764
3765     @classmethod
3766     def extract_from_webpage(cls, ydl, url, webpage):
3767         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3768               else ydl.get_info_extractor(cls.ie_key()))
3769         for info in ie._extract_from_webpage(url, webpage) or []:
3770             # url = None since we do not want to set (webpage/original)_url
3771             ydl.add_default_extra_info(info, ie, None)
3772             yield info
3773
3774     @classmethod
3775     def _extract_from_webpage(cls, url, webpage):
3776         for embed_url in orderedSet(
3777                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3778             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3779
3780     @classmethod
3781     def _extract_embed_urls(cls, url, webpage):
3782         """@returns all the embed urls on the webpage"""
3783         if '_EMBED_URL_RE' not in cls.__dict__:
3784             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3785             for idx, regex in enumerate(cls._EMBED_REGEX):
3786                 assert regex.count('(?P<url>') == 1, \
3787                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3788             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3789
3790         for regex in cls._EMBED_URL_RE:
3791             for mobj in regex.finditer(webpage):
3792                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3793                 if cls._VALID_URL is False or cls.suitable(embed_url):
3794                     yield embed_url
3795
    class StopExtraction(Exception):
        # Plain Exception subclass with no behavior of its own; nothing in the
        # visible code raises or catches it.
        # NOTE(review): presumably a signal to abort extraction early - confirm against callers
        pass
3798
3799     @classmethod
3800     def _extract_url(cls, webpage):  # TODO: Remove
3801         """Only for compatibility with some older extractors"""
3802         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3803
    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """Subclass-creation hook that registers plugin overrides.

        @param plugin_name  When truthy, the new class is treated as a plugin that
                            overrides the class following it in its MRO
        Any other keyword arguments are forwarded to super().__init_subclass__
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class right after cls in the MRO is the one being overridden
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            # The plugin keeps the ie_key of the class it overrides
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the __wrapped__ chain down to the innermost class
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind that class's name in its defining module to the plugin class
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)
3817
3818
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|N):{query}, where N is a
    positive integer: an empty prefix requests 1 result, "all" requests
    _MAX_RESULTS and N requests N results (capped at _MAX_RESULTS).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        # Built per class, since it embeds the subclass's _SEARCH_KEY
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the search URL into prefix and query and fetch the requested number of results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError(f'invalid download number {requested} for query "{query}"')
        limit = self._MAX_RESULTS
        if requested > limit:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, limit, requested))
            requested = limit
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice needs None, not infinity, to mean "no upper bound"
        stop = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, stop), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        # Public alias of _SEARCH_KEY
        return cls._SEARCH_KEY
3862
3863
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose extraction always fails with UnsupportedError for the given URL."""

    # The pattern matches any URL
    _VALID_URL = '.*'
    # NOTE(review): presumably keeps this extractor out of the default set - confirm
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        raise UnsupportedError(url)
3871
3872
# Maps an overridden extractor class to the list of plugin classes overriding it;
# filled in by InfoExtractor.__init_subclass__ when a subclass passes plugin_name
_PLUGIN_OVERRIDES = collections.defaultdict(list)