yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.error
  21 import urllib.parse
  22 import urllib.request
  23 import xml.etree.ElementTree
  24
  25 from ..compat import functools  # isort: split
  26 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  27 from ..cookies import LenientSimpleCookie
  28 from ..downloader.f4m import get_base_url, remove_encrypted_media
  29 from ..utils import (
  30     IDENTITY,
  31     JSON_LD_RE,
  32     NO_DEFAULT,
  33     ExtractorError,
  34     FormatSorter,
  35     GeoRestrictedError,
  36     GeoUtils,
  37     HEADRequest,
  38     LenientJSONDecoder,
  39     Popen,
  40     RegexNotFoundError,
  41     RetryManager,
  42     UnsupportedError,
  43     age_restricted,
  44     base_url,
  45     bug_reports_message,
  46     classproperty,
  47     clean_html,
  48     deprecation_warning,
  49     determine_ext,
  50     dict_get,
  51     encode_data_uri,
  52     error_to_compat_str,
  53     extract_attributes,
  54     filter_dict,
  55     fix_xml_ampersands,
  56     float_or_none,
  57     format_field,
  58     int_or_none,
  59     join_nonempty,
  60     js_to_json,
  61     mimetype2ext,
  62     netrc_from_content,
  63     network_exceptions,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     sanitize_filename,
  72     sanitize_url,
  73     sanitized_Request,
  74     smuggle_url,
  75     str_or_none,
  76     str_to_int,
  77     strip_or_none,
  78     traverse_obj,
  79     truncate_string,
  80     try_call,
  81     try_get,
  82     unescapeHTML,
  83     unified_strdate,
  84     unified_timestamp,
  85     update_Request,
  86     update_url_query,
  87     url_basename,
  88     url_or_none,
  89     urlhandle_detect_ext,
  90     urljoin,
  91     variadic,
  92     xpath_element,
  93     xpath_text,
  94     xpath_with_ns,
  95 )
  96
  97
  98 class InfoExtractor:
  99     """Information Extractor class.
 100
 101     Information extractors are the classes that, given a URL, extract
 102     information about the video (or videos) the URL refers to. This
 103     information includes the real video URL, the video title, author and
 104     others. The information is stored in a dictionary which is then
 105     passed to the YoutubeDL. The YoutubeDL processes this
 106     information possibly downloading the video to the file system, among
 107     other possible outcomes.
 108
 109     The type field determines the type of the result.
 110     By far the most common value (and the default if _type is missing) is
 111     "video", which indicates a single video.
 112
 113     For a video, the dictionaries must include the following fields:
 114
 115     id:             Video identifier.
 116     title:          Video title, unescaped. Set to an empty string if video has
 117                     no title as opposed to "None" which signifies that the
 118                     extractor failed to obtain a title
 119
 120     Additionally, it must contain either a formats entry or a url one:
 121
 122     formats:        A list of dictionaries for each format available, ordered
 123                     from worst to best quality.
 124
 125                     Potential fields:
 126                     * url        The mandatory URL representing the media:
 127                                    for plain file media - HTTP URL of this file,
 128                                    for RTMP - RTMP URL,
 129                                    for HLS - URL of the M3U8 media playlist,
 130                                    for HDS - URL of the F4M manifest,
 131                                    for DASH
 132                                      - HTTP URL to plain file media (in case of
 133                                        unfragmented media)
 134                                      - URL of the MPD manifest or base URL
 135                                        representing the media if MPD manifest
 136                                        is parsed from a string (in case of
 137                                        fragmented media)
 138                                    for MSS - URL of the ISM manifest.
 139                     * request_data  Data to send in POST request to the URL
 140                     * manifest_url
 141                                  The URL of the manifest file in case of
 142                                  fragmented media:
 143                                    for HLS - URL of the M3U8 master playlist,
 144                                    for HDS - URL of the F4M manifest,
 145                                    for DASH - URL of the MPD manifest,
 146                                    for MSS - URL of the ISM manifest.
 147                     * manifest_stream_number  (For internal use only)
 148                                  The index of the stream in the manifest file
 149                     * ext        Will be calculated from URL if missing
 150                     * format     A human-readable description of the format
 151                                  ("mp4 container with h264/opus").
 152                                  Calculated from the format_id, width, height.
 153                                  and format_note fields if missing.
 154                     * format_id  A short description of the format
 155                                  ("mp4_h264_opus" or "19").
 156                                 Technically optional, but strongly recommended.
 157                     * format_note Additional info about the format
 158                                  ("3D" or "DASH video")
 159                     * width      Width of the video, if known
 160                     * height     Height of the video, if known
 161                     * aspect_ratio  Aspect ratio of the video, if known
 162                                  Automatically calculated from width and height
 163                     * resolution Textual description of width and height
 164                                  Automatically calculated from width and height
 165                     * dynamic_range The dynamic range of the video. One of:
 166                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 167                     * tbr        Average bitrate of audio and video in KBit/s
 168                     * abr        Average audio bitrate in KBit/s
 169                     * acodec     Name of the audio codec in use
 170                     * asr        Audio sampling rate in Hertz
 171                     * audio_channels  Number of audio channels
 172                     * vbr        Average video bitrate in KBit/s
 173                     * fps        Frame rate
 174                     * vcodec     Name of the video codec in use
 175                     * container  Name of the container format
 176                     * filesize   The number of bytes, if known in advance
 177                     * filesize_approx  An estimate for the number of bytes
 178                     * player_url SWF Player URL (used for rtmpdump).
 179                     * protocol   The protocol that will be used for the actual
 180                                  download, lower-case. One of "http", "https" or
 181                                  one of the protocols defined in downloader.PROTOCOL_MAP
 182                     * fragment_base_url
 183                                  Base URL for fragments. Each fragment's path
 184                                  value (if present) will be relative to
 185                                  this URL.
 186                     * fragments  A list of fragments of a fragmented media.
 187                                  Each fragment entry must contain either an url
 188                                  or a path. If an url is present it should be
 189                                  considered by a client. Otherwise both path and
 190                                  fragment_base_url must be present. Here is
 191                                  the list of all potential fields:
 192                                  * "url" - fragment's URL
 193                                  * "path" - fragment's path relative to
 194                                             fragment_base_url
 195                                  * "duration" (optional, int or float)
 196                                  * "filesize" (optional, int)
 197                     * is_from_start  Is a live format that can be downloaded
 198                                 from the start. Boolean
 199                     * preference Order number of this format. If this field is
 200                                  present and not None, the formats get sorted
 201                                  by this field, regardless of all other values.
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                                  < -1000 to hide the format (if there is
 205                                     another one which is strictly better)
 206                     * language   Language code, e.g. "de" or "en-US".
 207                     * language_preference  Is this in the language mentioned in
 208                                  the URL?
 209                                  10 if it's what the URL is about,
 210                                  -1 for default (don't know),
 211                                  -10 otherwise, other values reserved for now.
 212                     * quality    Order number of the video quality of this
 213                                  format, irrespective of the file format.
 214                                  -1 for default (order by other properties),
 215                                  -2 or smaller for less than default.
 216                     * source_preference  Order number for this video source
 217                                   (quality takes higher priority)
 218                                  -1 for default (order by other properties),
 219                                  -2 or smaller for less than default.
 220                     * http_headers  A dictionary of additional HTTP headers
 221                                  to add to the request.
 222                     * stretched_ratio  If given and not 1, indicates that the
 223                                  video's pixels are not square.
 224                                  width : height ratio as float.
 225                     * no_resume  The server does not support resuming the
 226                                  (HTTP or RTMP) download. Boolean.
 227                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 228                     * extra_param_to_segment_url  A query string to append to each
 229                                  fragment's URL, or to update each existing query string
 230                                  with. Only applied by the native HLS/DASH downloaders.
 231                     * hls_aes    A dictionary of HLS AES-128 decryption information
 232                                  used by the native HLS downloader to override the
 233                                  values in the media playlist when an '#EXT-X-KEY' tag
 234                                  is present in the playlist:
 235                                  * uri  The URI from which the key will be downloaded
 236                                  * key  The key (as hex) used to decrypt fragments.
 237                                         If `key` is given, any key URI will be ignored
 238                                  * iv   The IV (as hex) used to decrypt fragments
 239                     * downloader_options  A dictionary of downloader options
 240                                  (For internal use only)
 241                                  * http_chunk_size Chunk size for HTTP downloads
 242                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 243                     RTMP formats can also have the additional fields: page_url,
 244                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 245                     rtmp_protocol, rtmp_real_time
 246
 247     url:            Final video URL.
 248     ext:            Video filename extension.
 249     format:         The video format, defaults to ext (used for --get-format)
 250     player_url:     SWF Player URL (used for rtmpdump).
 251
 252     The following fields are optional:
 253
 254     direct:         True if a direct video file was given (must only be set by GenericIE)
 255     alt_title:      A secondary title of the video.
 256     display_id      An alternative identifier for the video, not necessarily
 257                     unique, but available before title. Typically, id is
 258                     something like "4234987", title "Dancing naked mole rats",
 259                     and display_id "dancing-naked-mole-rats"
 260     thumbnails:     A list of dictionaries, with the following entries:
 261                         * "id" (optional, string) - Thumbnail format ID
 262                         * "url"
 263                         * "preference" (optional, int) - quality of the image
 264                         * "width" (optional, int)
 265                         * "height" (optional, int)
 266                         * "resolution" (optional, string "{width}x{height}",
 267                                         deprecated)
 268                         * "filesize" (optional, int)
 269                         * "http_headers" (dict) - HTTP headers for the request
 270     thumbnail:      Full URL to a video thumbnail image.
 271     description:    Full video description.
 272     uploader:       Full name of the video uploader.
 273     license:        License name the video is licensed under.
 274     creator:        The creator of the video.
 275     timestamp:      UNIX timestamp of the moment the video was uploaded
 276     upload_date:    Video upload date in UTC (YYYYMMDD).
 277                     If not explicitly set, calculated from timestamp
 278     release_timestamp: UNIX timestamp of the moment the video was released.
 279                     If it is not clear whether to use timestamp or this, use the former
 280     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 281                     If not explicitly set, calculated from release_timestamp
 282     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 283     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 284                     If not explicitly set, calculated from modified_timestamp
 285     uploader_id:    Nickname or id of the video uploader.
 286     uploader_url:   Full URL to a personal webpage of the video uploader.
 287     channel:        Full name of the channel the video is uploaded on.
 288                     Note that channel fields may or may not repeat uploader
 289                     fields. This depends on a particular extractor.
 290     channel_id:     Id of the channel.
 291     channel_url:    Full URL to a channel webpage.
 292     channel_follower_count: Number of followers of the channel.
 293     channel_is_verified: Whether the channel is verified on the platform.
 294     location:       Physical location where the video was filmed.
 295     subtitles:      The available subtitles as a dictionary in the format
 296                     {tag: subformats}. "tag" is usually a language code, and
 297                     "subformats" is a list sorted from lower to higher
 298                     preference, each element is a dictionary with the "ext"
 299                     entry and one of:
 300                         * "data": The subtitles file contents
 301                         * "url": A URL pointing to the subtitles file
 302                     It can optionally also have:
 303                         * "name": Name or description of the subtitles
 304                         * "http_headers": A dictionary of additional HTTP headers
 305                                   to add to the request.
 306                     "ext" will be calculated from URL if missing
 307     automatic_captions: Like 'subtitles'; contains automatically generated
 308                     captions instead of normal subtitles
 309     duration:       Length of the video in seconds, as an integer or float.
 310     view_count:     How many users have watched the video on the platform.
 311     concurrent_view_count: How many users are currently watching the video on the platform.
 312     like_count:     Number of positive ratings of the video
 313     dislike_count:  Number of negative ratings of the video
 314     repost_count:   Number of reposts of the video
 315     average_rating: Average rating give by users, the scale used depends on the webpage
 316     comment_count:  Number of comments on the video
 317     comments:       A list of comments, each with one or more of the following
 318                     properties (all but one of text or html optional):
 319                         * "author" - human-readable name of the comment author
 320                         * "author_id" - user ID of the comment author
 321                         * "author_thumbnail" - The thumbnail of the comment author
 322                         * "author_url" - The url to the comment author's page
 323                         * "author_is_verified" - Whether the author is verified
 324                                                  on the platform
 325                         * "author_is_uploader" - Whether the comment is made by
 326                                                  the video uploader
 327                         * "id" - Comment ID
 328                         * "html" - Comment as HTML
 329                         * "text" - Plain text of the comment
 330                         * "timestamp" - UNIX timestamp of comment
 331                         * "parent" - ID of the comment this one is replying to.
 332                                      Set to "root" to indicate that this is a
 333                                      comment to the original video.
 334                         * "like_count" - Number of positive ratings of the comment
 335                         * "dislike_count" - Number of negative ratings of the comment
 336                         * "is_favorited" - Whether the comment is marked as
 337                                            favorite by the video uploader
 338                         * "is_pinned" - Whether the comment is pinned to
 339                                         the top of the comments
 340     age_limit:      Age restriction for the video, as an integer (years)
 341     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 342                     should allow to get the same result again. (It will be set
 343                     by YoutubeDL if it's missing)
 344     categories:     A list of categories that the video falls in, for example
 345                     ["Sports", "Berlin"]
 346     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 347     cast:           A list of the video cast
 348     is_live:        True, False, or None (=unknown). Whether this video is a
 349                     live stream that goes on instead of a fixed-length video.
 350     was_live:       True, False, or None (=unknown). Whether this video was
 351                     originally a live stream.
 352     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 353                     or 'post_live' (was live, but VOD is not yet processed)
 354                     If absent, automatically set from is_live, was_live
 355     start_time:     Time in seconds where the reproduction should start, as
 356                     specified in the URL.
 357     end_time:       Time in seconds where the reproduction should end, as
 358                     specified in the URL.
 359     chapters:       A list of dictionaries, with the following entries:
 360                         * "start_time" - The start time of the chapter in seconds
 361                         * "end_time" - The end time of the chapter in seconds
 362                         * "title" (optional, string)
 363     heatmap:        A list of dictionaries, with the following entries:
 364                         * "start_time" - The start time of the data point in seconds
 365                         * "end_time" - The end time of the data point in seconds
 366                         * "value" - The normalized value of the data point (float between 0 and 1)
 367     playable_in_embed: Whether this video is allowed to play in embedded
 368                     players on other sites. Can be True (=always allowed),
 369                     False (=never allowed), None (=unknown), or a string
 370                     specifying the criteria for embedability; e.g. 'whitelist'
 371     availability:   Under what condition the video is available. One of
 372                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 373                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 374                     to set it
 375     _old_archive_ids: A list of old archive ids needed for backward compatibility
 376     _format_sort_fields: A list of fields to use for sorting formats
 377     __post_extractor: A function to be called just before the metadata is
 378                     written to either disk, logger or console. The function
 379                     must return a dict which will be added to the info_dict.
 380                     This is usefull for additional information that is
 381                     time-consuming to extract. Note that the fields thus
 382                     extracted will not be available to output template and
 383                     match_filter. So, only "comments" and "comment_count" are
 384                     currently allowed to be extracted via this method.
 385
 386     The following fields should only be used when the video belongs to some logical
 387     chapter or section:
 388
 389     chapter:        Name or title of the chapter the video belongs to.
 390     chapter_number: Number of the chapter the video belongs to, as an integer.
 391     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 392
 393     The following fields should only be used when the video is an episode of some
 394     series, programme or podcast:
 395
 396     series:         Title of the series or programme the video episode belongs to.
 397     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 398     season:         Title of the season the video episode belongs to.
 399     season_number:  Number of the season the video episode belongs to, as an integer.
 400     season_id:      Id of the season the video episode belongs to, as a unicode string.
 401     episode:        Title of the video episode. Unlike mandatory video title field,
 402                     this field should denote the exact title of the video episode
 403                     without any kind of decoration.
 404     episode_number: Number of the video episode within a season, as an integer.
 405     episode_id:     Id of the video episode, as a unicode string.
 406
 407     The following fields should only be used when the media is a track or a part of
 408     a music album:
 409
 410     track:          Title of the track.
 411     track_number:   Number of the track within an album or a disc, as an integer.
 412     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 413                     as a unicode string.
 414     artist:         Artist(s) of the track.
 415     genre:          Genre(s) of the track.
 416     album:          Title of the album the track belongs to.
 417     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 418     album_artist:   List of all artists appeared on the album (e.g.
 419                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 420                     and compilations).
 421     disc_number:    Number of the disc or other physical medium the track belongs to,
 422                     as an integer.
 423     release_year:   Year (YYYY) when the album was released.
 424     composer:       Composer of the piece
 425
 426     The following fields should only be set for clips that should be cut from the original video:
 427
 428     section_start:  Start time of the section in seconds
 429     section_end:    End time of the section in seconds
 430
 431     The following fields should only be set for storyboards:
 432     rows:           Number of rows in each storyboard fragment, as an integer
 433     columns:        Number of columns in each storyboard fragment, as an integer
 434
 435     Unless mentioned otherwise, the fields should be Unicode strings.
 436
 437     Unless mentioned otherwise, None is equivalent to absence of information.
 438
 439
 440     _type "playlist" indicates multiple videos.
 441     There must be a key "entries", which is a list, an iterable, or a PagedList
 442     object, each element of which is a valid dictionary by this specification.
 443
 444     Additionally, playlists can have "id", "title", and any other relevant
 445     attributes with the same semantics as videos (see above).
 446
 447     It can also have the following optional fields:
 448
 449     playlist_count: The total number of videos in a playlist. If not given,
 450                     YoutubeDL tries to calculate it from "entries"
 451
 452
 453     _type "multi_video" indicates that there are multiple videos that
 454     form a single show, for examples multiple acts of an opera or TV episode.
 455     It must have an entries key like a playlist and contain all the keys
 456     required for a video at the same time.
 457
 458
 459     _type "url" indicates that the video must be extracted from another
 460     location, possibly by a different extractor. Its only required key is:
 461     "url" - the next URL to extract.
 462     The key "ie_key" can be set to the class name (minus the trailing "IE",
 463     e.g. "Youtube") if the extractor class is known in advance.
 464     Additionally, the dictionary may have any properties of the resolved entity
 465     known in advance, for example "title" if the title of the referred video is
 466     known ahead of time.
 467
 468
 469     _type "url_transparent" entities have the same specification as "url", but
 470     indicate that the given additional information is more precise than the one
 471     associated with the resolved URL.
 472     This is useful when a site employs a video service that hosts the video and
 473     its technical metadata, but that video service does not embed a useful
 474     title, description etc.
 475
 476
 477     Subclasses of this should also be added to the list of extractors and
 478     should define a _VALID_URL regexp and, re-define the _real_extract() and
 479     (optionally) _real_initialize() methods.
 480
 481     Subclasses may also override suitable() if necessary, but ensure the function
 482     signature is preserved and that this function imports everything it needs
 483     (except other extractors), so that lazy_extractors works correctly.
 484
 485     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 486     the HTML of Generic webpages. It may also override _extract_embed_urls
 487     or _extract_from_webpage as necessary. While these are normally classmethods,
 488     _extract_from_webpage is allowed to be an instance method.
 489
 490     _extract_from_webpage may raise self.StopExtraction() to stop further
 491     processing of the webpage and obtain exclusive rights to it. This is useful
 492     when the extractor cannot reliably be matched using just the URL,
 493     e.g. invidious/peertube instances
 494
 495     Embed-only extractors can be defined by setting _VALID_URL = False.
 496
 497     To support username + password (or netrc) login, the extractor must define a
 498     _NETRC_MACHINE and re-define _perform_login(username, password) and
 499     (optionally) _initialize_pre_login() methods. The _perform_login method will
 500     be called between _initialize_pre_login and _real_initialize if credentials
 501     are passed by the user. In cases where it is necessary to have the login
 502     process as part of the extraction rather than initialization, _perform_login
 503     can be left undefined.
 504
 505     _GEO_BYPASS attribute may be set to False in order to disable
 506     geo restriction bypass mechanisms for a particular extractor.
 507     Though it won't disable explicit geo restriction bypass based on
 508     country code provided with geo_bypass_country.
 509
 510     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 511     countries for this extractor. One of these countries will be used by
 512     geo restriction bypass mechanism right away in order to bypass
 513     geo restriction, of course, if the mechanism is not disabled.
 514
 515     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 516     IP blocks in CIDR notation for this extractor. One of these IP blocks
 517     will be used by geo restriction bypass mechanism similarly
 518     to _GEO_COUNTRIES.
 519
 520     The _ENABLED attribute should be set to False for IEs that
 521     are disabled by default and must be explicitly enabled.
 522
 523     The _WORKING attribute should be set to False for broken IEs
 524     in order to warn the users and skip the tests.
 525     """
 526
 527     _ready = False
 528     _downloader = None
 529     _x_forwarded_for_ip = None
 530     _GEO_BYPASS = True
 531     _GEO_COUNTRIES = None
 532     _GEO_IP_BLOCKS = None
 533     _WORKING = True
 534     _ENABLED = True
 535     _NETRC_MACHINE = None
 536     IE_DESC = None
 537     SEARCH_KEY = None
 538     _VALID_URL = None
 539     _EMBED_REGEX = []
 540
 541     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 542         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 543         return {
 544             None: '',
 545             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 546             'password': f'Use {password_hint}',
 547             'cookies': (
 548                 'Use --cookies-from-browser or --cookies for the authentication. '
 549                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 550         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 551
 552     def __init__(self, downloader=None):
 553         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 554         If a downloader is not passed during initialization,
 555         it must be set using "set_downloader()" before "extract()" is called"""
 556         self._ready = False
 557         self._x_forwarded_for_ip = None
 558         self._printed_messages = set()
 559         self.set_downloader(downloader)
 560
 561     @classmethod
 562     def _match_valid_url(cls, url):
 563         if cls._VALID_URL is False:
 564             return None
 565         # This does not use has/getattr intentionally - we want to know whether
 566         # we have cached the regexp for *this* class, whereas getattr would also
 567         # match the superclass
 568         if '_VALID_URL_RE' not in cls.__dict__:
 569             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 570         return cls._VALID_URL_RE.match(url)
 571
 572     @classmethod
 573     def suitable(cls, url):
 574         """Receives a URL and returns True if suitable for this IE."""
 575         # This function must import everything it needs (except other extractors),
 576         # so that lazy_extractors works correctly
 577         return cls._match_valid_url(url) is not None
 578
 579     @classmethod
 580     def _match_id(cls, url):
 581         return cls._match_valid_url(url).group('id')
 582
 583     @classmethod
 584     def get_temp_id(cls, url):
 585         try:
 586             return cls._match_id(url)
 587         except (IndexError, AttributeError):
 588             return None
 589
 590     @classmethod
 591     def working(cls):
 592         """Getter method for _WORKING."""
 593         return cls._WORKING
 594
 595     @classmethod
 596     def supports_login(cls):
 597         return bool(cls._NETRC_MACHINE)
 598
 599     def initialize(self):
 600         """Initializes an instance (authentication, etc)."""
 601         self._printed_messages = set()
 602         self._initialize_geo_bypass({
 603             'countries': self._GEO_COUNTRIES,
 604             'ip_blocks': self._GEO_IP_BLOCKS,
 605         })
 606         if not self._ready:
 607             self._initialize_pre_login()
 608             if self.supports_login():
 609                 username, password = self._get_login_info()
 610                 if username:
 611                     self._perform_login(username, password)
 612             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 613                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 614             self._real_initialize()
 615             self._ready = True
 616
 617     def _initialize_geo_bypass(self, geo_bypass_context):
 618         """
 619         Initialize geo restriction bypass mechanism.
 620
 621         This method is used to initialize geo bypass mechanism based on faking
 622         X-Forwarded-For HTTP header. A random country from provided country list
 623         is selected and a random IP belonging to this country is generated. This
 624         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 625         HTTP requests.
 626
 627         This method will be used for initial geo bypass mechanism initialization
 628         during the instance initialization with _GEO_COUNTRIES and
 629         _GEO_IP_BLOCKS.
 630
 631         You may also manually call it from extractor's code if geo bypass
 632         information is not available beforehand (e.g. obtained during
 633         extraction) or due to some other reason. In this case you should pass
 634         this information in geo bypass context passed as first argument. It may
 635         contain following fields:
 636
 637         countries:  List of geo unrestricted countries (similar
 638                     to _GEO_COUNTRIES)
 639         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 640                     (similar to _GEO_IP_BLOCKS)
 641
 642         """
 643         if not self._x_forwarded_for_ip:
 644
 645             # Geo bypass mechanism is explicitly disabled by user
 646             if not self.get_param('geo_bypass', True):
 647                 return
 648
 649             if not geo_bypass_context:
 650                 geo_bypass_context = {}
 651
 652             # Backward compatibility: previously _initialize_geo_bypass
 653             # expected a list of countries, some 3rd party code may still use
 654             # it this way
 655             if isinstance(geo_bypass_context, (list, tuple)):
 656                 geo_bypass_context = {
 657                     'countries': geo_bypass_context,
 658                 }
 659
 660             # The whole point of geo bypass mechanism is to fake IP
 661             # as X-Forwarded-For HTTP header based on some IP block or
 662             # country code.
 663
 664             # Path 1: bypassing based on IP block in CIDR notation
 665
 666             # Explicit IP block specified by user, use it right away
 667             # regardless of whether extractor is geo bypassable or not
 668             ip_block = self.get_param('geo_bypass_ip_block', None)
 669
 670             # Otherwise use random IP block from geo bypass context but only
 671             # if extractor is known as geo bypassable
 672             if not ip_block:
 673                 ip_blocks = geo_bypass_context.get('ip_blocks')
 674                 if self._GEO_BYPASS and ip_blocks:
 675                     ip_block = random.choice(ip_blocks)
 676
 677             if ip_block:
 678                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 679                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 680                 return
 681
 682             # Path 2: bypassing based on country code
 683
 684             # Explicit country code specified by user, use it right away
 685             # regardless of whether extractor is geo bypassable or not
 686             country = self.get_param('geo_bypass_country', None)
 687
 688             # Otherwise use random country code from geo bypass context but
 689             # only if extractor is known as geo bypassable
 690             if not country:
 691                 countries = geo_bypass_context.get('countries')
 692                 if self._GEO_BYPASS and countries:
 693                     country = random.choice(countries)
 694
 695             if country:
 696                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 697                 self._downloader.write_debug(
 698                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 699
 700     def extract(self, url):
 701         """Extracts URL information and returns it in list of dicts."""
 702         try:
 703             for _ in range(2):
 704                 try:
 705                     self.initialize()
 706                     self.to_screen('Extracting URL: %s' % (
 707                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 708                     ie_result = self._real_extract(url)
 709                     if ie_result is None:
 710                         return None
 711                     if self._x_forwarded_for_ip:
 712                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 713                     subtitles = ie_result.get('subtitles') or {}
 714                     if 'no-live-chat' in self.get_param('compat_opts'):
 715                         for lang in ('live_chat', 'comments', 'danmaku'):
 716                             subtitles.pop(lang, None)
 717                     return ie_result
 718                 except GeoRestrictedError as e:
 719                     if self.__maybe_fake_ip_and_retry(e.countries):
 720                         continue
 721                     raise
 722         except UnsupportedError:
 723             raise
 724         except ExtractorError as e:
 725             e.video_id = e.video_id or self.get_temp_id(url),
 726             e.ie = e.ie or self.IE_NAME,
 727             e.traceback = e.traceback or sys.exc_info()[2]
 728             raise
 729         except http.client.IncompleteRead as e:
 730             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 731         except (KeyError, StopIteration) as e:
 732             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 733
 734     def __maybe_fake_ip_and_retry(self, countries):
 735         if (not self.get_param('geo_bypass_country', None)
 736                 and self._GEO_BYPASS
 737                 and self.get_param('geo_bypass', True)
 738                 and not self._x_forwarded_for_ip
 739                 and countries):
 740             country_code = random.choice(countries)
 741             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 742             if self._x_forwarded_for_ip:
 743                 self.report_warning(
 744                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 745                     % (self._x_forwarded_for_ip, country_code.upper()))
 746                 return True
 747         return False
 748
 749     def set_downloader(self, downloader):
 750         """Sets a YoutubeDL instance as the downloader for this IE."""
 751         self._downloader = downloader
 752
 753     @property
 754     def cache(self):
 755         return self._downloader.cache
 756
 757     @property
 758     def cookiejar(self):
 759         return self._downloader.cookiejar
 760
 761     def _initialize_pre_login(self):
 762         """ Initialization before login. Redefine in subclasses."""
 763         pass
 764
 765     def _perform_login(self, username, password):
 766         """ Login with username and password. Redefine in subclasses."""
 767         pass
 768
 769     def _real_initialize(self):
 770         """Real initialization process. Redefine in subclasses."""
 771         pass
 772
 773     def _real_extract(self, url):
 774         """Real extraction process. Redefine in subclasses."""
 775         raise NotImplementedError('This method must be implemented by subclasses')
 776
 777     @classmethod
 778     def ie_key(cls):
 779         """A string for getting the InfoExtractor with get_info_extractor"""
 780         return cls.__name__[:-2]
 781
 782     @classproperty
 783     def IE_NAME(cls):
 784         return cls.__name__[:-2]
 785
 786     @staticmethod
 787     def __can_accept_status_code(err, expected_status):
 788         assert isinstance(err, urllib.error.HTTPError)
 789         if expected_status is None:
 790             return False
 791         elif callable(expected_status):
 792             return expected_status(err.code) is True
 793         else:
 794             return err.code in variadic(expected_status)
 795
 796     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 797         if isinstance(url_or_request, urllib.request.Request):
 798             return update_Request(url_or_request, data=data, headers=headers, query=query)
 799         if query:
 800             url_or_request = update_url_query(url_or_request, query)
 801         return sanitized_Request(url_or_request, data, headers or {})
 802
 803     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 804         """
 805         Return the response handle.
 806
 807         See _download_webpage docstring for arguments specification.
 808         """
 809         if not self._downloader._first_webpage_request:
 810             sleep_interval = self.get_param('sleep_interval_requests') or 0
 811             if sleep_interval > 0:
 812                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 813                 time.sleep(sleep_interval)
 814         else:
 815             self._downloader._first_webpage_request = False
 816
 817         if note is None:
 818             self.report_download_webpage(video_id)
 819         elif note is not False:
 820             if video_id is None:
 821                 self.to_screen(str(note))
 822             else:
 823                 self.to_screen(f'{video_id}: {note}')
 824
 825         # Some sites check X-Forwarded-For HTTP header in order to figure out
 826         # the origin of the client behind proxy. This allows bypassing geo
 827         # restriction by faking this header's value to IP that belongs to some
 828         # geo unrestricted country. We will do so once we encounter any
 829         # geo restriction error.
 830         if self._x_forwarded_for_ip:
 831             headers = (headers or {}).copy()
 832             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 833
 834         try:
 835             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 836         except network_exceptions as err:
 837             if isinstance(err, urllib.error.HTTPError):
 838                 if self.__can_accept_status_code(err, expected_status):
 839                     # Retain reference to error to prevent file object from
 840                     # being closed before it can be read. Works around the
 841                     # effects of <https://bugs.python.org/issue15002>
 842                     # introduced in Python 3.4.1.
 843                     err.fp._error = err
 844                     return err.fp
 845
 846             if errnote is False:
 847                 return False
 848             if errnote is None:
 849                 errnote = 'Unable to download webpage'
 850
 851             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 852             if fatal:
 853                 raise ExtractorError(errmsg, cause=err)
 854             else:
 855                 self.report_warning(errmsg)
 856                 return False
 857
 858     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 859                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 860         """
 861         Return a tuple (page content as string, URL handle).
 862
 863         Arguments:
 864         url_or_request -- plain text URL as a string or
 865             a urllib.request.Request object
 866         video_id -- Video/playlist/item identifier (string)
 867
 868         Keyword arguments:
 869         note -- note printed before downloading (string)
 870         errnote -- note printed in case of an error (string)
 871         fatal -- flag denoting whether error should be considered fatal,
 872             i.e. whether it should cause ExtractionError to be raised,
 873             otherwise a warning will be reported and extraction continued
 874         encoding -- encoding for a page content decoding, guessed automatically
 875             when not explicitly specified
 876         data -- POST data (bytes)
 877         headers -- HTTP headers (dict)
 878         query -- URL query (dict)
 879         expected_status -- allows to accept failed HTTP requests (non 2xx
 880             status code) by explicitly specifying a set of accepted status
 881             codes. Can be any of the following entities:
 882                 - an integer type specifying an exact failed status code to
 883                   accept
 884                 - a list or a tuple of integer types specifying a list of
 885                   failed status codes to accept
 886                 - a callable accepting an actual failed status code and
 887                   returning True if it should be accepted
 888             Note that this argument does not affect success status codes (2xx)
 889             which are always accepted.
 890         """
 891
 892         # Strip hashes from the URL (#1038)
 893         if isinstance(url_or_request, str):
 894             url_or_request = url_or_request.partition('#')[0]
 895
 896         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 897         if urlh is False:
 898             assert not fatal
 899             return False
 900         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 901         return (content, urlh)
 902
 903     @staticmethod
 904     def _guess_encoding_from_content(content_type, webpage_bytes):
 905         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 906         if m:
 907             encoding = m.group(1)
 908         else:
 909             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 910                           webpage_bytes[:1024])
 911             if m:
 912                 encoding = m.group(1).decode('ascii')
 913             elif webpage_bytes.startswith(b'\xff\xfe'):
 914                 encoding = 'utf-16'
 915             else:
 916                 encoding = 'utf-8'
 917
 918         return encoding
 919
 920     def __check_blocked(self, content):
 921         first_block = content[:512]
 922         if ('<title>Access to this site is blocked</title>' in content
 923                 and 'Websense' in first_block):
 924             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 925             blocked_iframe = self._html_search_regex(
 926                 r'<iframe src="([^"]+)"', content,
 927                 'Websense information URL', default=None)
 928             if blocked_iframe:
 929                 msg += ' Visit %s for more details' % blocked_iframe
 930             raise ExtractorError(msg, expected=True)
 931         if '<title>The URL you requested has been blocked</title>' in first_block:
 932             msg = (
 933                 'Access to this webpage has been blocked by Indian censorship. '
 934                 'Use a VPN or proxy server (with --proxy) to route around it.')
 935             block_msg = self._html_search_regex(
 936                 r'</h1><p>(.*?)</p>',
 937                 content, 'block message', default=None)
 938             if block_msg:
 939                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 940             raise ExtractorError(msg, expected=True)
 941         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 942                 and 'blocklist.rkn.gov.ru' in content):
 943             raise ExtractorError(
 944                 'Access to this webpage has been blocked by decision of the Russian government. '
 945                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 946                 expected=True)
 947
 948     def _request_dump_filename(self, url, video_id):
 949         basen = f'{video_id}_{url}'
 950         trim_length = self.get_param('trim_file_name') or 240
 951         if len(basen) > trim_length:
 952             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 953             basen = basen[:trim_length - len(h)] + h
 954         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 955         # Working around MAX_PATH limitation on Windows (see
 956         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 957         if compat_os_name == 'nt':
 958             absfilepath = os.path.abspath(filename)
 959             if len(absfilepath) > 259:
 960                 filename = fR'\\?\{absfilepath}'
 961         return filename
 962
 963     def __decode_webpage(self, webpage_bytes, encoding, headers):
 964         if not encoding:
 965             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 966         try:
 967             return webpage_bytes.decode(encoding, 'replace')
 968         except LookupError:
 969             return webpage_bytes.decode('utf-8', 'replace')
 970
 971     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 972         webpage_bytes = urlh.read()
 973         if prefix is not None:
 974             webpage_bytes = prefix + webpage_bytes
 975         if self.get_param('dump_intermediate_pages', False):
 976             self.to_screen('Dumping request to ' + urlh.geturl())
 977             dump = base64.b64encode(webpage_bytes).decode('ascii')
 978             self._downloader.to_screen(dump)
 979         if self.get_param('write_pages'):
 980             filename = self._request_dump_filename(urlh.geturl(), video_id)
 981             self.to_screen(f'Saving request to {filename}')
 982             with open(filename, 'wb') as outf:
 983                 outf.write(webpage_bytes)
 984
 985         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 986         self.__check_blocked(content)
 987
 988         return content
 989
 990     def __print_error(self, errnote, fatal, video_id, err):
 991         if fatal:
 992             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 993         elif errnote:
 994             self.report_warning(f'{video_id}: {errnote}: {err}')
 995
 996     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 997         if transform_source:
 998             xml_string = transform_source(xml_string)
 999         try:
1000             return compat_etree_fromstring(xml_string.encode('utf-8'))
1001         except xml.etree.ElementTree.ParseError as ve:
1002             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1003
1004     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1005         try:
1006             return json.loads(
1007                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1008         except ValueError as ve:
1009             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1010
1011     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1012         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1013
1014     def __create_download_methods(name, parser, note, errnote, return_value):
1015
1016         def parse(ie, content, *args, errnote=errnote, **kwargs):
1017             if parser is None:
1018                 return content
1019             if errnote is False:
1020                 kwargs['errnote'] = errnote
1021             # parser is fetched by name so subclasses can override it
1022             return getattr(ie, parser)(content, *args, **kwargs)
1023
1024         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1025                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1026             res = self._download_webpage_handle(
1027                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1028                 data=data, headers=headers, query=query, expected_status=expected_status)
1029             if res is False:
1030                 return res
1031             content, urlh = res
1032             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1033
1034         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1035                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1036             if self.get_param('load_pages'):
1037                 url_or_request = self._create_request(url_or_request, data, headers, query)
1038                 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1039                 self.to_screen(f'Loading request from {filename}')
1040                 try:
1041                     with open(filename, 'rb') as dumpf:
1042                         webpage_bytes = dumpf.read()
1043                 except OSError as e:
1044                     self.report_warning(f'Unable to load request from disk: {e}')
1045                 else:
1046                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1047                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1048             kwargs = {
1049                 'note': note,
1050                 'errnote': errnote,
1051                 'transform_source': transform_source,
1052                 'fatal': fatal,
1053                 'encoding': encoding,
1054                 'data': data,
1055                 'headers': headers,
1056                 'query': query,
1057                 'expected_status': expected_status,
1058             }
1059             if parser is None:
1060                 kwargs.pop('transform_source')
1061             # The method is fetched by name so subclasses can override _download_..._handle
1062             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1063             return res if res is False else res[0]
1064
1065         def impersonate(func, name, return_value):
1066             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1067             func.__doc__ = f'''
1068                 @param transform_source     Apply this transformation before parsing
1069                 @returns                    {return_value}
1070
1071                 See _download_webpage_handle docstring for other arguments specification
1072             '''
1073
1074         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1075         impersonate(download_content, f'_download_{name}', f'{return_value}')
1076         return download_handle, download_content
1077
1078     _download_xml_handle, _download_xml = __create_download_methods(
1079         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1080     _download_json_handle, _download_json = __create_download_methods(
1081         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1082     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1083         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1084     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1085
1086     def _download_webpage(
1087             self, url_or_request, video_id, note=None, errnote=None,
1088             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1089         """
1090         Return the data of the page as a string.
1091
1092         Keyword arguments:
1093         tries -- number of tries
1094         timeout -- sleep interval between tries
1095
1096         See _download_webpage_handle docstring for other arguments specification.
1097         """
1098
1099         R''' # NB: These are unused; should they be deprecated?
1100         if tries != 1:
1101             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1102         if timeout is NO_DEFAULT:
1103             timeout = 5
1104         else:
1105             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1106         '''
1107
1108         try_count = 0
1109         while True:
1110             try:
1111                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1112             except http.client.IncompleteRead as e:
1113                 try_count += 1
1114                 if try_count >= tries:
1115                     raise e
1116                 self._sleep(timeout, video_id)
1117
1118     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1119         idstr = format_field(video_id, None, '%s: ')
1120         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1121         if only_once:
1122             if f'WARNING: {msg}' in self._printed_messages:
1123                 return
1124             self._printed_messages.add(f'WARNING: {msg}')
1125         self._downloader.report_warning(msg, *args, **kwargs)
1126
1127     def to_screen(self, msg, *args, **kwargs):
1128         """Print msg to screen, prefixing it with '[ie_name]'"""
1129         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1130
1131     def write_debug(self, msg, *args, **kwargs):
1132         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1133
1134     def get_param(self, name, default=None, *args, **kwargs):
1135         if self._downloader:
1136             return self._downloader.params.get(name, default, *args, **kwargs)
1137         return default
1138
1139     def report_drm(self, video_id, partial=NO_DEFAULT):
1140         if partial is not NO_DEFAULT:
1141             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1142         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1143
1144     def report_extraction(self, id_or_name):
1145         """Report information extraction."""
1146         self.to_screen('%s: Extracting information' % id_or_name)
1147
1148     def report_download_webpage(self, video_id):
1149         """Report webpage download."""
1150         self.to_screen('%s: Downloading webpage' % video_id)
1151
1152     def report_age_confirmation(self):
1153         """Report attempt to confirm age."""
1154         self.to_screen('Confirming age')
1155
1156     def report_login(self):
1157         """Report attempt to log in."""
1158         self.to_screen('Logging in')
1159
1160     def raise_login_required(
1161             self, msg='This video is only available for registered users',
1162             metadata_available=False, method=NO_DEFAULT):
1163         if metadata_available and (
1164                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1165             self.report_warning(msg)
1166             return
1167         msg += format_field(self._login_hint(method), None, '. %s')
1168         raise ExtractorError(msg, expected=True)
1169
1170     def raise_geo_restricted(
1171             self, msg='This video is not available from your location due to geo restriction',
1172             countries=None, metadata_available=False):
1173         if metadata_available and (
1174                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1175             self.report_warning(msg)
1176         else:
1177             raise GeoRestrictedError(msg, countries=countries)
1178
1179     def raise_no_formats(self, msg, expected=False, video_id=None):
1180         if expected and (
1181                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1182             self.report_warning(msg, video_id)
1183         elif isinstance(msg, ExtractorError):
1184             raise msg
1185         else:
1186             raise ExtractorError(msg, expected=expected, video_id=video_id)
1187
1188     # Methods for following #608
1189     @staticmethod
1190     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1191         """Returns a URL that points to a page that should be processed"""
1192         if ie is not None:
1193             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1194         if video_id is not None:
1195             kwargs['id'] = video_id
1196         if video_title is not None:
1197             kwargs['title'] = video_title
1198         return {
1199             **kwargs,
1200             '_type': 'url_transparent' if url_transparent else 'url',
1201             'url': url,
1202         }
1203
1204     @classmethod
1205     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1206                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1207         return cls.playlist_result(
1208             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1209             playlist_id, playlist_title, **kwargs)
1210
1211     @staticmethod
1212     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1213         """Returns a playlist"""
1214         if playlist_id:
1215             kwargs['id'] = playlist_id
1216         if playlist_title:
1217             kwargs['title'] = playlist_title
1218         if playlist_description is not None:
1219             kwargs['description'] = playlist_description
1220         return {
1221             **kwargs,
1222             '_type': 'multi_video' if multi_video else 'playlist',
1223             'entries': entries,
1224         }
1225
1226     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1227         """
1228         Perform a regex search on the given string, using a single or a list of
1229         patterns returning the first matching group.
1230         In case of failure return a default value or raise a WARNING or a
1231         RegexNotFoundError, depending on fatal, specifying the field name.
1232         """
1233         if string is None:
1234             mobj = None
1235         elif isinstance(pattern, (str, re.Pattern)):
1236             mobj = re.search(pattern, string, flags)
1237         else:
1238             for p in pattern:
1239                 mobj = re.search(p, string, flags)
1240                 if mobj:
1241                     break
1242
1243         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1244
1245         if mobj:
1246             if group is None:
1247                 # return the first matching group
1248                 return next(g for g in mobj.groups() if g is not None)
1249             elif isinstance(group, (list, tuple)):
1250                 return tuple(mobj.group(g) for g in group)
1251             else:
1252                 return mobj.group(group)
1253         elif default is not NO_DEFAULT:
1254             return default
1255         elif fatal:
1256             raise RegexNotFoundError('Unable to extract %s' % _name)
1257         else:
1258             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1259             return None
1260
1261     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1262                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1263         """Searches string for the JSON object specified by start_pattern"""
1264         # NB: end_pattern is only used to reduce the size of the initial match
1265         if default is NO_DEFAULT:
1266             default, has_default = {}, False
1267         else:
1268             fatal, has_default = False, True
1269
1270         json_string = self._search_regex(
1271             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1272             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1273         if not json_string:
1274             return default
1275
1276         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1277         try:
1278             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1279         except ExtractorError as e:
1280             if fatal:
1281                 raise ExtractorError(
1282                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1283             elif not has_default:
1284                 self.report_warning(
1285                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1286         return default
1287
1288     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1289         """
1290         Like _search_regex, but strips HTML tags and unescapes entities.
1291         """
1292         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1293         if isinstance(res, tuple):
1294             return tuple(map(clean_html, res))
1295         return clean_html(res)
1296
1297     def _get_netrc_login_info(self, netrc_machine=None):
1298         netrc_machine = netrc_machine or self._NETRC_MACHINE
1299
1300         cmd = self.get_param('netrc_cmd', '').format(netrc_machine)
1301         if cmd:
1302             self.to_screen(f'Executing command: {cmd}')
1303             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1304             if ret != 0:
1305                 raise OSError(f'Command returned error code {ret}')
1306             info = netrc_from_content(stdout).authenticators(netrc_machine)
1307
1308         elif self.get_param('usenetrc', False):
1309             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1310             if os.path.isdir(netrc_file):
1311                 netrc_file = os.path.join(netrc_file, '.netrc')
1312             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1313
1314         else:
1315             return None, None
1316         if not info:
1317             raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
1318         return info[0], info[2]
1319
1320     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1321         """
1322         Get the login info as (username, password)
1323         First look for the manually specified credentials using username_option
1324         and password_option as keys in params dictionary. If no such credentials
1325         are available try the netrc_cmd if it is defined or look in the
1326         netrc file using the netrc_machine or _NETRC_MACHINE value.
1327         If there's no info available, return (None, None)
1328         """
1329
1330         username = self.get_param(username_option)
1331         if username is not None:
1332             password = self.get_param(password_option)
1333         else:
1334             try:
1335                 username, password = self._get_netrc_login_info(netrc_machine)
1336             except (OSError, netrc.NetrcParseError) as err:
1337                 self.report_warning(f'Failed to parse .netrc: {err}')
1338                 return None, None
1339         return username, password
1340
1341     def _get_tfa_info(self, note='two-factor verification code'):
1342         """
1343         Get the two-factor authentication info
1344         TODO - asking the user will be required for sms/phone verify
1345         currently just uses the command line option
1346         If there's no info available, return None
1347         """
1348
1349         tfa = self.get_param('twofactor')
1350         if tfa is not None:
1351             return tfa
1352
1353         return getpass.getpass('Type %s and press [Return]: ' % note)
1354
1355     # Helper functions for extracting OpenGraph info
1356     @staticmethod
1357     def _og_regexes(prop):
1358         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1359         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1360                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1361         template = r'<meta[^>]+?%s[^>]+?%s'
1362         return [
1363             template % (property_re, content_re),
1364             template % (content_re, property_re),
1365         ]
1366
1367     @staticmethod
1368     def _meta_regex(prop):
1369         return r'''(?isx)<meta
1370                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1371                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1372
1373     def _og_search_property(self, prop, html, name=None, **kargs):
1374         prop = variadic(prop)
1375         if name is None:
1376             name = 'OpenGraph %s' % prop[0]
1377         og_regexes = []
1378         for p in prop:
1379             og_regexes.extend(self._og_regexes(p))
1380         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1381         if escaped is None:
1382             return None
1383         return unescapeHTML(escaped)
1384
1385     def _og_search_thumbnail(self, html, **kargs):
1386         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1387
1388     def _og_search_description(self, html, **kargs):
1389         return self._og_search_property('description', html, fatal=False, **kargs)
1390
1391     def _og_search_title(self, html, *, fatal=False, **kargs):
1392         return self._og_search_property('title', html, fatal=fatal, **kargs)
1393
1394     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1395         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1396         if secure:
1397             regexes = self._og_regexes('video:secure_url') + regexes
1398         return self._html_search_regex(regexes, html, name, **kargs)
1399
1400     def _og_search_url(self, html, **kargs):
1401         return self._og_search_property('url', html, **kargs)
1402
1403     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1404         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1405
1406     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1407         name = variadic(name)
1408         if display_name is None:
1409             display_name = name[0]
1410         return self._html_search_regex(
1411             [self._meta_regex(n) for n in name],
1412             html, display_name, fatal=fatal, group='content', **kwargs)
1413
1414     def _dc_search_uploader(self, html):
1415         return self._html_search_meta('dc.creator', html, 'uploader')
1416
1417     @staticmethod
1418     def _rta_search(html):
1419         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1420         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1421                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1422                      html):
1423             return 18
1424
1425         # And then there are the jokers who advertise that they use RTA, but actually don't.
1426         AGE_LIMIT_MARKERS = [
1427             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1428             r'>[^<]*you acknowledge you are at least (\d+) years old',
1429             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1430         ]
1431
1432         age_limit = 0
1433         for marker in AGE_LIMIT_MARKERS:
1434             mobj = re.search(marker, html)
1435             if mobj:
1436                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1437         return age_limit
1438
1439     def _media_rating_search(self, html):
1440         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1441         rating = self._html_search_meta('rating', html)
1442
1443         if not rating:
1444             return None
1445
1446         RATING_TABLE = {
1447             'safe for kids': 0,
1448             'general': 8,
1449             '14 years': 14,
1450             'mature': 17,
1451             'restricted': 19,
1452         }
1453         return RATING_TABLE.get(rating.lower())
1454
1455     def _family_friendly_search(self, html):
1456         # See http://schema.org/VideoObject
1457         family_friendly = self._html_search_meta(
1458             'isFamilyFriendly', html, default=None)
1459
1460         if not family_friendly:
1461             return None
1462
1463         RATING_TABLE = {
1464             '1': 0,
1465             'true': 0,
1466             '0': 18,
1467             'false': 18,
1468         }
1469         return RATING_TABLE.get(family_friendly.lower())
1470
1471     def _twitter_search_player(self, html):
1472         return self._html_search_meta('twitter:player', html,
1473                                       'twitter card player')
1474
1475     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1476         """Yield all json ld objects in the html"""
1477         if default is not NO_DEFAULT:
1478             fatal = False
1479         for mobj in re.finditer(JSON_LD_RE, html):
1480             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1481             for json_ld in variadic(json_ld_item):
1482                 if isinstance(json_ld, dict):
1483                     yield json_ld
1484
1485     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1486         """Search for a video in any json ld in the html"""
1487         if default is not NO_DEFAULT:
1488             fatal = False
1489         info = self._json_ld(
1490             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1491             video_id, fatal=fatal, expected_type=expected_type)
1492         if info:
1493             return info
1494         if default is not NO_DEFAULT:
1495             return default
1496         elif fatal:
1497             raise RegexNotFoundError('Unable to extract JSON-LD')
1498         else:
1499             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1500             return {}
1501
1502     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1503         if isinstance(json_ld, str):
1504             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1505         if not json_ld:
1506             return {}
1507         info = {}
1508
1509         INTERACTION_TYPE_MAP = {
1510             'CommentAction': 'comment',
1511             'AgreeAction': 'like',
1512             'DisagreeAction': 'dislike',
1513             'LikeAction': 'like',
1514             'DislikeAction': 'dislike',
1515             'ListenAction': 'view',
1516             'WatchAction': 'view',
1517             'ViewAction': 'view',
1518         }
1519
1520         def is_type(e, *expected_types):
1521             type = variadic(traverse_obj(e, '@type'))
1522             return any(x in type for x in expected_types)
1523
1524         def extract_interaction_type(e):
1525             interaction_type = e.get('interactionType')
1526             if isinstance(interaction_type, dict):
1527                 interaction_type = interaction_type.get('@type')
1528             return str_or_none(interaction_type)
1529
1530         def extract_interaction_statistic(e):
1531             interaction_statistic = e.get('interactionStatistic')
1532             if isinstance(interaction_statistic, dict):
1533                 interaction_statistic = [interaction_statistic]
1534             if not isinstance(interaction_statistic, list):
1535                 return
1536             for is_e in interaction_statistic:
1537                 if not is_type(is_e, 'InteractionCounter'):
1538                     continue
1539                 interaction_type = extract_interaction_type(is_e)
1540                 if not interaction_type:
1541                     continue
1542                 # For interaction count some sites provide string instead of
1543                 # an integer (as per spec) with non digit characters (e.g. ",")
1544                 # so extracting count with more relaxed str_to_int
1545                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1546                 if interaction_count is None:
1547                     continue
1548                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1549                 if not count_kind:
1550                     continue
1551                 count_key = '%s_count' % count_kind
1552                 if info.get(count_key) is not None:
1553                     continue
1554                 info[count_key] = interaction_count
1555
1556         def extract_chapter_information(e):
1557             chapters = [{
1558                 'title': part.get('name'),
1559                 'start_time': part.get('startOffset'),
1560                 'end_time': part.get('endOffset'),
1561             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1562             for idx, (last_c, current_c, next_c) in enumerate(zip(
1563                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1564                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1565                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1566                 if None in current_c.values():
1567                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1568                     return
1569             if chapters:
1570                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1571                 info['chapters'] = chapters
1572
1573         def extract_video_object(e):
1574             author = e.get('author')
1575             info.update({
1576                 'url': url_or_none(e.get('contentUrl')),
1577                 'ext': mimetype2ext(e.get('encodingFormat')),
1578                 'title': unescapeHTML(e.get('name')),
1579                 'description': unescapeHTML(e.get('description')),
1580                 'thumbnails': [{'url': unescapeHTML(url)}
1581                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1582                                if url_or_none(url)],
1583                 'duration': parse_duration(e.get('duration')),
1584                 'timestamp': unified_timestamp(e.get('uploadDate')),
1585                 # author can be an instance of 'Organization' or 'Person' types.
1586                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1587                 # however some websites are using 'Text' type instead.
1588                 # 1. https://schema.org/VideoObject
1589                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1590                 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1591                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1592                 'tbr': int_or_none(e.get('bitrate')),
1593                 'width': int_or_none(e.get('width')),
1594                 'height': int_or_none(e.get('height')),
1595                 'view_count': int_or_none(e.get('interactionCount')),
1596                 'tags': try_call(lambda: e.get('keywords').split(',')),
1597             })
1598             if is_type(e, 'AudioObject'):
1599                 info.update({
1600                     'vcodec': 'none',
1601                     'abr': int_or_none(e.get('bitrate')),
1602                 })
1603             extract_interaction_statistic(e)
1604             extract_chapter_information(e)
1605
1606         def traverse_json_ld(json_ld, at_top_level=True):
1607             for e in variadic(json_ld):
1608                 if not isinstance(e, dict):
1609                     continue
1610                 if at_top_level and '@context' not in e:
1611                     continue
1612                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1613                     traverse_json_ld(e['@graph'], at_top_level=False)
1614                     continue
1615                 if expected_type is not None and not is_type(e, expected_type):
1616                     continue
1617                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1618                 if rating is not None:
1619                     info['average_rating'] = rating
1620                 if is_type(e, 'TVEpisode', 'Episode'):
1621                     episode_name = unescapeHTML(e.get('name'))
1622                     info.update({
1623                         'episode': episode_name,
1624                         'episode_number': int_or_none(e.get('episodeNumber')),
1625                         'description': unescapeHTML(e.get('description')),
1626                     })
1627                     if not info.get('title') and episode_name:
1628                         info['title'] = episode_name
1629                     part_of_season = e.get('partOfSeason')
1630                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1631                         info.update({
1632                             'season': unescapeHTML(part_of_season.get('name')),
1633                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1634                         })
1635                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1636                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1637                         info['series'] = unescapeHTML(part_of_series.get('name'))
1638                 elif is_type(e, 'Movie'):
1639                     info.update({
1640                         'title': unescapeHTML(e.get('name')),
1641                         'description': unescapeHTML(e.get('description')),
1642                         'duration': parse_duration(e.get('duration')),
1643                         'timestamp': unified_timestamp(e.get('dateCreated')),
1644                     })
1645                 elif is_type(e, 'Article', 'NewsArticle'):
1646                     info.update({
1647                         'timestamp': parse_iso8601(e.get('datePublished')),
1648                         'title': unescapeHTML(e.get('headline')),
1649                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1650                     })
1651                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1652                         extract_video_object(e['video'][0])
1653                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1654                         extract_video_object(e['subjectOf'][0])
1655                 elif is_type(e, 'VideoObject', 'AudioObject'):
1656                     extract_video_object(e)
1657                     if expected_type is None:
1658                         continue
1659                     else:
1660                         break
1661                 video = e.get('video')
1662                 if is_type(video, 'VideoObject'):
1663                     extract_video_object(video)
1664                 if expected_type is None:
1665                     continue
1666                 else:
1667                     break
1668
1669         traverse_json_ld(json_ld)
1670         return filter_dict(info)
1671
1672     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1673         return self._parse_json(
1674             self._search_regex(
1675                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1676                 webpage, 'next.js data', fatal=fatal, **kw),
1677             video_id, transform_source=transform_source, fatal=fatal)
1678
1679     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1680         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1681         rectx = re.escape(context_name)
1682         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1683         js, arg_keys, arg_vals = self._search_regex(
1684             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1685             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1686             default=NO_DEFAULT if fatal else (None, None, None))
1687         if js is None:
1688             return {}
1689
1690         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1691             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1692
1693         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1694         return traverse_obj(ret, traverse) or {}
1695
1696     @staticmethod
1697     def _hidden_inputs(html):
1698         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1699         hidden_inputs = {}
1700         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1701             attrs = extract_attributes(input)
1702             if not input:
1703                 continue
1704             if attrs.get('type') not in ('hidden', 'submit'):
1705                 continue
1706             name = attrs.get('name') or attrs.get('id')
1707             value = attrs.get('value')
1708             if name and value is not None:
1709                 hidden_inputs[name] = value
1710         return hidden_inputs
1711
1712     def _form_hidden_inputs(self, form_id, html):
1713         form = self._search_regex(
1714             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1715             html, '%s form' % form_id, group='form')
1716         return self._hidden_inputs(form)
1717
1718     @classproperty(cache=True)
1719     def FormatSort(cls):
1720         class FormatSort(FormatSorter):
1721             def __init__(ie, *args, **kwargs):
1722                 super().__init__(ie._downloader, *args, **kwargs)
1723
1724         deprecation_warning(
1725             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1726             'Use yt_dlp.utils.FormatSorter instead')
1727         return FormatSort
1728
1729     def _sort_formats(self, formats, field_preference=[]):
1730         if not field_preference:
1731             self._downloader.deprecation_warning(
1732                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1733             return
1734         self._downloader.deprecation_warning(
1735             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1736             'Return _format_sort_fields in the info_dict instead')
1737         if formats:
1738             formats[0]['__sort_fields'] = field_preference
1739
1740     def _check_formats(self, formats, video_id):
1741         if formats:
1742             formats[:] = filter(
1743                 lambda f: self._is_valid_url(
1744                     f['url'], video_id,
1745                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1746                 formats)
1747
1748     @staticmethod
1749     def _remove_duplicate_formats(formats):
1750         format_urls = set()
1751         unique_formats = []
1752         for f in formats:
1753             if f['url'] not in format_urls:
1754                 format_urls.add(f['url'])
1755                 unique_formats.append(f)
1756         formats[:] = unique_formats
1757
1758     def _is_valid_url(self, url, video_id, item='video', headers={}):
1759         url = self._proto_relative_url(url, scheme='http:')
1760         # For now assume non HTTP(S) URLs always valid
1761         if not (url.startswith('http://') or url.startswith('https://')):
1762             return True
1763         try:
1764             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1765             return True
1766         except ExtractorError as e:
1767             self.to_screen(
1768                 '%s: %s URL is invalid, skipping: %s'
1769                 % (video_id, item, error_to_compat_str(e.cause)))
1770             return False
1771
1772     def http_scheme(self):
1773         """ Either "http:" or "https:", depending on the user's preferences """
1774         return (
1775             'http:'
1776             if self.get_param('prefer_insecure', False)
1777             else 'https:')
1778
1779     def _proto_relative_url(self, url, scheme=None):
1780         scheme = scheme or self.http_scheme()
1781         assert scheme.endswith(':')
1782         return sanitize_url(url, scheme=scheme[:-1])
1783
1784     def _sleep(self, timeout, video_id, msg_template=None):
1785         if msg_template is None:
1786             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1787         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1788         self.to_screen(msg)
1789         time.sleep(timeout)
1790
1791     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1792                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1793                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1794         if self.get_param('ignore_no_formats_error'):
1795             fatal = False
1796
1797         res = self._download_xml_handle(
1798             manifest_url, video_id, 'Downloading f4m manifest',
1799             'Unable to download f4m manifest',
1800             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1801             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1802             transform_source=transform_source,
1803             fatal=fatal, data=data, headers=headers, query=query)
1804         if res is False:
1805             return []
1806
1807         manifest, urlh = res
1808         manifest_url = urlh.geturl()
1809
1810         return self._parse_f4m_formats(
1811             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1812             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1813
1814     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1815                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1816                            fatal=True, m3u8_id=None):
1817         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1818             return []
1819
1820         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1821         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1822         if akamai_pv is not None and ';' in akamai_pv.text:
1823             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1824             if playerVerificationChallenge.strip() != '':
1825                 return []
1826
1827         formats = []
1828         manifest_version = '1.0'
1829         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1830         if not media_nodes:
1831             manifest_version = '2.0'
1832             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1833         # Remove unsupported DRM protected media from final formats
1834         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1835         media_nodes = remove_encrypted_media(media_nodes)
1836         if not media_nodes:
1837             return formats
1838
1839         manifest_base_url = get_base_url(manifest)
1840
1841         bootstrap_info = xpath_element(
1842             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1843             'bootstrap info', default=None)
1844
1845         vcodec = None
1846         mime_type = xpath_text(
1847             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1848             'base URL', default=None)
1849         if mime_type and mime_type.startswith('audio/'):
1850             vcodec = 'none'
1851
1852         for i, media_el in enumerate(media_nodes):
1853             tbr = int_or_none(media_el.attrib.get('bitrate'))
1854             width = int_or_none(media_el.attrib.get('width'))
1855             height = int_or_none(media_el.attrib.get('height'))
1856             format_id = join_nonempty(f4m_id, tbr or i)
1857             # If <bootstrapInfo> is present, the specified f4m is a
1858             # stream-level manifest, and only set-level manifests may refer to
1859             # external resources.  See section 11.4 and section 4 of F4M spec
1860             if bootstrap_info is None:
1861                 media_url = None
1862                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1863                 if manifest_version == '2.0':
1864                     media_url = media_el.attrib.get('href')
1865                 if media_url is None:
1866                     media_url = media_el.attrib.get('url')
1867                 if not media_url:
1868                     continue
1869                 manifest_url = (
1870                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1871                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1872                 # If media_url is itself a f4m manifest do the recursive extraction
1873                 # since bitrates in parent manifest (this one) and media_url manifest
1874                 # may differ leading to inability to resolve the format by requested
1875                 # bitrate in f4m downloader
1876                 ext = determine_ext(manifest_url)
1877                 if ext == 'f4m':
1878                     f4m_formats = self._extract_f4m_formats(
1879                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1880                         transform_source=transform_source, fatal=fatal)
1881                     # Sometimes stream-level manifest contains single media entry that
1882                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1883                     # At the same time parent's media entry in set-level manifest may
1884                     # contain it. We will copy it from parent in such cases.
1885                     if len(f4m_formats) == 1:
1886                         f = f4m_formats[0]
1887                         f.update({
1888                             'tbr': f.get('tbr') or tbr,
1889                             'width': f.get('width') or width,
1890                             'height': f.get('height') or height,
1891                             'format_id': f.get('format_id') if not tbr else format_id,
1892                             'vcodec': vcodec,
1893                         })
1894                     formats.extend(f4m_formats)
1895                     continue
1896                 elif ext == 'm3u8':
1897                     formats.extend(self._extract_m3u8_formats(
1898                         manifest_url, video_id, 'mp4', preference=preference,
1899                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1900                     continue
1901             formats.append({
1902                 'format_id': format_id,
1903                 'url': manifest_url,
1904                 'manifest_url': manifest_url,
1905                 'ext': 'flv' if bootstrap_info is not None else None,
1906                 'protocol': 'f4m',
1907                 'tbr': tbr,
1908                 'width': width,
1909                 'height': height,
1910                 'vcodec': vcodec,
1911                 'preference': preference,
1912                 'quality': quality,
1913             })
1914         return formats
1915
1916     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1917         return {
1918             'format_id': join_nonempty(m3u8_id, 'meta'),
1919             'url': m3u8_url,
1920             'ext': ext,
1921             'protocol': 'm3u8',
1922             'preference': preference - 100 if preference else -100,
1923             'quality': quality,
1924             'resolution': 'multiple',
1925             'format_note': 'Quality selection URL',
1926         }
1927
1928     def _report_ignoring_subs(self, name):
1929         self.report_warning(bug_reports_message(
1930             f'Ignoring subtitle tracks found in the {name} manifest; '
1931             'if any subtitle tracks are missing,'
1932         ), only_once=True)
1933
1934     def _extract_m3u8_formats(self, *args, **kwargs):
1935         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1936         if subs:
1937             self._report_ignoring_subs('HLS')
1938         return fmts
1939
1940     def _extract_m3u8_formats_and_subtitles(
1941             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1942             preference=None, quality=None, m3u8_id=None, note=None,
1943             errnote=None, fatal=True, live=False, data=None, headers={},
1944             query={}):
1945
1946         if self.get_param('ignore_no_formats_error'):
1947             fatal = False
1948
1949         if not m3u8_url:
1950             if errnote is not False:
1951                 errnote = errnote or 'Failed to obtain m3u8 URL'
1952                 if fatal:
1953                     raise ExtractorError(errnote, video_id=video_id)
1954                 self.report_warning(f'{errnote}{bug_reports_message()}')
1955             return [], {}
1956
1957         res = self._download_webpage_handle(
1958             m3u8_url, video_id,
1959             note='Downloading m3u8 information' if note is None else note,
1960             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1961             fatal=fatal, data=data, headers=headers, query=query)
1962
1963         if res is False:
1964             return [], {}
1965
1966         m3u8_doc, urlh = res
1967         m3u8_url = urlh.geturl()
1968
1969         return self._parse_m3u8_formats_and_subtitles(
1970             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1971             preference=preference, quality=quality, m3u8_id=m3u8_id,
1972             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1973             headers=headers, query=query, video_id=video_id)
1974
1975     def _parse_m3u8_formats_and_subtitles(
1976             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1977             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1978             errnote=None, fatal=True, data=None, headers={}, query={},
1979             video_id=None):
1980         formats, subtitles = [], {}
1981
1982         has_drm = re.search('|'.join([
1983             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
1984             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
1985         ]), m3u8_doc)
1986
1987         def format_url(url):
1988             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1989
1990         if self.get_param('hls_split_discontinuity', False):
1991             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1992                 if not m3u8_doc:
1993                     if not manifest_url:
1994                         return []
1995                     m3u8_doc = self._download_webpage(
1996                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1997                         note=False, errnote='Failed to download m3u8 playlist information')
1998                     if m3u8_doc is False:
1999                         return []
2000                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2001
2002         else:
2003             def _extract_m3u8_playlist_indices(*args, **kwargs):
2004                 return [None]
2005
2006         # References:
2007         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2008         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2009         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2010
2011         # We should try extracting formats only from master playlists [1, 4.3.4],
2012         # i.e. playlists that describe available qualities. On the other hand
2013         # media playlists [1, 4.3.3] should be returned as is since they contain
2014         # just the media without qualities renditions.
2015         # Fortunately, master playlist can be easily distinguished from media
2016         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2017         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2018         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2019         # media playlist and MUST NOT appear in master playlist thus we can
2020         # clearly detect media playlist with this criterion.
2021
2022         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2023             formats = [{
2024                 'format_id': join_nonempty(m3u8_id, idx),
2025                 'format_index': idx,
2026                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2027                 'ext': ext,
2028                 'protocol': entry_protocol,
2029                 'preference': preference,
2030                 'quality': quality,
2031                 'has_drm': has_drm,
2032             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2033
2034             return formats, subtitles
2035
2036         groups = {}
2037         last_stream_inf = {}
2038
2039         def extract_media(x_media_line):
2040             media = parse_m3u8_attributes(x_media_line)
2041             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2042             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2043             if not (media_type and group_id and name):
2044                 return
2045             groups.setdefault(group_id, []).append(media)
2046             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2047             if media_type == 'SUBTITLES':
2048                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2049                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2050                 # However, lack of URI has been spotted in the wild.
2051                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2052                 if not media.get('URI'):
2053                     return
2054                 url = format_url(media['URI'])
2055                 sub_info = {
2056                     'url': url,
2057                     'ext': determine_ext(url),
2058                 }
2059                 if sub_info['ext'] == 'm3u8':
2060                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2061                     # files may contain is WebVTT:
2062                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2063                     sub_info['ext'] = 'vtt'
2064                     sub_info['protocol'] = 'm3u8_native'
2065                 lang = media.get('LANGUAGE') or 'und'
2066                 subtitles.setdefault(lang, []).append(sub_info)
2067             if media_type not in ('VIDEO', 'AUDIO'):
2068                 return
2069             media_url = media.get('URI')
2070             if media_url:
2071                 manifest_url = format_url(media_url)
2072                 formats.extend({
2073                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2074                     'format_note': name,
2075                     'format_index': idx,
2076                     'url': manifest_url,
2077                     'manifest_url': m3u8_url,
2078                     'language': media.get('LANGUAGE'),
2079                     'ext': ext,
2080                     'protocol': entry_protocol,
2081                     'preference': preference,
2082                     'quality': quality,
2083                     'has_drm': has_drm,
2084                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2085                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2086
2087         def build_stream_name():
2088             # Despite specification does not mention NAME attribute for
2089             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2090             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2091             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2092             stream_name = last_stream_inf.get('NAME')
2093             if stream_name:
2094                 return stream_name
2095             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2096             # from corresponding rendition group
2097             stream_group_id = last_stream_inf.get('VIDEO')
2098             if not stream_group_id:
2099                 return
2100             stream_group = groups.get(stream_group_id)
2101             if not stream_group:
2102                 return stream_group_id
2103             rendition = stream_group[0]
2104             return rendition.get('NAME') or stream_group_id
2105
2106         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2107         # chance to detect video only formats when EXT-X-STREAM-INF tags
2108         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2109         for line in m3u8_doc.splitlines():
2110             if line.startswith('#EXT-X-MEDIA:'):
2111                 extract_media(line)
2112
2113         for line in m3u8_doc.splitlines():
2114             if line.startswith('#EXT-X-STREAM-INF:'):
2115                 last_stream_inf = parse_m3u8_attributes(line)
2116             elif line.startswith('#') or not line.strip():
2117                 continue
2118             else:
2119                 tbr = float_or_none(
2120                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2121                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2122                 manifest_url = format_url(line.strip())
2123
2124                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2125                     format_id = [m3u8_id, None, idx]
2126                     # Bandwidth of live streams may differ over time thus making
2127                     # format_id unpredictable. So it's better to keep provided
2128                     # format_id intact.
2129                     if not live:
2130                         stream_name = build_stream_name()
2131                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2132                     f = {
2133                         'format_id': join_nonempty(*format_id),
2134                         'format_index': idx,
2135                         'url': manifest_url,
2136                         'manifest_url': m3u8_url,
2137                         'tbr': tbr,
2138                         'ext': ext,
2139                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2140                         'protocol': entry_protocol,
2141                         'preference': preference,
2142                         'quality': quality,
2143                         'has_drm': has_drm,
2144                     }
2145                     resolution = last_stream_inf.get('RESOLUTION')
2146                     if resolution:
2147                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2148                         if mobj:
2149                             f['width'] = int(mobj.group('width'))
2150                             f['height'] = int(mobj.group('height'))
2151                     # Unified Streaming Platform
2152                     mobj = re.search(
2153                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2154                     if mobj:
2155                         abr, vbr = mobj.groups()
2156                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2157                         f.update({
2158                             'vbr': vbr,
2159                             'abr': abr,
2160                         })
2161                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2162                     f.update(codecs)
2163                     audio_group_id = last_stream_inf.get('AUDIO')
2164                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2165                     # references a rendition group MUST have a CODECS attribute.
2166                     # However, this is not always respected. E.g. [2]
2167                     # contains EXT-X-STREAM-INF tag which references AUDIO
2168                     # rendition group but does not have CODECS and despite
2169                     # referencing an audio group it represents a complete
2170                     # (with audio and video) format. So, for such cases we will
2171                     # ignore references to rendition groups and treat them
2172                     # as complete formats.
2173                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2174                         audio_group = groups.get(audio_group_id)
2175                         if audio_group and audio_group[0].get('URI'):
2176                             # TODO: update acodec for audio only formats with
2177                             # the same GROUP-ID
2178                             f['acodec'] = 'none'
2179                     if not f.get('ext'):
2180                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2181                     formats.append(f)
2182
2183                     # for DailyMotion
2184                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2185                     if progressive_uri:
2186                         http_f = f.copy()
2187                         del http_f['manifest_url']
2188                         http_f.update({
2189                             'format_id': f['format_id'].replace('hls-', 'http-'),
2190                             'protocol': 'http',
2191                             'url': progressive_uri,
2192                         })
2193                         formats.append(http_f)
2194
2195                 last_stream_inf = {}
2196         return formats, subtitles
2197
2198     def _extract_m3u8_vod_duration(
2199             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2200
2201         m3u8_vod = self._download_webpage(
2202             m3u8_vod_url, video_id,
2203             note='Downloading m3u8 VOD manifest' if note is None else note,
2204             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2205             fatal=False, data=data, headers=headers, query=query)
2206
2207         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2208
2209     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2210         if '#EXT-X-ENDLIST' not in m3u8_vod:
2211             return None
2212
2213         return int(sum(
2214             float(line[len('#EXTINF:'):].split(',')[0])
2215             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2216
2217     def _extract_mpd_vod_duration(
2218             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2219
2220         mpd_doc = self._download_xml(
2221             mpd_url, video_id,
2222             note='Downloading MPD VOD manifest' if note is None else note,
2223             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2224             fatal=False, data=data, headers=headers, query=query) or {}
2225         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2226
2227     @staticmethod
2228     def _xpath_ns(path, namespace=None):
2229         if not namespace:
2230             return path
2231         out = []
2232         for c in path.split('/'):
2233             if not c or c == '.':
2234                 out.append(c)
2235             else:
2236                 out.append('{%s}%s' % (namespace, c))
2237         return '/'.join(out)
2238
2239     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2240         if self.get_param('ignore_no_formats_error'):
2241             fatal = False
2242
2243         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2244         if res is False:
2245             assert not fatal
2246             return [], {}
2247
2248         smil, urlh = res
2249         smil_url = urlh.geturl()
2250
2251         namespace = self._parse_smil_namespace(smil)
2252
2253         fmts = self._parse_smil_formats(
2254             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2255         subs = self._parse_smil_subtitles(
2256             smil, namespace=namespace)
2257
2258         return fmts, subs
2259
2260     def _extract_smil_formats(self, *args, **kwargs):
2261         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2262         if subs:
2263             self._report_ignoring_subs('SMIL')
2264         return fmts
2265
2266     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2267         res = self._download_smil(smil_url, video_id, fatal=fatal)
2268         if res is False:
2269             return {}
2270
2271         smil, urlh = res
2272         smil_url = urlh.geturl()
2273
2274         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2275
2276     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2277         return self._download_xml_handle(
2278             smil_url, video_id, 'Downloading SMIL file',
2279             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2280
2281     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2282         namespace = self._parse_smil_namespace(smil)
2283
2284         formats = self._parse_smil_formats(
2285             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2286         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2287
2288         video_id = os.path.splitext(url_basename(smil_url))[0]
2289         title = None
2290         description = None
2291         upload_date = None
2292         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2293             name = meta.attrib.get('name')
2294             content = meta.attrib.get('content')
2295             if not name or not content:
2296                 continue
2297             if not title and name == 'title':
2298                 title = content
2299             elif not description and name in ('description', 'abstract'):
2300                 description = content
2301             elif not upload_date and name == 'date':
2302                 upload_date = unified_strdate(content)
2303
2304         thumbnails = [{
2305             'id': image.get('type'),
2306             'url': image.get('src'),
2307             'width': int_or_none(image.get('width')),
2308             'height': int_or_none(image.get('height')),
2309         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2310
2311         return {
2312             'id': video_id,
2313             'title': title or video_id,
2314             'description': description,
2315             'upload_date': upload_date,
2316             'thumbnails': thumbnails,
2317             'formats': formats,
2318             'subtitles': subtitles,
2319         }
2320
2321     def _parse_smil_namespace(self, smil):
2322         return self._search_regex(
2323             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2324
2325     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2326         base = smil_url
2327         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2328             b = meta.get('base') or meta.get('httpBase')
2329             if b:
2330                 base = b
2331                 break
2332
2333         formats = []
2334         rtmp_count = 0
2335         http_count = 0
2336         m3u8_count = 0
2337         imgs_count = 0
2338
2339         srcs = set()
2340         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2341         for medium in media:
2342             src = medium.get('src')
2343             if not src or src in srcs:
2344                 continue
2345             srcs.add(src)
2346
2347             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2348             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2349             width = int_or_none(medium.get('width'))
2350             height = int_or_none(medium.get('height'))
2351             proto = medium.get('proto')
2352             ext = medium.get('ext')
2353             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2354                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2355             streamer = medium.get('streamer') or base
2356
2357             if proto == 'rtmp' or streamer.startswith('rtmp'):
2358                 rtmp_count += 1
2359                 formats.append({
2360                     'url': streamer,
2361                     'play_path': src,
2362                     'ext': 'flv',
2363                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2364                     'tbr': bitrate,
2365                     'filesize': filesize,
2366                     'width': width,
2367                     'height': height,
2368                 })
2369                 if transform_rtmp_url:
2370                     streamer, src = transform_rtmp_url(streamer, src)
2371                     formats[-1].update({
2372                         'url': streamer,
2373                         'play_path': src,
2374                     })
2375                 continue
2376
2377             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2378             src_url = src_url.strip()
2379
2380             if proto == 'm3u8' or src_ext == 'm3u8':
2381                 m3u8_formats = self._extract_m3u8_formats(
2382                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2383                 if len(m3u8_formats) == 1:
2384                     m3u8_count += 1
2385                     m3u8_formats[0].update({
2386                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2387                         'tbr': bitrate,
2388                         'width': width,
2389                         'height': height,
2390                     })
2391                 formats.extend(m3u8_formats)
2392             elif src_ext == 'f4m':
2393                 f4m_url = src_url
2394                 if not f4m_params:
2395                     f4m_params = {
2396                         'hdcore': '3.2.0',
2397                         'plugin': 'flowplayer-3.2.0.1',
2398                     }
2399                 f4m_url += '&' if '?' in f4m_url else '?'
2400                 f4m_url += urllib.parse.urlencode(f4m_params)
2401                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2402             elif src_ext == 'mpd':
2403                 formats.extend(self._extract_mpd_formats(
2404                     src_url, video_id, mpd_id='dash', fatal=False))
2405             elif re.search(r'\.ism/[Mm]anifest', src_url):
2406                 formats.extend(self._extract_ism_formats(
2407                     src_url, video_id, ism_id='mss', fatal=False))
2408             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2409                 http_count += 1
2410                 formats.append({
2411                     'url': src_url,
2412                     'ext': ext or src_ext or 'flv',
2413                     'format_id': 'http-%d' % (bitrate or http_count),
2414                     'tbr': bitrate,
2415                     'filesize': filesize,
2416                     'width': width,
2417                     'height': height,
2418                 })
2419
2420         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2421             src = medium.get('src')
2422             if not src or src in srcs:
2423                 continue
2424             srcs.add(src)
2425
2426             imgs_count += 1
2427             formats.append({
2428                 'format_id': 'imagestream-%d' % (imgs_count),
2429                 'url': src,
2430                 'ext': mimetype2ext(medium.get('type')),
2431                 'acodec': 'none',
2432                 'vcodec': 'none',
2433                 'width': int_or_none(medium.get('width')),
2434                 'height': int_or_none(medium.get('height')),
2435                 'format_note': 'SMIL storyboards',
2436             })
2437
2438         return formats
2439
2440     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2441         urls = []
2442         subtitles = {}
2443         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2444             src = textstream.get('src')
2445             if not src or src in urls:
2446                 continue
2447             urls.append(src)
2448             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2449             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2450             subtitles.setdefault(lang, []).append({
2451                 'url': src,
2452                 'ext': ext,
2453             })
2454         return subtitles
2455
2456     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2457         res = self._download_xml_handle(
2458             xspf_url, playlist_id, 'Downloading xpsf playlist',
2459             'Unable to download xspf manifest', fatal=fatal)
2460         if res is False:
2461             return []
2462
2463         xspf, urlh = res
2464         xspf_url = urlh.geturl()
2465
2466         return self._parse_xspf(
2467             xspf, playlist_id, xspf_url=xspf_url,
2468             xspf_base_url=base_url(xspf_url))
2469
2470     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2471         NS_MAP = {
2472             'xspf': 'http://xspf.org/ns/0/',
2473             's1': 'http://static.streamone.nl/player/ns/0',
2474         }
2475
2476         entries = []
2477         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2478             title = xpath_text(
2479                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2480             description = xpath_text(
2481                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2482             thumbnail = xpath_text(
2483                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2484             duration = float_or_none(
2485                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2486
2487             formats = []
2488             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2489                 format_url = urljoin(xspf_base_url, location.text)
2490                 if not format_url:
2491                     continue
2492                 formats.append({
2493                     'url': format_url,
2494                     'manifest_url': xspf_url,
2495                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2496                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2497                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2498                 })
2499
2500             entries.append({
2501                 'id': playlist_id,
2502                 'title': title,
2503                 'description': description,
2504                 'thumbnail': thumbnail,
2505                 'duration': duration,
2506                 'formats': formats,
2507             })
2508         return entries
2509
2510     def _extract_mpd_formats(self, *args, **kwargs):
2511         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2512         if subs:
2513             self._report_ignoring_subs('DASH')
2514         return fmts
2515
2516     def _extract_mpd_formats_and_subtitles(
2517             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2518             fatal=True, data=None, headers={}, query={}):
2519
2520         if self.get_param('ignore_no_formats_error'):
2521             fatal = False
2522
2523         res = self._download_xml_handle(
2524             mpd_url, video_id,
2525             note='Downloading MPD manifest' if note is None else note,
2526             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2527             fatal=fatal, data=data, headers=headers, query=query)
2528         if res is False:
2529             return [], {}
2530         mpd_doc, urlh = res
2531         if mpd_doc is None:
2532             return [], {}
2533
2534         # We could have been redirected to a new url when we retrieved our mpd file.
2535         mpd_url = urlh.geturl()
2536         mpd_base_url = base_url(mpd_url)
2537
2538         return self._parse_mpd_formats_and_subtitles(
2539             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2540
2541     def _parse_mpd_formats(self, *args, **kwargs):
2542         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2543         if subs:
2544             self._report_ignoring_subs('DASH')
2545         return fmts
2546
2547     def _parse_mpd_formats_and_subtitles(
2548             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2549         """
2550         Parse formats from MPD manifest.
2551         References:
2552          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2553             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2554          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2555         """
2556         if not self.get_param('dynamic_mpd', True):
2557             if mpd_doc.get('type') == 'dynamic':
2558                 return [], {}
2559
2560         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2561
2562         def _add_ns(path):
2563             return self._xpath_ns(path, namespace)
2564
2565         def is_drm_protected(element):
2566             return element.find(_add_ns('ContentProtection')) is not None
2567
2568         def extract_multisegment_info(element, ms_parent_info):
2569             ms_info = ms_parent_info.copy()
2570
2571             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2572             # common attributes and elements.  We will only extract relevant
2573             # for us.
2574             def extract_common(source):
2575                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2576                 if segment_timeline is not None:
2577                     s_e = segment_timeline.findall(_add_ns('S'))
2578                     if s_e:
2579                         ms_info['total_number'] = 0
2580                         ms_info['s'] = []
2581                         for s in s_e:
2582                             r = int(s.get('r', 0))
2583                             ms_info['total_number'] += 1 + r
2584                             ms_info['s'].append({
2585                                 't': int(s.get('t', 0)),
2586                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2587                                 'd': int(s.attrib['d']),
2588                                 'r': r,
2589                             })
2590                 start_number = source.get('startNumber')
2591                 if start_number:
2592                     ms_info['start_number'] = int(start_number)
2593                 timescale = source.get('timescale')
2594                 if timescale:
2595                     ms_info['timescale'] = int(timescale)
2596                 segment_duration = source.get('duration')
2597                 if segment_duration:
2598                     ms_info['segment_duration'] = float(segment_duration)
2599
2600             def extract_Initialization(source):
2601                 initialization = source.find(_add_ns('Initialization'))
2602                 if initialization is not None:
2603                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2604
2605             segment_list = element.find(_add_ns('SegmentList'))
2606             if segment_list is not None:
2607                 extract_common(segment_list)
2608                 extract_Initialization(segment_list)
2609                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2610                 if segment_urls_e:
2611                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2612             else:
2613                 segment_template = element.find(_add_ns('SegmentTemplate'))
2614                 if segment_template is not None:
2615                     extract_common(segment_template)
2616                     media = segment_template.get('media')
2617                     if media:
2618                         ms_info['media'] = media
2619                     initialization = segment_template.get('initialization')
2620                     if initialization:
2621                         ms_info['initialization'] = initialization
2622                     else:
2623                         extract_Initialization(segment_template)
2624             return ms_info
2625
2626         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2627         formats, subtitles = [], {}
2628         stream_numbers = collections.defaultdict(int)
2629         for period in mpd_doc.findall(_add_ns('Period')):
2630             period_duration = parse_duration(period.get('duration')) or mpd_duration
2631             period_ms_info = extract_multisegment_info(period, {
2632                 'start_number': 1,
2633                 'timescale': 1,
2634             })
2635             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2636                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2637                 for representation in adaptation_set.findall(_add_ns('Representation')):
2638                     representation_attrib = adaptation_set.attrib.copy()
2639                     representation_attrib.update(representation.attrib)
2640                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2641                     mime_type = representation_attrib['mimeType']
2642                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2643
2644                     codec_str = representation_attrib.get('codecs', '')
2645                     # Some kind of binary subtitle found in some youtube livestreams
2646                     if mime_type == 'application/x-rawcc':
2647                         codecs = {'scodec': codec_str}
2648                     else:
2649                         codecs = parse_codecs(codec_str)
2650                     if content_type not in ('video', 'audio', 'text'):
2651                         if mime_type == 'image/jpeg':
2652                             content_type = mime_type
2653                         elif codecs.get('vcodec', 'none') != 'none':
2654                             content_type = 'video'
2655                         elif codecs.get('acodec', 'none') != 'none':
2656                             content_type = 'audio'
2657                         elif codecs.get('scodec', 'none') != 'none':
2658                             content_type = 'text'
2659                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2660                             content_type = 'text'
2661                         else:
2662                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2663                             continue
2664
2665                     base_url = ''
2666                     for element in (representation, adaptation_set, period, mpd_doc):
2667                         base_url_e = element.find(_add_ns('BaseURL'))
2668                         if try_call(lambda: base_url_e.text) is not None:
2669                             base_url = base_url_e.text + base_url
2670                             if re.match(r'^https?://', base_url):
2671                                 break
2672                     if mpd_base_url and base_url.startswith('/'):
2673                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2674                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2675                         if not mpd_base_url.endswith('/'):
2676                             mpd_base_url += '/'
2677                         base_url = mpd_base_url + base_url
2678                     representation_id = representation_attrib.get('id')
2679                     lang = representation_attrib.get('lang')
2680                     url_el = representation.find(_add_ns('BaseURL'))
2681                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2682                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2683                     if representation_id is not None:
2684                         format_id = representation_id
2685                     else:
2686                         format_id = content_type
2687                     if mpd_id:
2688                         format_id = mpd_id + '-' + format_id
2689                     if content_type in ('video', 'audio'):
2690                         f = {
2691                             'format_id': format_id,
2692                             'manifest_url': mpd_url,
2693                             'ext': mimetype2ext(mime_type),
2694                             'width': int_or_none(representation_attrib.get('width')),
2695                             'height': int_or_none(representation_attrib.get('height')),
2696                             'tbr': float_or_none(bandwidth, 1000),
2697                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2698                             'fps': int_or_none(representation_attrib.get('frameRate')),
2699                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2700                             'format_note': 'DASH %s' % content_type,
2701                             'filesize': filesize,
2702                             'container': mimetype2ext(mime_type) + '_dash',
2703                             **codecs
2704                         }
2705                     elif content_type == 'text':
2706                         f = {
2707                             'ext': mimetype2ext(mime_type),
2708                             'manifest_url': mpd_url,
2709                             'filesize': filesize,
2710                         }
2711                     elif content_type == 'image/jpeg':
2712                         # See test case in VikiIE
2713                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2714                         f = {
2715                             'format_id': format_id,
2716                             'ext': 'mhtml',
2717                             'manifest_url': mpd_url,
2718                             'format_note': 'DASH storyboards (jpeg)',
2719                             'acodec': 'none',
2720                             'vcodec': 'none',
2721                         }
2722                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2723                         f['has_drm'] = True
2724                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2725
2726                     def prepare_template(template_name, identifiers):
2727                         tmpl = representation_ms_info[template_name]
2728                         if representation_id is not None:
2729                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2730                         # First of, % characters outside $...$ templates
2731                         # must be escaped by doubling for proper processing
2732                         # by % operator string formatting used further (see
2733                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2734                         t = ''
2735                         in_template = False
2736                         for c in tmpl:
2737                             t += c
2738                             if c == '$':
2739                                 in_template = not in_template
2740                             elif c == '%' and not in_template:
2741                                 t += c
2742                         # Next, $...$ templates are translated to their
2743                         # %(...) counterparts to be used with % operator
2744                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2745                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2746                         t.replace('$$', '$')
2747                         return t
2748
2749                     # @initialization is a regular template like @media one
2750                     # so it should be handled just the same way (see
2751                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2752                     if 'initialization' in representation_ms_info:
2753                         initialization_template = prepare_template(
2754                             'initialization',
2755                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2756                             # $Time$ shall not be included for @initialization thus
2757                             # only $Bandwidth$ remains
2758                             ('Bandwidth', ))
2759                         representation_ms_info['initialization_url'] = initialization_template % {
2760                             'Bandwidth': bandwidth,
2761                         }
2762
2763                     def location_key(location):
2764                         return 'url' if re.match(r'^https?://', location) else 'path'
2765
2766                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2767
2768                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2769                         media_location_key = location_key(media_template)
2770
2771                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2772                         # can't be used at the same time
2773                         if '%(Number' in media_template and 's' not in representation_ms_info:
2774                             segment_duration = None
2775                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2776                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2777                                 representation_ms_info['total_number'] = int(math.ceil(
2778                                     float_or_none(period_duration, segment_duration, default=0)))
2779                             representation_ms_info['fragments'] = [{
2780                                 media_location_key: media_template % {
2781                                     'Number': segment_number,
2782                                     'Bandwidth': bandwidth,
2783                                 },
2784                                 'duration': segment_duration,
2785                             } for segment_number in range(
2786                                 representation_ms_info['start_number'],
2787                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2788                         else:
2789                             # $Number*$ or $Time$ in media template with S list available
2790                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2791                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2792                             representation_ms_info['fragments'] = []
2793                             segment_time = 0
2794                             segment_d = None
2795                             segment_number = representation_ms_info['start_number']
2796
2797                             def add_segment_url():
2798                                 segment_url = media_template % {
2799                                     'Time': segment_time,
2800                                     'Bandwidth': bandwidth,
2801                                     'Number': segment_number,
2802                                 }
2803                                 representation_ms_info['fragments'].append({
2804                                     media_location_key: segment_url,
2805                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2806                                 })
2807
2808                             for num, s in enumerate(representation_ms_info['s']):
2809                                 segment_time = s.get('t') or segment_time
2810                                 segment_d = s['d']
2811                                 add_segment_url()
2812                                 segment_number += 1
2813                                 for r in range(s.get('r', 0)):
2814                                     segment_time += segment_d
2815                                     add_segment_url()
2816                                     segment_number += 1
2817                                 segment_time += segment_d
2818                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2819                         # No media template,
2820                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2821                         # or any YouTube dashsegments video
2822                         fragments = []
2823                         segment_index = 0
2824                         timescale = representation_ms_info['timescale']
2825                         for s in representation_ms_info['s']:
2826                             duration = float_or_none(s['d'], timescale)
2827                             for r in range(s.get('r', 0) + 1):
2828                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2829                                 fragments.append({
2830                                     location_key(segment_uri): segment_uri,
2831                                     'duration': duration,
2832                                 })
2833                                 segment_index += 1
2834                         representation_ms_info['fragments'] = fragments
2835                     elif 'segment_urls' in representation_ms_info:
2836                         # Segment URLs with no SegmentTimeline
2837                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2838                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2839                         fragments = []
2840                         segment_duration = float_or_none(
2841                             representation_ms_info['segment_duration'],
2842                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2843                         for segment_url in representation_ms_info['segment_urls']:
2844                             fragment = {
2845                                 location_key(segment_url): segment_url,
2846                             }
2847                             if segment_duration:
2848                                 fragment['duration'] = segment_duration
2849                             fragments.append(fragment)
2850                         representation_ms_info['fragments'] = fragments
2851                     # If there is a fragments key available then we correctly recognized fragmented media.
2852                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2853                     # assumption is not necessarily correct since we may simply have no support for
2854                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2855                     if 'fragments' in representation_ms_info:
2856                         f.update({
2857                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2858                             'url': mpd_url or base_url,
2859                             'fragment_base_url': base_url,
2860                             'fragments': [],
2861                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2862                         })
2863                         if 'initialization_url' in representation_ms_info:
2864                             initialization_url = representation_ms_info['initialization_url']
2865                             if not f.get('url'):
2866                                 f['url'] = initialization_url
2867                             f['fragments'].append({location_key(initialization_url): initialization_url})
2868                         f['fragments'].extend(representation_ms_info['fragments'])
2869                         if not period_duration:
2870                             period_duration = try_get(
2871                                 representation_ms_info,
2872                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2873                     else:
2874                         # Assuming direct URL to unfragmented media.
2875                         f['url'] = base_url
2876                     if content_type in ('video', 'audio', 'image/jpeg'):
2877                         f['manifest_stream_number'] = stream_numbers[f['url']]
2878                         stream_numbers[f['url']] += 1
2879                         formats.append(f)
2880                     elif content_type == 'text':
2881                         subtitles.setdefault(lang or 'und', []).append(f)
2882
2883         return formats, subtitles
2884
2885     def _extract_ism_formats(self, *args, **kwargs):
2886         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2887         if subs:
2888             self._report_ignoring_subs('ISM')
2889         return fmts
2890
2891     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2892         if self.get_param('ignore_no_formats_error'):
2893             fatal = False
2894
2895         res = self._download_xml_handle(
2896             ism_url, video_id,
2897             note='Downloading ISM manifest' if note is None else note,
2898             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2899             fatal=fatal, data=data, headers=headers, query=query)
2900         if res is False:
2901             return [], {}
2902         ism_doc, urlh = res
2903         if ism_doc is None:
2904             return [], {}
2905
2906         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2907
2908     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2909         """
2910         Parse formats from ISM manifest.
2911         References:
2912          1. [MS-SSTR]: Smooth Streaming Protocol,
2913             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2914         """
2915         if ism_doc.get('IsLive') == 'TRUE':
2916             return [], {}
2917
2918         duration = int(ism_doc.attrib['Duration'])
2919         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2920
2921         formats = []
2922         subtitles = {}
2923         for stream in ism_doc.findall('StreamIndex'):
2924             stream_type = stream.get('Type')
2925             if stream_type not in ('video', 'audio', 'text'):
2926                 continue
2927             url_pattern = stream.attrib['Url']
2928             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2929             stream_name = stream.get('Name')
2930             stream_language = stream.get('Language', 'und')
2931             for track in stream.findall('QualityLevel'):
2932                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2933                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2934                 # TODO: add support for WVC1 and WMAP
2935                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2936                     self.report_warning('%s is not a supported codec' % fourcc)
2937                     continue
2938                 tbr = int(track.attrib['Bitrate']) // 1000
2939                 # [1] does not mention Width and Height attributes. However,
2940                 # they're often present while MaxWidth and MaxHeight are
2941                 # missing, so should be used as fallbacks
2942                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2943                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2944                 sampling_rate = int_or_none(track.get('SamplingRate'))
2945
2946                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2947                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2948
2949                 fragments = []
2950                 fragment_ctx = {
2951                     'time': 0,
2952                 }
2953                 stream_fragments = stream.findall('c')
2954                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2955                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2956                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2957                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2958                     if not fragment_ctx['duration']:
2959                         try:
2960                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2961                         except IndexError:
2962                             next_fragment_time = duration
2963                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2964                     for _ in range(fragment_repeat):
2965                         fragments.append({
2966                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2967                             'duration': fragment_ctx['duration'] / stream_timescale,
2968                         })
2969                         fragment_ctx['time'] += fragment_ctx['duration']
2970
2971                 if stream_type == 'text':
2972                     subtitles.setdefault(stream_language, []).append({
2973                         'ext': 'ismt',
2974                         'protocol': 'ism',
2975                         'url': ism_url,
2976                         'manifest_url': ism_url,
2977                         'fragments': fragments,
2978                         '_download_params': {
2979                             'stream_type': stream_type,
2980                             'duration': duration,
2981                             'timescale': stream_timescale,
2982                             'fourcc': fourcc,
2983                             'language': stream_language,
2984                             'codec_private_data': track.get('CodecPrivateData'),
2985                         }
2986                     })
2987                 elif stream_type in ('video', 'audio'):
2988                     formats.append({
2989                         'format_id': join_nonempty(ism_id, stream_name, tbr),
2990                         'url': ism_url,
2991                         'manifest_url': ism_url,
2992                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2993                         'width': width,
2994                         'height': height,
2995                         'tbr': tbr,
2996                         'asr': sampling_rate,
2997                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2998                         'acodec': 'none' if stream_type == 'video' else fourcc,
2999                         'protocol': 'ism',
3000                         'fragments': fragments,
3001                         'has_drm': ism_doc.find('Protection') is not None,
3002                         'language': stream_language,
3003                         'audio_channels': int_or_none(track.get('Channels')),
3004                         '_download_params': {
3005                             'stream_type': stream_type,
3006                             'duration': duration,
3007                             'timescale': stream_timescale,
3008                             'width': width or 0,
3009                             'height': height or 0,
3010                             'fourcc': fourcc,
3011                             'language': stream_language,
3012                             'codec_private_data': track.get('CodecPrivateData'),
3013                             'sampling_rate': sampling_rate,
3014                             'channels': int_or_none(track.get('Channels', 2)),
3015                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3016                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3017                         },
3018                     })
3019         return formats, subtitles
3020
3021     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3022         def absolute_url(item_url):
3023             return urljoin(base_url, item_url)
3024
3025         def parse_content_type(content_type):
3026             if not content_type:
3027                 return {}
3028             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3029             if ctr:
3030                 mimetype, codecs = ctr.groups()
3031                 f = parse_codecs(codecs)
3032                 f['ext'] = mimetype2ext(mimetype)
3033                 return f
3034             return {}
3035
3036         def _media_formats(src, cur_media_type, type_info=None):
3037             type_info = type_info or {}
3038             full_url = absolute_url(src)
3039             ext = type_info.get('ext') or determine_ext(full_url)
3040             if ext == 'm3u8':
3041                 is_plain_url = False
3042                 formats = self._extract_m3u8_formats(
3043                     full_url, video_id, ext='mp4',
3044                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3045                     preference=preference, quality=quality, fatal=False)
3046             elif ext == 'mpd':
3047                 is_plain_url = False
3048                 formats = self._extract_mpd_formats(
3049                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3050             else:
3051                 is_plain_url = True
3052                 formats = [{
3053                     'url': full_url,
3054                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3055                     'ext': ext,
3056                 }]
3057             return is_plain_url, formats
3058
3059         entries = []
3060         # amp-video and amp-audio are very similar to their HTML5 counterparts
3061         # so we will include them right here (see
3062         # https://www.ampproject.org/docs/reference/components/amp-video)
3063         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3064         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3065         media_tags = [(media_tag, media_tag_name, media_type, '')
3066                       for media_tag, media_tag_name, media_type
3067                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3068         media_tags.extend(re.findall(
3069             # We only allow video|audio followed by a whitespace or '>'.
3070             # Allowing more characters may end up in significant slow down (see
3071             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3072             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3073             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3074         for media_tag, _, media_type, media_content in media_tags:
3075             media_info = {
3076                 'formats': [],
3077                 'subtitles': {},
3078             }
3079             media_attributes = extract_attributes(media_tag)
3080             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3081             if src:
3082                 f = parse_content_type(media_attributes.get('type'))
3083                 _, formats = _media_formats(src, media_type, f)
3084                 media_info['formats'].extend(formats)
3085             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3086             if media_content:
3087                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3088                     s_attr = extract_attributes(source_tag)
3089                     # data-video-src and data-src are non standard but seen
3090                     # several times in the wild
3091                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3092                     if not src:
3093                         continue
3094                     f = parse_content_type(s_attr.get('type'))
3095                     is_plain_url, formats = _media_formats(src, media_type, f)
3096                     if is_plain_url:
3097                         # width, height, res, label and title attributes are
3098                         # all not standard but seen several times in the wild
3099                         labels = [
3100                             s_attr.get(lbl)
3101                             for lbl in ('label', 'title')
3102                             if str_or_none(s_attr.get(lbl))
3103                         ]
3104                         width = int_or_none(s_attr.get('width'))
3105                         height = (int_or_none(s_attr.get('height'))
3106                                   or int_or_none(s_attr.get('res')))
3107                         if not width or not height:
3108                             for lbl in labels:
3109                                 resolution = parse_resolution(lbl)
3110                                 if not resolution:
3111                                     continue
3112                                 width = width or resolution.get('width')
3113                                 height = height or resolution.get('height')
3114                         for lbl in labels:
3115                             tbr = parse_bitrate(lbl)
3116                             if tbr:
3117                                 break
3118                         else:
3119                             tbr = None
3120                         f.update({
3121                             'width': width,
3122                             'height': height,
3123                             'tbr': tbr,
3124                             'format_id': s_attr.get('label') or s_attr.get('title'),
3125                         })
3126                         f.update(formats[0])
3127                         media_info['formats'].append(f)
3128                     else:
3129                         media_info['formats'].extend(formats)
3130                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3131                     track_attributes = extract_attributes(track_tag)
3132                     kind = track_attributes.get('kind')
3133                     if not kind or kind in ('subtitles', 'captions'):
3134                         src = strip_or_none(track_attributes.get('src'))
3135                         if not src:
3136                             continue
3137                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3138                         media_info['subtitles'].setdefault(lang, []).append({
3139                             'url': absolute_url(src),
3140                         })
3141             for f in media_info['formats']:
3142                 f.setdefault('http_headers', {})['Referer'] = base_url
3143             if media_info['formats'] or media_info['subtitles']:
3144                 entries.append(media_info)
3145         return entries
3146
3147     def _extract_akamai_formats(self, *args, **kwargs):
3148         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3149         if subs:
3150             self._report_ignoring_subs('akamai')
3151         return fmts
3152
3153     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3154         signed = 'hdnea=' in manifest_url
3155         if not signed:
3156             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3157             manifest_url = re.sub(
3158                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3159                 '', manifest_url).strip('?')
3160
3161         formats = []
3162         subtitles = {}
3163
3164         hdcore_sign = 'hdcore=3.7.0'
3165         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3166         hds_host = hosts.get('hds')
3167         if hds_host:
3168             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3169         if 'hdcore=' not in f4m_url:
3170             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3171         f4m_formats = self._extract_f4m_formats(
3172             f4m_url, video_id, f4m_id='hds', fatal=False)
3173         for entry in f4m_formats:
3174             entry.update({'extra_param_to_segment_url': hdcore_sign})
3175         formats.extend(f4m_formats)
3176
3177         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3178         hls_host = hosts.get('hls')
3179         if hls_host:
3180             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3181         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3182             m3u8_url, video_id, 'mp4', 'm3u8_native',
3183             m3u8_id='hls', fatal=False)
3184         formats.extend(m3u8_formats)
3185         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3186
3187         http_host = hosts.get('http')
3188         if http_host and m3u8_formats and not signed:
3189             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3190             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3191             qualities_length = len(qualities)
3192             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3193                 i = 0
3194                 for f in m3u8_formats:
3195                     if f['vcodec'] != 'none':
3196                         for protocol in ('http', 'https'):
3197                             http_f = f.copy()
3198                             del http_f['manifest_url']
3199                             http_url = re.sub(
3200                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3201                             http_f.update({
3202                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3203                                 'url': http_url,
3204                                 'protocol': protocol,
3205                             })
3206                             formats.append(http_f)
3207                         i += 1
3208
3209         return formats, subtitles
3210
3211     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3212         query = urllib.parse.urlparse(url).query
3213         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3214         mobj = re.search(
3215             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3216         url_base = mobj.group('url')
3217         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3218         formats = []
3219
3220         def manifest_url(manifest):
3221             m_url = f'{http_base_url}/{manifest}'
3222             if query:
3223                 m_url += '?%s' % query
3224             return m_url
3225
3226         if 'm3u8' not in skip_protocols:
3227             formats.extend(self._extract_m3u8_formats(
3228                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3229                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3230         if 'f4m' not in skip_protocols:
3231             formats.extend(self._extract_f4m_formats(
3232                 manifest_url('manifest.f4m'),
3233                 video_id, f4m_id='hds', fatal=False))
3234         if 'dash' not in skip_protocols:
3235             formats.extend(self._extract_mpd_formats(
3236                 manifest_url('manifest.mpd'),
3237                 video_id, mpd_id='dash', fatal=False))
3238         if re.search(r'(?:/smil:|\.smil)', url_base):
3239             if 'smil' not in skip_protocols:
3240                 rtmp_formats = self._extract_smil_formats(
3241                     manifest_url('jwplayer.smil'),
3242                     video_id, fatal=False)
3243                 for rtmp_format in rtmp_formats:
3244                     rtsp_format = rtmp_format.copy()
3245                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3246                     del rtsp_format['play_path']
3247                     del rtsp_format['ext']
3248                     rtsp_format.update({
3249                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3250                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3251                         'protocol': 'rtsp',
3252                     })
3253                     formats.extend([rtmp_format, rtsp_format])
3254         else:
3255             for protocol in ('rtmp', 'rtsp'):
3256                 if protocol not in skip_protocols:
3257                     formats.append({
3258                         'url': f'{protocol}:{url_base}',
3259                         'format_id': protocol,
3260                         'protocol': protocol,
3261                     })
3262         return formats
3263
3264     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3265         mobj = re.search(
3266             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3267             webpage)
3268         if mobj:
3269             try:
3270                 jwplayer_data = self._parse_json(mobj.group('options'),
3271                                                  video_id=video_id,
3272                                                  transform_source=transform_source)
3273             except ExtractorError:
3274                 pass
3275             else:
3276                 if isinstance(jwplayer_data, dict):
3277                     return jwplayer_data
3278
3279     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3280         jwplayer_data = self._find_jwplayer_data(
3281             webpage, video_id, transform_source=js_to_json)
3282         return self._parse_jwplayer_data(
3283             jwplayer_data, video_id, *args, **kwargs)
3284
3285     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3286                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3287         entries = []
3288         if not isinstance(jwplayer_data, dict):
3289             return entries
3290
3291         playlist_items = jwplayer_data.get('playlist')
3292         # JWPlayer backward compatibility: single playlist item/flattened playlists
3293         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3294         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3295         if not isinstance(playlist_items, list):
3296             playlist_items = (playlist_items or jwplayer_data, )
3297
3298         for video_data in playlist_items:
3299             if not isinstance(video_data, dict):
3300                 continue
3301             # JWPlayer backward compatibility: flattened sources
3302             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3303             if 'sources' not in video_data:
3304                 video_data['sources'] = [video_data]
3305
3306             this_video_id = video_id or video_data['mediaid']
3307
3308             formats = self._parse_jwplayer_formats(
3309                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3310                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3311
3312             subtitles = {}
3313             tracks = video_data.get('tracks')
3314             if tracks and isinstance(tracks, list):
3315                 for track in tracks:
3316                     if not isinstance(track, dict):
3317                         continue
3318                     track_kind = track.get('kind')
3319                     if not track_kind or not isinstance(track_kind, str):
3320                         continue
3321                     if track_kind.lower() not in ('captions', 'subtitles'):
3322                         continue
3323                     track_url = urljoin(base_url, track.get('file'))
3324                     if not track_url:
3325                         continue
3326                     subtitles.setdefault(track.get('label') or 'en', []).append({
3327                         'url': self._proto_relative_url(track_url)
3328                     })
3329
3330             entry = {
3331                 'id': this_video_id,
3332                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3333                 'description': clean_html(video_data.get('description')),
3334                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3335                 'timestamp': int_or_none(video_data.get('pubdate')),
3336                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3337                 'subtitles': subtitles,
3338                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3339                 'genre': clean_html(video_data.get('genre')),
3340                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3341                 'season_number': int_or_none(video_data.get('season')),
3342                 'episode_number': int_or_none(video_data.get('episode')),
3343                 'release_year': int_or_none(video_data.get('releasedate')),
3344                 'age_limit': int_or_none(video_data.get('age_restriction')),
3345             }
3346             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3347             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3348                 entry.update({
3349                     '_type': 'url_transparent',
3350                     'url': formats[0]['url'],
3351                 })
3352             else:
3353                 entry['formats'] = formats
3354             entries.append(entry)
3355         if len(entries) == 1:
3356             return entries[0]
3357         else:
3358             return self.playlist_result(entries)
3359
3360     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3361                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3362         urls = set()
3363         formats = []
3364         for source in jwplayer_sources_data:
3365             if not isinstance(source, dict):
3366                 continue
3367             source_url = urljoin(
3368                 base_url, self._proto_relative_url(source.get('file')))
3369             if not source_url or source_url in urls:
3370                 continue
3371             urls.add(source_url)
3372             source_type = source.get('type') or ''
3373             ext = mimetype2ext(source_type) or determine_ext(source_url)
3374             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3375                 formats.extend(self._extract_m3u8_formats(
3376                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3377                     m3u8_id=m3u8_id, fatal=False))
3378             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3379                 formats.extend(self._extract_mpd_formats(
3380                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3381             elif ext == 'smil':
3382                 formats.extend(self._extract_smil_formats(
3383                     source_url, video_id, fatal=False))
3384             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3385             elif source_type.startswith('audio') or ext in (
3386                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3387                 formats.append({
3388                     'url': source_url,
3389                     'vcodec': 'none',
3390                     'ext': ext,
3391                 })
3392             else:
3393                 format_id = str_or_none(source.get('label'))
3394                 height = int_or_none(source.get('height'))
3395                 if height is None and format_id:
3396                     # Often no height is provided but there is a label in
3397                     # format like "1080p", "720p SD", or 1080.
3398                     height = parse_resolution(format_id).get('height')
3399                 a_format = {
3400                     'url': source_url,
3401                     'width': int_or_none(source.get('width')),
3402                     'height': height,
3403                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3404                     'filesize': int_or_none(source.get('filesize')),
3405                     'ext': ext,
3406                     'format_id': format_id
3407                 }
3408                 if source_url.startswith('rtmp'):
3409                     a_format['ext'] = 'flv'
3410                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3411                     # of jwplayer.flash.swf
3412                     rtmp_url_parts = re.split(
3413                         r'((?:mp4|mp3|flv):)', source_url, 1)
3414                     if len(rtmp_url_parts) == 3:
3415                         rtmp_url, prefix, play_path = rtmp_url_parts
3416                         a_format.update({
3417                             'url': rtmp_url,
3418                             'play_path': prefix + play_path,
3419                         })
3420                     if rtmp_params:
3421                         a_format.update(rtmp_params)
3422                 formats.append(a_format)
3423         return formats
3424
3425     def _live_title(self, name):
3426         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3427         return name
3428
3429     def _int(self, v, name, fatal=False, **kwargs):
3430         res = int_or_none(v, **kwargs)
3431         if res is None:
3432             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3433             if fatal:
3434                 raise ExtractorError(msg)
3435             else:
3436                 self.report_warning(msg)
3437         return res
3438
3439     def _float(self, v, name, fatal=False, **kwargs):
3440         res = float_or_none(v, **kwargs)
3441         if res is None:
3442             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3443             if fatal:
3444                 raise ExtractorError(msg)
3445             else:
3446                 self.report_warning(msg)
3447         return res
3448
3449     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3450                     path='/', secure=False, discard=False, rest={}, **kwargs):
3451         cookie = http.cookiejar.Cookie(
3452             0, name, value, port, port is not None, domain, True,
3453             domain.startswith('.'), path, True, secure, expire_time,
3454             discard, None, None, rest)
3455         self.cookiejar.set_cookie(cookie)
3456
3457     def _get_cookies(self, url):
3458         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3459         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3460
3461     def _apply_first_set_cookie_header(self, url_handle, cookie):
3462         """
3463         Apply first Set-Cookie header instead of the last. Experimental.
3464
3465         Some sites (e.g. [1-3]) may serve two cookies under the same name
3466         in Set-Cookie header and expect the first (old) one to be set rather
3467         than second (new). However, as of RFC6265 the newer one cookie
3468         should be set into cookie store what actually happens.
3469         We will workaround this issue by resetting the cookie to
3470         the first one manually.
3471         1. https://new.vk.com/
3472         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3473         3. https://learning.oreilly.com/
3474         """
3475         for header, cookies in url_handle.headers.items():
3476             if header.lower() != 'set-cookie':
3477                 continue
3478             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3479             cookie_value = re.search(
3480                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3481             if cookie_value:
3482                 value, domain = cookie_value.groups()
3483                 self._set_cookie(domain, cookie, value)
3484                 break
3485
3486     @classmethod
3487     def get_testcases(cls, include_onlymatching=False):
3488         # Do not look in super classes
3489         t = vars(cls).get('_TEST')
3490         if t:
3491             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3492             tests = [t]
3493         else:
3494             tests = vars(cls).get('_TESTS', [])
3495         for t in tests:
3496             if not include_onlymatching and t.get('only_matching', False):
3497                 continue
3498             t['name'] = cls.ie_key()
3499             yield t
3500         if getattr(cls, '__wrapped__', None):
3501             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3502
3503     @classmethod
3504     def get_webpage_testcases(cls):
3505         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3506         for t in tests:
3507             t['name'] = cls.ie_key()
3508             yield t
3509         if getattr(cls, '__wrapped__', None):
3510             yield from cls.__wrapped__.get_webpage_testcases()
3511
3512     @classproperty(cache=True)
3513     def age_limit(cls):
3514         """Get age limit from the testcases"""
3515         return max(traverse_obj(
3516             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3517             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3518
3519     @classproperty(cache=True)
3520     def _RETURN_TYPE(cls):
3521         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3522         tests = tuple(cls.get_testcases(include_onlymatching=False))
3523         if not tests:
3524             return None
3525         elif not any(k.startswith('playlist') for test in tests for k in test):
3526             return 'video'
3527         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3528             return 'playlist'
3529         return 'any'
3530
3531     @classmethod
3532     def is_single_video(cls, url):
3533         """Returns whether the URL is of a single video, None if unknown"""
3534         if cls.suitable(url):
3535             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3536
3537     @classmethod
3538     def is_suitable(cls, age_limit):
3539         """Test whether the extractor is generally suitable for the given age limit"""
3540         return not age_restricted(cls.age_limit, age_limit)
3541
3542     @classmethod
3543     def description(cls, *, markdown=True, search_examples=None):
3544         """Description of the extractor"""
3545         desc = ''
3546         if cls._NETRC_MACHINE:
3547             if markdown:
3548                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3549             else:
3550                 desc += f' [{cls._NETRC_MACHINE}]'
3551         if cls.IE_DESC is False:
3552             desc += ' [HIDDEN]'
3553         elif cls.IE_DESC:
3554             desc += f' {cls.IE_DESC}'
3555         if cls.SEARCH_KEY:
3556             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3557             if search_examples:
3558                 _COUNTS = ('', '5', '10', 'all')
3559                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3560         if not cls.working():
3561             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3562
3563         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3564         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3565         return f'{name}:{desc}' if desc else name
3566
3567     def extract_subtitles(self, *args, **kwargs):
3568         if (self.get_param('writesubtitles', False)
3569                 or self.get_param('listsubtitles')):
3570             return self._get_subtitles(*args, **kwargs)
3571         return {}
3572
3573     def _get_subtitles(self, *args, **kwargs):
3574         raise NotImplementedError('This method must be implemented by subclasses')
3575
3576     class CommentsDisabled(Exception):
3577         """Raise in _get_comments if comments are disabled for the video"""
3578
3579     def extract_comments(self, *args, **kwargs):
3580         if not self.get_param('getcomments'):
3581             return None
3582         generator = self._get_comments(*args, **kwargs)
3583
3584         def extractor():
3585             comments = []
3586             interrupted = True
3587             try:
3588                 while True:
3589                     comments.append(next(generator))
3590             except StopIteration:
3591                 interrupted = False
3592             except KeyboardInterrupt:
3593                 self.to_screen('Interrupted by user')
3594             except self.CommentsDisabled:
3595                 return {'comments': None, 'comment_count': None}
3596             except Exception as e:
3597                 if self.get_param('ignoreerrors') is not True:
3598                     raise
3599                 self._downloader.report_error(e)
3600             comment_count = len(comments)
3601             self.to_screen(f'Extracted {comment_count} comments')
3602             return {
3603                 'comments': comments,
3604                 'comment_count': None if interrupted else comment_count
3605             }
3606         return extractor
3607
3608     def _get_comments(self, *args, **kwargs):
3609         raise NotImplementedError('This method must be implemented by subclasses')
3610
3611     @staticmethod
3612     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3613         """ Merge subtitle items for one language. Items with duplicated URLs/data
3614         will be dropped. """
3615         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3616         ret = list(subtitle_list1)
3617         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3618         return ret
3619
3620     @classmethod
3621     def _merge_subtitles(cls, *dicts, target=None):
3622         """ Merge subtitle dictionaries, language by language. """
3623         if target is None:
3624             target = {}
3625         for d in dicts:
3626             for lang, subs in d.items():
3627                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3628         return target
3629
3630     def extract_automatic_captions(self, *args, **kwargs):
3631         if (self.get_param('writeautomaticsub', False)
3632                 or self.get_param('listsubtitles')):
3633             return self._get_automatic_captions(*args, **kwargs)
3634         return {}
3635
3636     def _get_automatic_captions(self, *args, **kwargs):
3637         raise NotImplementedError('This method must be implemented by subclasses')
3638
3639     @functools.cached_property
3640     def _cookies_passed(self):
3641         """Whether cookies have been passed to YoutubeDL"""
3642         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3643
3644     def mark_watched(self, *args, **kwargs):
3645         if not self.get_param('mark_watched', False):
3646             return
3647         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3648             self._mark_watched(*args, **kwargs)
3649
3650     def _mark_watched(self, *args, **kwargs):
3651         raise NotImplementedError('This method must be implemented by subclasses')
3652
3653     def geo_verification_headers(self):
3654         headers = {}
3655         geo_verification_proxy = self.get_param('geo_verification_proxy')
3656         if geo_verification_proxy:
3657             headers['Ytdl-request-proxy'] = geo_verification_proxy
3658         return headers
3659
3660     @staticmethod
3661     def _generic_id(url):
3662         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3663
3664     def _generic_title(self, url='', webpage='', *, default=None):
3665         return (self._og_search_title(webpage, default=None)
3666                 or self._html_extract_title(webpage, default=None)
3667                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3668                 or default)
3669
3670     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3671         if not duration:
3672             return
3673         chapter_list = [{
3674             'start_time': start_function(chapter),
3675             'title': title_function(chapter),
3676         } for chapter in chapter_list or []]
3677         if strict:
3678             warn = self.report_warning
3679         else:
3680             warn = self.write_debug
3681             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3682
3683         chapters = [{'start_time': 0}]
3684         for idx, chapter in enumerate(chapter_list):
3685             if chapter['start_time'] is None:
3686                 warn(f'Incomplete chapter {idx}')
3687             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3688                 chapters.append(chapter)
3689             elif chapter not in chapters:
3690                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3691                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3692                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3693         return chapters[1:]
3694
3695     def _extract_chapters_from_description(self, description, duration):
3696         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3697         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3698         return self._extract_chapters_helper(
3699             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3700             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3701             duration=duration, strict=False) or self._extract_chapters_helper(
3702             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3703             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3704             duration=duration, strict=False)
3705
3706     @staticmethod
3707     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3708         all_known = all(map(
3709             lambda x: x is not None,
3710             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3711         return (
3712             'private' if is_private
3713             else 'premium_only' if needs_premium
3714             else 'subscriber_only' if needs_subscription
3715             else 'needs_auth' if needs_auth
3716             else 'unlisted' if is_unlisted
3717             else 'public' if all_known
3718             else None)
3719
3720     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3721         '''
3722         @returns            A list of values for the extractor argument given by "key"
3723                             or "default" if no such key is present
3724         @param default      The default value to return when the key is not present (default: [])
3725         @param casesense    When false, the values are converted to lower case
3726         '''
3727         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3728         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3729         if val is None:
3730             return [] if default is NO_DEFAULT else default
3731         return list(val) if casesense else [x.lower() for x in val]
3732
3733     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3734         if not playlist_id or not video_id:
3735             return not video_id
3736
3737         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3738         if no_playlist is not None:
3739             return not no_playlist
3740
3741         video_id = '' if video_id is True else f' {video_id}'
3742         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3743         if self.get_param('noplaylist'):
3744             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3745             return False
3746         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3747         return True
3748
3749     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3750         RetryManager.report_retry(
3751             err, _count or int(fatal), _retries,
3752             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3753             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3754
3755     def RetryManager(self, **kwargs):
3756         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3757
3758     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3759         display_id = traverse_obj(info_dict, 'display_id', 'id')
3760         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3761         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3762             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3763
3764     @classmethod
3765     def extract_from_webpage(cls, ydl, url, webpage):
3766         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3767               else ydl.get_info_extractor(cls.ie_key()))
3768         for info in ie._extract_from_webpage(url, webpage) or []:
3769             # url = None since we do not want to set (webpage/original)_url
3770             ydl.add_default_extra_info(info, ie, None)
3771             yield info
3772
3773     @classmethod
3774     def _extract_from_webpage(cls, url, webpage):
3775         for embed_url in orderedSet(
3776                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3777             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3778
3779     @classmethod
3780     def _extract_embed_urls(cls, url, webpage):
3781         """@returns all the embed urls on the webpage"""
3782         if '_EMBED_URL_RE' not in cls.__dict__:
3783             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3784             for idx, regex in enumerate(cls._EMBED_REGEX):
3785                 assert regex.count('(?P<url>') == 1, \
3786                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3787             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3788
3789         for regex in cls._EMBED_URL_RE:
3790             for mobj in regex.finditer(webpage):
3791                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3792                 if cls._VALID_URL is False or cls.suitable(embed_url):
3793                     yield embed_url
3794
3795     class StopExtraction(Exception):
3796         pass
3797
3798     @classmethod
3799     def _extract_url(cls, webpage):  # TODO: Remove
3800         """Only for compatibility with some older extractors"""
3801         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3802
3803     @classmethod
3804     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3805         if plugin_name:
3806             mro = inspect.getmro(cls)
3807             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3808             cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3809             cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3810             while getattr(super_class, '__wrapped__', None):
3811                 super_class = super_class.__wrapped__
3812             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3813             _PLUGIN_OVERRIDES[super_class].append(cls)
3814
3815         return super().__init_subclass__(**kwargs)
3816
3817
3818 class SearchInfoExtractor(InfoExtractor):
3819     """
3820     Base class for paged search queries extractors.
3821     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3822     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3823     """
3824
3825     _MAX_RESULTS = float('inf')
3826     _RETURN_TYPE = 'playlist'
3827
3828     @classproperty
3829     def _VALID_URL(cls):
3830         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3831
3832     def _real_extract(self, query):
3833         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3834         if prefix == '':
3835             return self._get_n_results(query, 1)
3836         elif prefix == 'all':
3837             return self._get_n_results(query, self._MAX_RESULTS)
3838         else:
3839             n = int(prefix)
3840             if n <= 0:
3841                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3842             elif n > self._MAX_RESULTS:
3843                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3844                 n = self._MAX_RESULTS
3845             return self._get_n_results(query, n)
3846
3847     def _get_n_results(self, query, n):
3848         """Get a specified number of results for a query.
3849         Either this function or _search_results must be overridden by subclasses """
3850         return self.playlist_result(
3851             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3852             query, query)
3853
3854     def _search_results(self, query):
3855         """Returns an iterator of search results"""
3856         raise NotImplementedError('This method must be implemented by subclasses')
3857
3858     @classproperty
3859     def SEARCH_KEY(cls):
3860         return cls._SEARCH_KEY
3861
3862
3863 class UnsupportedURLIE(InfoExtractor):
3864     _VALID_URL = '.*'
3865     _ENABLED = False
3866     IE_DESC = False
3867
3868     def _real_extract(self, url):
3869         raise UnsupportedError(url)
3870
3871
3872 _PLUGIN_OVERRIDES = collections.defaultdict(list)