yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader.f4m import get_base_url, remove_encrypted_media
  27 from ..utils import (
  28     IDENTITY,
  29     JSON_LD_RE,
  30     NO_DEFAULT,
  31     ExtractorError,
  32     FormatSorter,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     HEADRequest,
  36     LenientJSONDecoder,
  37     RegexNotFoundError,
  38     RetryManager,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     classproperty,
  44     clean_html,
  45     deprecation_warning,
  46     determine_ext,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     filter_dict,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     int_or_none,
  56     join_nonempty,
  57     js_to_json,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     sanitize_filename,
  68     sanitize_url,
  69     sanitized_Request,
  70     smuggle_url,
  71     str_or_none,
  72     str_to_int,
  73     strip_or_none,
  74     traverse_obj,
  75     truncate_string,
  76     try_call,
  77     try_get,
  78     unescapeHTML,
  79     unified_strdate,
  80     unified_timestamp,
  81     update_Request,
  82     update_url_query,
  83     url_basename,
  84     url_or_none,
  85     urlhandle_detect_ext,
  86     urljoin,
  87     variadic,
  88     xpath_element,
  89     xpath_text,
  90     xpath_with_ns,
  91 )
  92
  93
  94 class InfoExtractor:
  95     """Information Extractor class.
  96
  97     Information extractors are the classes that, given a URL, extract
  98     information about the video (or videos) the URL refers to. This
  99     information includes the real video URL, the video title, author and
 100     others. The information is stored in a dictionary which is then
 101     passed to the YoutubeDL. The YoutubeDL processes this
 102     information possibly downloading the video to the file system, among
 103     other possible outcomes.
 104
 105     The type field determines the type of the result.
 106     By far the most common value (and the default if _type is missing) is
 107     "video", which indicates a single video.
 108
 109     For a video, the dictionaries must include the following fields:
 110
 111     id:             Video identifier.
 112     title:          Video title, unescaped. Set to an empty string if video has
 113                     no title as opposed to "None" which signifies that the
 114                     extractor failed to obtain a title
 115
 116     Additionally, it must contain either a formats entry or a url one:
 117
 118     formats:        A list of dictionaries for each format available, ordered
 119                     from worst to best quality.
 120
 121                     Potential fields:
 122                     * url        The mandatory URL representing the media:
 123                                    for plain file media - HTTP URL of this file,
 124                                    for RTMP - RTMP URL,
 125                                    for HLS - URL of the M3U8 media playlist,
 126                                    for HDS - URL of the F4M manifest,
 127                                    for DASH
 128                                      - HTTP URL to plain file media (in case of
 129                                        unfragmented media)
 130                                      - URL of the MPD manifest or base URL
 131                                        representing the media if MPD manifest
 132                                        is parsed from a string (in case of
 133                                        fragmented media)
 134                                    for MSS - URL of the ISM manifest.
 135                     * request_data  Data to send in POST request to the URL
 136                     * manifest_url
 137                                  The URL of the manifest file in case of
 138                                  fragmented media:
 139                                    for HLS - URL of the M3U8 master playlist,
 140                                    for HDS - URL of the F4M manifest,
 141                                    for DASH - URL of the MPD manifest,
 142                                    for MSS - URL of the ISM manifest.
 143                     * manifest_stream_number  (For internal use only)
 144                                  The index of the stream in the manifest file
 145                     * ext        Will be calculated from URL if missing
 146                     * format     A human-readable description of the format
 147                                  ("mp4 container with h264/opus").
 148                                  Calculated from the format_id, width, height.
 149                                  and format_note fields if missing.
 150                     * format_id  A short description of the format
 151                                  ("mp4_h264_opus" or "19").
 152                                 Technically optional, but strongly recommended.
 153                     * format_note Additional info about the format
 154                                  ("3D" or "DASH video")
 155                     * width      Width of the video, if known
 156                     * height     Height of the video, if known
 157                     * aspect_ratio  Aspect ratio of the video, if known
 158                                  Automatically calculated from width and height
 159                     * resolution Textual description of width and height
 160                                  Automatically calculated from width and height
 161                     * dynamic_range The dynamic range of the video. One of:
 162                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 163                     * tbr        Average bitrate of audio and video in KBit/s
 164                     * abr        Average audio bitrate in KBit/s
 165                     * acodec     Name of the audio codec in use
 166                     * asr        Audio sampling rate in Hertz
 167                     * audio_channels  Number of audio channels
 168                     * vbr        Average video bitrate in KBit/s
 169                     * fps        Frame rate
 170                     * vcodec     Name of the video codec in use
 171                     * container  Name of the container format
 172                     * filesize   The number of bytes, if known in advance
 173                     * filesize_approx  An estimate for the number of bytes
 174                     * player_url SWF Player URL (used for rtmpdump).
 175                     * protocol   The protocol that will be used for the actual
 176                                  download, lower-case. One of "http", "https" or
 177                                  one of the protocols defined in downloader.PROTOCOL_MAP
 178                     * fragment_base_url
 179                                  Base URL for fragments. Each fragment's path
 180                                  value (if present) will be relative to
 181                                  this URL.
 182                     * fragments  A list of fragments of a fragmented media.
 183                                  Each fragment entry must contain either an url
 184                                  or a path. If an url is present it should be
 185                                  considered by a client. Otherwise both path and
 186                                  fragment_base_url must be present. Here is
 187                                  the list of all potential fields:
 188                                  * "url" - fragment's URL
 189                                  * "path" - fragment's path relative to
 190                                             fragment_base_url
 191                                  * "duration" (optional, int or float)
 192                                  * "filesize" (optional, int)
 193                     * is_from_start  Is a live format that can be downloaded
 194                                 from the start. Boolean
 195                     * preference Order number of this format. If this field is
 196                                  present and not None, the formats get sorted
 197                                  by this field, regardless of all other values.
 198                                  -1 for default (order by other properties),
 199                                  -2 or smaller for less than default.
 200                                  < -1000 to hide the format (if there is
 201                                     another one which is strictly better)
 202                     * language   Language code, e.g. "de" or "en-US".
 203                     * language_preference  Is this in the language mentioned in
 204                                  the URL?
 205                                  10 if it's what the URL is about,
 206                                  -1 for default (don't know),
 207                                  -10 otherwise, other values reserved for now.
 208                     * quality    Order number of the video quality of this
 209                                  format, irrespective of the file format.
 210                                  -1 for default (order by other properties),
 211                                  -2 or smaller for less than default.
 212                     * source_preference  Order number for this video source
 213                                   (quality takes higher priority)
 214                                  -1 for default (order by other properties),
 215                                  -2 or smaller for less than default.
 216                     * http_headers  A dictionary of additional HTTP headers
 217                                  to add to the request.
 218                     * stretched_ratio  If given and not 1, indicates that the
 219                                  video's pixels are not square.
 220                                  width : height ratio as float.
 221                     * no_resume  The server does not support resuming the
 222                                  (HTTP or RTMP) download. Boolean.
 223                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 224                     * extra_param_to_segment_url  A query string to append to each
 225                                  fragment's URL, or to update each existing query string
 226                                  with. Only applied by the native HLS/DASH downloaders.
 227                     * hls_aes    A dictionary of HLS AES-128 decryption information
 228                                  used by the native HLS downloader to override the
 229                                  values in the media playlist when an '#EXT-X-KEY' tag
 230                                  is present in the playlist:
 231                                  * uri  The URI from which the key will be downloaded
 232                                  * key  The key (as hex) used to decrypt fragments.
 233                                         If `key` is given, any key URI will be ignored
 234                                  * iv   The IV (as hex) used to decrypt fragments
 235                     * downloader_options  A dictionary of downloader options
 236                                  (For internal use only)
 237                                  * http_chunk_size Chunk size for HTTP downloads
 238                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 239                     RTMP formats can also have the additional fields: page_url,
 240                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 241                     rtmp_protocol, rtmp_real_time
 242
 243     url:            Final video URL.
 244     ext:            Video filename extension.
 245     format:         The video format, defaults to ext (used for --get-format)
 246     player_url:     SWF Player URL (used for rtmpdump).
 247
 248     The following fields are optional:
 249
 250     direct:         True if a direct video file was given (must only be set by GenericIE)
 251     alt_title:      A secondary title of the video.
 252     display_id      An alternative identifier for the video, not necessarily
 253                     unique, but available before title. Typically, id is
 254                     something like "4234987", title "Dancing naked mole rats",
 255                     and display_id "dancing-naked-mole-rats"
 256     thumbnails:     A list of dictionaries, with the following entries:
 257                         * "id" (optional, string) - Thumbnail format ID
 258                         * "url"
 259                         * "preference" (optional, int) - quality of the image
 260                         * "width" (optional, int)
 261                         * "height" (optional, int)
 262                         * "resolution" (optional, string "{width}x{height}",
 263                                         deprecated)
 264                         * "filesize" (optional, int)
 265                         * "http_headers" (dict) - HTTP headers for the request
 266     thumbnail:      Full URL to a video thumbnail image.
 267     description:    Full video description.
 268     uploader:       Full name of the video uploader.
 269     license:        License name the video is licensed under.
 270     creator:        The creator of the video.
 271     timestamp:      UNIX timestamp of the moment the video was uploaded
 272     upload_date:    Video upload date in UTC (YYYYMMDD).
 273                     If not explicitly set, calculated from timestamp
 274     release_timestamp: UNIX timestamp of the moment the video was released.
 275                     If it is not clear whether to use timestamp or this, use the former
 276     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 277                     If not explicitly set, calculated from release_timestamp
 278     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 279     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 280                     If not explicitly set, calculated from modified_timestamp
 281     uploader_id:    Nickname or id of the video uploader.
 282     uploader_url:   Full URL to a personal webpage of the video uploader.
 283     channel:        Full name of the channel the video is uploaded on.
 284                     Note that channel fields may or may not repeat uploader
 285                     fields. This depends on a particular extractor.
 286     channel_id:     Id of the channel.
 287     channel_url:    Full URL to a channel webpage.
 288     channel_follower_count: Number of followers of the channel.
 289     location:       Physical location where the video was filmed.
 290     subtitles:      The available subtitles as a dictionary in the format
 291                     {tag: subformats}. "tag" is usually a language code, and
 292                     "subformats" is a list sorted from lower to higher
 293                     preference, each element is a dictionary with the "ext"
 294                     entry and one of:
 295                         * "data": The subtitles file contents
 296                         * "url": A URL pointing to the subtitles file
 297                     It can optionally also have:
 298                         * "name": Name or description of the subtitles
 299                         * "http_headers": A dictionary of additional HTTP headers
 300                                   to add to the request.
 301                     "ext" will be calculated from URL if missing
 302     automatic_captions: Like 'subtitles'; contains automatically generated
 303                     captions instead of normal subtitles
 304     duration:       Length of the video in seconds, as an integer or float.
 305     view_count:     How many users have watched the video on the platform.
 306     concurrent_view_count: How many users are currently watching the video on the platform.
 307     like_count:     Number of positive ratings of the video
 308     dislike_count:  Number of negative ratings of the video
 309     repost_count:   Number of reposts of the video
 310     average_rating: Average rating give by users, the scale used depends on the webpage
 311     comment_count:  Number of comments on the video
 312     comments:       A list of comments, each with one or more of the following
 313                     properties (all but one of text or html optional):
 314                         * "author" - human-readable name of the comment author
 315                         * "author_id" - user ID of the comment author
 316                         * "author_thumbnail" - The thumbnail of the comment author
 317                         * "id" - Comment ID
 318                         * "html" - Comment as HTML
 319                         * "text" - Plain text of the comment
 320                         * "timestamp" - UNIX timestamp of comment
 321                         * "parent" - ID of the comment this one is replying to.
 322                                      Set to "root" to indicate that this is a
 323                                      comment to the original video.
 324                         * "like_count" - Number of positive ratings of the comment
 325                         * "dislike_count" - Number of negative ratings of the comment
 326                         * "is_favorited" - Whether the comment is marked as
 327                                            favorite by the video uploader
 328                         * "author_is_uploader" - Whether the comment is made by
 329                                                  the video uploader
 330     age_limit:      Age restriction for the video, as an integer (years)
 331     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 332                     should allow to get the same result again. (It will be set
 333                     by YoutubeDL if it's missing)
 334     categories:     A list of categories that the video falls in, for example
 335                     ["Sports", "Berlin"]
 336     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 337     cast:           A list of the video cast
 338     is_live:        True, False, or None (=unknown). Whether this video is a
 339                     live stream that goes on instead of a fixed-length video.
 340     was_live:       True, False, or None (=unknown). Whether this video was
 341                     originally a live stream.
 342     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 343                     or 'post_live' (was live, but VOD is not yet processed)
 344                     If absent, automatically set from is_live, was_live
 345     start_time:     Time in seconds where the reproduction should start, as
 346                     specified in the URL.
 347     end_time:       Time in seconds where the reproduction should end, as
 348                     specified in the URL.
 349     chapters:       A list of dictionaries, with the following entries:
 350                         * "start_time" - The start time of the chapter in seconds
 351                         * "end_time" - The end time of the chapter in seconds
 352                         * "title" (optional, string)
 353     heatmap:        A list of dictionaries, with the following entries:
 354                         * "start_time" - The start time of the data point in seconds
 355                         * "end_time" - The end time of the data point in seconds
 356                         * "value" - The normalized value of the data point (float between 0 and 1)
 357     playable_in_embed: Whether this video is allowed to play in embedded
 358                     players on other sites. Can be True (=always allowed),
 359                     False (=never allowed), None (=unknown), or a string
 360                     specifying the criteria for embedability; e.g. 'whitelist'
 361     availability:   Under what condition the video is available. One of
 362                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 363                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 364                     to set it
 365     _old_archive_ids: A list of old archive ids needed for backward compatibility
 366     _format_sort_fields: A list of fields to use for sorting formats
 367     __post_extractor: A function to be called just before the metadata is
 368                     written to either disk, logger or console. The function
 369                     must return a dict which will be added to the info_dict.
 370                     This is usefull for additional information that is
 371                     time-consuming to extract. Note that the fields thus
 372                     extracted will not be available to output template and
 373                     match_filter. So, only "comments" and "comment_count" are
 374                     currently allowed to be extracted via this method.
 375
 376     The following fields should only be used when the video belongs to some logical
 377     chapter or section:
 378
 379     chapter:        Name or title of the chapter the video belongs to.
 380     chapter_number: Number of the chapter the video belongs to, as an integer.
 381     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 382
 383     The following fields should only be used when the video is an episode of some
 384     series, programme or podcast:
 385
 386     series:         Title of the series or programme the video episode belongs to.
 387     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 388     season:         Title of the season the video episode belongs to.
 389     season_number:  Number of the season the video episode belongs to, as an integer.
 390     season_id:      Id of the season the video episode belongs to, as a unicode string.
 391     episode:        Title of the video episode. Unlike mandatory video title field,
 392                     this field should denote the exact title of the video episode
 393                     without any kind of decoration.
 394     episode_number: Number of the video episode within a season, as an integer.
 395     episode_id:     Id of the video episode, as a unicode string.
 396
 397     The following fields should only be used when the media is a track or a part of
 398     a music album:
 399
 400     track:          Title of the track.
 401     track_number:   Number of the track within an album or a disc, as an integer.
 402     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 403                     as a unicode string.
 404     artist:         Artist(s) of the track.
 405     genre:          Genre(s) of the track.
 406     album:          Title of the album the track belongs to.
 407     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 408     album_artist:   List of all artists appeared on the album (e.g.
 409                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 410                     and compilations).
 411     disc_number:    Number of the disc or other physical medium the track belongs to,
 412                     as an integer.
 413     release_year:   Year (YYYY) when the album was released.
 414     composer:       Composer of the piece
 415
 416     The following fields should only be set for clips that should be cut from the original video:
 417
 418     section_start:  Start time of the section in seconds
 419     section_end:    End time of the section in seconds
 420
 421     The following fields should only be set for storyboards:
 422     rows:           Number of rows in each storyboard fragment, as an integer
 423     columns:        Number of columns in each storyboard fragment, as an integer
 424
 425     Unless mentioned otherwise, the fields should be Unicode strings.
 426
 427     Unless mentioned otherwise, None is equivalent to absence of information.
 428
 429
 430     _type "playlist" indicates multiple videos.
 431     There must be a key "entries", which is a list, an iterable, or a PagedList
 432     object, each element of which is a valid dictionary by this specification.
 433
 434     Additionally, playlists can have "id", "title", and any other relevant
 435     attributes with the same semantics as videos (see above).
 436
 437     It can also have the following optional fields:
 438
 439     playlist_count: The total number of videos in a playlist. If not given,
 440                     YoutubeDL tries to calculate it from "entries"
 441
 442
 443     _type "multi_video" indicates that there are multiple videos that
 444     form a single show, for examples multiple acts of an opera or TV episode.
 445     It must have an entries key like a playlist and contain all the keys
 446     required for a video at the same time.
 447
 448
 449     _type "url" indicates that the video must be extracted from another
 450     location, possibly by a different extractor. Its only required key is:
 451     "url" - the next URL to extract.
 452     The key "ie_key" can be set to the class name (minus the trailing "IE",
 453     e.g. "Youtube") if the extractor class is known in advance.
 454     Additionally, the dictionary may have any properties of the resolved entity
 455     known in advance, for example "title" if the title of the referred video is
 456     known ahead of time.
 457
 458
 459     _type "url_transparent" entities have the same specification as "url", but
 460     indicate that the given additional information is more precise than the one
 461     associated with the resolved URL.
 462     This is useful when a site employs a video service that hosts the video and
 463     its technical metadata, but that video service does not embed a useful
 464     title, description etc.
 465
 466
 467     Subclasses of this should also be added to the list of extractors and
 468     should define a _VALID_URL regexp and, re-define the _real_extract() and
 469     (optionally) _real_initialize() methods.
 470
 471     Subclasses may also override suitable() if necessary, but ensure the function
 472     signature is preserved and that this function imports everything it needs
 473     (except other extractors), so that lazy_extractors works correctly.
 474
 475     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 476     the HTML of Generic webpages. It may also override _extract_embed_urls
 477     or _extract_from_webpage as necessary. While these are normally classmethods,
 478     _extract_from_webpage is allowed to be an instance method.
 479
 480     _extract_from_webpage may raise self.StopExtraction() to stop further
 481     processing of the webpage and obtain exclusive rights to it. This is useful
 482     when the extractor cannot reliably be matched using just the URL,
 483     e.g. invidious/peertube instances
 484
 485     Embed-only extractors can be defined by setting _VALID_URL = False.
 486
 487     To support username + password (or netrc) login, the extractor must define a
 488     _NETRC_MACHINE and re-define _perform_login(username, password) and
 489     (optionally) _initialize_pre_login() methods. The _perform_login method will
 490     be called between _initialize_pre_login and _real_initialize if credentials
 491     are passed by the user. In cases where it is necessary to have the login
 492     process as part of the extraction rather than initialization, _perform_login
 493     can be left undefined.
 494
 495     _GEO_BYPASS attribute may be set to False in order to disable
 496     geo restriction bypass mechanisms for a particular extractor.
 497     Though it won't disable explicit geo restriction bypass based on
 498     country code provided with geo_bypass_country.
 499
 500     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 501     countries for this extractor. One of these countries will be used by
 502     geo restriction bypass mechanism right away in order to bypass
 503     geo restriction, of course, if the mechanism is not disabled.
 504
 505     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 506     IP blocks in CIDR notation for this extractor. One of these IP blocks
 507     will be used by geo restriction bypass mechanism similarly
 508     to _GEO_COUNTRIES.
 509
 510     The _ENABLED attribute should be set to False for IEs that
 511     are disabled by default and must be explicitly enabled.
 512
 513     The _WORKING attribute should be set to False for broken IEs
 514     in order to warn the users and skip the tests.
 515     """
 516
 517     _ready = False
 518     _downloader = None
 519     _x_forwarded_for_ip = None
 520     _GEO_BYPASS = True
 521     _GEO_COUNTRIES = None
 522     _GEO_IP_BLOCKS = None
 523     _WORKING = True
 524     _ENABLED = True
 525     _NETRC_MACHINE = None
 526     IE_DESC = None
 527     SEARCH_KEY = None
 528     _VALID_URL = None
 529     _EMBED_REGEX = []
 530
 531     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 532         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 533         return {
 534             None: '',
 535             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 536             'password': f'Use {password_hint}',
 537             'cookies': (
 538                 'Use --cookies-from-browser or --cookies for the authentication. '
 539                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 540         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 541
 542     def __init__(self, downloader=None):
 543         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 544         If a downloader is not passed during initialization,
 545         it must be set using "set_downloader()" before "extract()" is called"""
 546         self._ready = False
 547         self._x_forwarded_for_ip = None
 548         self._printed_messages = set()
 549         self.set_downloader(downloader)
 550
 551     @classmethod
 552     def _match_valid_url(cls, url):
 553         if cls._VALID_URL is False:
 554             return None
 555         # This does not use has/getattr intentionally - we want to know whether
 556         # we have cached the regexp for *this* class, whereas getattr would also
 557         # match the superclass
 558         if '_VALID_URL_RE' not in cls.__dict__:
 559             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 560         return cls._VALID_URL_RE.match(url)
 561
 562     @classmethod
 563     def suitable(cls, url):
 564         """Receives a URL and returns True if suitable for this IE."""
 565         # This function must import everything it needs (except other extractors),
 566         # so that lazy_extractors works correctly
 567         return cls._match_valid_url(url) is not None
 568
 569     @classmethod
 570     def _match_id(cls, url):
 571         return cls._match_valid_url(url).group('id')
 572
 573     @classmethod
 574     def get_temp_id(cls, url):
 575         try:
 576             return cls._match_id(url)
 577         except (IndexError, AttributeError):
 578             return None
 579
 580     @classmethod
 581     def working(cls):
 582         """Getter method for _WORKING."""
 583         return cls._WORKING
 584
 585     @classmethod
 586     def supports_login(cls):
 587         return bool(cls._NETRC_MACHINE)
 588
 589     def initialize(self):
 590         """Initializes an instance (authentication, etc)."""
 591         self._printed_messages = set()
 592         self._initialize_geo_bypass({
 593             'countries': self._GEO_COUNTRIES,
 594             'ip_blocks': self._GEO_IP_BLOCKS,
 595         })
 596         if not self._ready:
 597             self._initialize_pre_login()
 598             if self.supports_login():
 599                 username, password = self._get_login_info()
 600                 if username:
 601                     self._perform_login(username, password)
 602             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 603                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 604             self._real_initialize()
 605             self._ready = True
 606
 607     def _initialize_geo_bypass(self, geo_bypass_context):
 608         """
 609         Initialize geo restriction bypass mechanism.
 610
 611         This method is used to initialize geo bypass mechanism based on faking
 612         X-Forwarded-For HTTP header. A random country from provided country list
 613         is selected and a random IP belonging to this country is generated. This
 614         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 615         HTTP requests.
 616
 617         This method will be used for initial geo bypass mechanism initialization
 618         during the instance initialization with _GEO_COUNTRIES and
 619         _GEO_IP_BLOCKS.
 620
 621         You may also manually call it from extractor's code if geo bypass
 622         information is not available beforehand (e.g. obtained during
 623         extraction) or due to some other reason. In this case you should pass
 624         this information in geo bypass context passed as first argument. It may
 625         contain following fields:
 626
 627         countries:  List of geo unrestricted countries (similar
 628                     to _GEO_COUNTRIES)
 629         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 630                     (similar to _GEO_IP_BLOCKS)
 631
 632         """
 633         if not self._x_forwarded_for_ip:
 634
 635             # Geo bypass mechanism is explicitly disabled by user
 636             if not self.get_param('geo_bypass', True):
 637                 return
 638
 639             if not geo_bypass_context:
 640                 geo_bypass_context = {}
 641
 642             # Backward compatibility: previously _initialize_geo_bypass
 643             # expected a list of countries, some 3rd party code may still use
 644             # it this way
 645             if isinstance(geo_bypass_context, (list, tuple)):
 646                 geo_bypass_context = {
 647                     'countries': geo_bypass_context,
 648                 }
 649
 650             # The whole point of geo bypass mechanism is to fake IP
 651             # as X-Forwarded-For HTTP header based on some IP block or
 652             # country code.
 653
 654             # Path 1: bypassing based on IP block in CIDR notation
 655
 656             # Explicit IP block specified by user, use it right away
 657             # regardless of whether extractor is geo bypassable or not
 658             ip_block = self.get_param('geo_bypass_ip_block', None)
 659
 660             # Otherwise use random IP block from geo bypass context but only
 661             # if extractor is known as geo bypassable
 662             if not ip_block:
 663                 ip_blocks = geo_bypass_context.get('ip_blocks')
 664                 if self._GEO_BYPASS and ip_blocks:
 665                     ip_block = random.choice(ip_blocks)
 666
 667             if ip_block:
 668                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 669                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 670                 return
 671
 672             # Path 2: bypassing based on country code
 673
 674             # Explicit country code specified by user, use it right away
 675             # regardless of whether extractor is geo bypassable or not
 676             country = self.get_param('geo_bypass_country', None)
 677
 678             # Otherwise use random country code from geo bypass context but
 679             # only if extractor is known as geo bypassable
 680             if not country:
 681                 countries = geo_bypass_context.get('countries')
 682                 if self._GEO_BYPASS and countries:
 683                     country = random.choice(countries)
 684
 685             if country:
 686                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 687                 self._downloader.write_debug(
 688                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 689
 690     def extract(self, url):
 691         """Extracts URL information and returns it in list of dicts."""
 692         try:
 693             for _ in range(2):
 694                 try:
 695                     self.initialize()
 696                     self.to_screen('Extracting URL: %s' % (
 697                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 698                     ie_result = self._real_extract(url)
 699                     if ie_result is None:
 700                         return None
 701                     if self._x_forwarded_for_ip:
 702                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 703                     subtitles = ie_result.get('subtitles') or {}
 704                     if 'no-live-chat' in self.get_param('compat_opts'):
 705                         for lang in ('live_chat', 'comments', 'danmaku'):
 706                             subtitles.pop(lang, None)
 707                     return ie_result
 708                 except GeoRestrictedError as e:
 709                     if self.__maybe_fake_ip_and_retry(e.countries):
 710                         continue
 711                     raise
 712         except UnsupportedError:
 713             raise
 714         except ExtractorError as e:
 715             e.video_id = e.video_id or self.get_temp_id(url),
 716             e.ie = e.ie or self.IE_NAME,
 717             e.traceback = e.traceback or sys.exc_info()[2]
 718             raise
 719         except http.client.IncompleteRead as e:
 720             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 721         except (KeyError, StopIteration) as e:
 722             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 723
 724     def __maybe_fake_ip_and_retry(self, countries):
 725         if (not self.get_param('geo_bypass_country', None)
 726                 and self._GEO_BYPASS
 727                 and self.get_param('geo_bypass', True)
 728                 and not self._x_forwarded_for_ip
 729                 and countries):
 730             country_code = random.choice(countries)
 731             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 732             if self._x_forwarded_for_ip:
 733                 self.report_warning(
 734                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 735                     % (self._x_forwarded_for_ip, country_code.upper()))
 736                 return True
 737         return False
 738
 739     def set_downloader(self, downloader):
 740         """Sets a YoutubeDL instance as the downloader for this IE."""
 741         self._downloader = downloader
 742
 743     @property
 744     def cache(self):
 745         return self._downloader.cache
 746
 747     @property
 748     def cookiejar(self):
 749         return self._downloader.cookiejar
 750
 751     def _initialize_pre_login(self):
 752         """ Initialization before login. Redefine in subclasses."""
 753         pass
 754
 755     def _perform_login(self, username, password):
 756         """ Login with username and password. Redefine in subclasses."""
 757         pass
 758
 759     def _real_initialize(self):
 760         """Real initialization process. Redefine in subclasses."""
 761         pass
 762
 763     def _real_extract(self, url):
 764         """Real extraction process. Redefine in subclasses."""
 765         raise NotImplementedError('This method must be implemented by subclasses')
 766
 767     @classmethod
 768     def ie_key(cls):
 769         """A string for getting the InfoExtractor with get_info_extractor"""
 770         return cls.__name__[:-2]
 771
 772     @classproperty
 773     def IE_NAME(cls):
 774         return cls.__name__[:-2]
 775
 776     @staticmethod
 777     def __can_accept_status_code(err, expected_status):
 778         assert isinstance(err, urllib.error.HTTPError)
 779         if expected_status is None:
 780             return False
 781         elif callable(expected_status):
 782             return expected_status(err.code) is True
 783         else:
 784             return err.code in variadic(expected_status)
 785
 786     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 787         if isinstance(url_or_request, urllib.request.Request):
 788             return update_Request(url_or_request, data=data, headers=headers, query=query)
 789         if query:
 790             url_or_request = update_url_query(url_or_request, query)
 791         return sanitized_Request(url_or_request, data, headers or {})
 792
 793     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 794         """
 795         Return the response handle.
 796
 797         See _download_webpage docstring for arguments specification.
 798         """
 799         if not self._downloader._first_webpage_request:
 800             sleep_interval = self.get_param('sleep_interval_requests') or 0
 801             if sleep_interval > 0:
 802                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 803                 time.sleep(sleep_interval)
 804         else:
 805             self._downloader._first_webpage_request = False
 806
 807         if note is None:
 808             self.report_download_webpage(video_id)
 809         elif note is not False:
 810             if video_id is None:
 811                 self.to_screen(str(note))
 812             else:
 813                 self.to_screen(f'{video_id}: {note}')
 814
 815         # Some sites check X-Forwarded-For HTTP header in order to figure out
 816         # the origin of the client behind proxy. This allows bypassing geo
 817         # restriction by faking this header's value to IP that belongs to some
 818         # geo unrestricted country. We will do so once we encounter any
 819         # geo restriction error.
 820         if self._x_forwarded_for_ip:
 821             headers = (headers or {}).copy()
 822             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 823
 824         try:
 825             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 826         except network_exceptions as err:
 827             if isinstance(err, urllib.error.HTTPError):
 828                 if self.__can_accept_status_code(err, expected_status):
 829                     # Retain reference to error to prevent file object from
 830                     # being closed before it can be read. Works around the
 831                     # effects of <https://bugs.python.org/issue15002>
 832                     # introduced in Python 3.4.1.
 833                     err.fp._error = err
 834                     return err.fp
 835
 836             if errnote is False:
 837                 return False
 838             if errnote is None:
 839                 errnote = 'Unable to download webpage'
 840
 841             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 842             if fatal:
 843                 raise ExtractorError(errmsg, cause=err)
 844             else:
 845                 self.report_warning(errmsg)
 846                 return False
 847
 848     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 849                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 850         """
 851         Return a tuple (page content as string, URL handle).
 852
 853         Arguments:
 854         url_or_request -- plain text URL as a string or
 855             a urllib.request.Request object
 856         video_id -- Video/playlist/item identifier (string)
 857
 858         Keyword arguments:
 859         note -- note printed before downloading (string)
 860         errnote -- note printed in case of an error (string)
 861         fatal -- flag denoting whether error should be considered fatal,
 862             i.e. whether it should cause ExtractionError to be raised,
 863             otherwise a warning will be reported and extraction continued
 864         encoding -- encoding for a page content decoding, guessed automatically
 865             when not explicitly specified
 866         data -- POST data (bytes)
 867         headers -- HTTP headers (dict)
 868         query -- URL query (dict)
 869         expected_status -- allows to accept failed HTTP requests (non 2xx
 870             status code) by explicitly specifying a set of accepted status
 871             codes. Can be any of the following entities:
 872                 - an integer type specifying an exact failed status code to
 873                   accept
 874                 - a list or a tuple of integer types specifying a list of
 875                   failed status codes to accept
 876                 - a callable accepting an actual failed status code and
 877                   returning True if it should be accepted
 878             Note that this argument does not affect success status codes (2xx)
 879             which are always accepted.
 880         """
 881
 882         # Strip hashes from the URL (#1038)
 883         if isinstance(url_or_request, str):
 884             url_or_request = url_or_request.partition('#')[0]
 885
 886         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 887         if urlh is False:
 888             assert not fatal
 889             return False
 890         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 891         return (content, urlh)
 892
 893     @staticmethod
 894     def _guess_encoding_from_content(content_type, webpage_bytes):
 895         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 896         if m:
 897             encoding = m.group(1)
 898         else:
 899             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 900                           webpage_bytes[:1024])
 901             if m:
 902                 encoding = m.group(1).decode('ascii')
 903             elif webpage_bytes.startswith(b'\xff\xfe'):
 904                 encoding = 'utf-16'
 905             else:
 906                 encoding = 'utf-8'
 907
 908         return encoding
 909
 910     def __check_blocked(self, content):
 911         first_block = content[:512]
 912         if ('<title>Access to this site is blocked</title>' in content
 913                 and 'Websense' in first_block):
 914             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 915             blocked_iframe = self._html_search_regex(
 916                 r'<iframe src="([^"]+)"', content,
 917                 'Websense information URL', default=None)
 918             if blocked_iframe:
 919                 msg += ' Visit %s for more details' % blocked_iframe
 920             raise ExtractorError(msg, expected=True)
 921         if '<title>The URL you requested has been blocked</title>' in first_block:
 922             msg = (
 923                 'Access to this webpage has been blocked by Indian censorship. '
 924                 'Use a VPN or proxy server (with --proxy) to route around it.')
 925             block_msg = self._html_search_regex(
 926                 r'</h1><p>(.*?)</p>',
 927                 content, 'block message', default=None)
 928             if block_msg:
 929                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 930             raise ExtractorError(msg, expected=True)
 931         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 932                 and 'blocklist.rkn.gov.ru' in content):
 933             raise ExtractorError(
 934                 'Access to this webpage has been blocked by decision of the Russian government. '
 935                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 936                 expected=True)
 937
 938     def _request_dump_filename(self, url, video_id):
 939         basen = f'{video_id}_{url}'
 940         trim_length = self.get_param('trim_file_name') or 240
 941         if len(basen) > trim_length:
 942             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 943             basen = basen[:trim_length - len(h)] + h
 944         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 945         # Working around MAX_PATH limitation on Windows (see
 946         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 947         if compat_os_name == 'nt':
 948             absfilepath = os.path.abspath(filename)
 949             if len(absfilepath) > 259:
 950                 filename = fR'\\?\{absfilepath}'
 951         return filename
 952
 953     def __decode_webpage(self, webpage_bytes, encoding, headers):
 954         if not encoding:
 955             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 956         try:
 957             return webpage_bytes.decode(encoding, 'replace')
 958         except LookupError:
 959             return webpage_bytes.decode('utf-8', 'replace')
 960
 961     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 962         webpage_bytes = urlh.read()
 963         if prefix is not None:
 964             webpage_bytes = prefix + webpage_bytes
 965         if self.get_param('dump_intermediate_pages', False):
 966             self.to_screen('Dumping request to ' + urlh.geturl())
 967             dump = base64.b64encode(webpage_bytes).decode('ascii')
 968             self._downloader.to_screen(dump)
 969         if self.get_param('write_pages'):
 970             filename = self._request_dump_filename(urlh.geturl(), video_id)
 971             self.to_screen(f'Saving request to {filename}')
 972             with open(filename, 'wb') as outf:
 973                 outf.write(webpage_bytes)
 974
 975         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 976         self.__check_blocked(content)
 977
 978         return content
 979
 980     def __print_error(self, errnote, fatal, video_id, err):
 981         if fatal:
 982             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 983         elif errnote:
 984             self.report_warning(f'{video_id}: {errnote}: {err}')
 985
 986     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 987         if transform_source:
 988             xml_string = transform_source(xml_string)
 989         try:
 990             return compat_etree_fromstring(xml_string.encode('utf-8'))
 991         except xml.etree.ElementTree.ParseError as ve:
 992             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 993
 994     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 995         try:
 996             return json.loads(
 997                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 998         except ValueError as ve:
 999             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1000
1001     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1002         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1003
1004     def __create_download_methods(name, parser, note, errnote, return_value):
1005
1006         def parse(ie, content, *args, errnote=errnote, **kwargs):
1007             if parser is None:
1008                 return content
1009             if errnote is False:
1010                 kwargs['errnote'] = errnote
1011             # parser is fetched by name so subclasses can override it
1012             return getattr(ie, parser)(content, *args, **kwargs)
1013
1014         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1015                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1016             res = self._download_webpage_handle(
1017                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1018                 data=data, headers=headers, query=query, expected_status=expected_status)
1019             if res is False:
1020                 return res
1021             content, urlh = res
1022             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1023
1024         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1025                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1026             if self.get_param('load_pages'):
1027                 url_or_request = self._create_request(url_or_request, data, headers, query)
1028                 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1029                 self.to_screen(f'Loading request from {filename}')
1030                 try:
1031                     with open(filename, 'rb') as dumpf:
1032                         webpage_bytes = dumpf.read()
1033                 except OSError as e:
1034                     self.report_warning(f'Unable to load request from disk: {e}')
1035                 else:
1036                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1037                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1038             kwargs = {
1039                 'note': note,
1040                 'errnote': errnote,
1041                 'transform_source': transform_source,
1042                 'fatal': fatal,
1043                 'encoding': encoding,
1044                 'data': data,
1045                 'headers': headers,
1046                 'query': query,
1047                 'expected_status': expected_status,
1048             }
1049             if parser is None:
1050                 kwargs.pop('transform_source')
1051             # The method is fetched by name so subclasses can override _download_..._handle
1052             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1053             return res if res is False else res[0]
1054
1055         def impersonate(func, name, return_value):
1056             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1057             func.__doc__ = f'''
1058                 @param transform_source     Apply this transformation before parsing
1059                 @returns                    {return_value}
1060
1061                 See _download_webpage_handle docstring for other arguments specification
1062             '''
1063
1064         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1065         impersonate(download_content, f'_download_{name}', f'{return_value}')
1066         return download_handle, download_content
1067
1068     _download_xml_handle, _download_xml = __create_download_methods(
1069         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1070     _download_json_handle, _download_json = __create_download_methods(
1071         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1072     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1073         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1074     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1075
1076     def _download_webpage(
1077             self, url_or_request, video_id, note=None, errnote=None,
1078             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1079         """
1080         Return the data of the page as a string.
1081
1082         Keyword arguments:
1083         tries -- number of tries
1084         timeout -- sleep interval between tries
1085
1086         See _download_webpage_handle docstring for other arguments specification.
1087         """
1088
1089         R''' # NB: These are unused; should they be deprecated?
1090         if tries != 1:
1091             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1092         if timeout is NO_DEFAULT:
1093             timeout = 5
1094         else:
1095             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1096         '''
1097
1098         try_count = 0
1099         while True:
1100             try:
1101                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1102             except http.client.IncompleteRead as e:
1103                 try_count += 1
1104                 if try_count >= tries:
1105                     raise e
1106                 self._sleep(timeout, video_id)
1107
1108     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1109         idstr = format_field(video_id, None, '%s: ')
1110         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1111         if only_once:
1112             if f'WARNING: {msg}' in self._printed_messages:
1113                 return
1114             self._printed_messages.add(f'WARNING: {msg}')
1115         self._downloader.report_warning(msg, *args, **kwargs)
1116
1117     def to_screen(self, msg, *args, **kwargs):
1118         """Print msg to screen, prefixing it with '[ie_name]'"""
1119         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1120
1121     def write_debug(self, msg, *args, **kwargs):
1122         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1123
1124     def get_param(self, name, default=None, *args, **kwargs):
1125         if self._downloader:
1126             return self._downloader.params.get(name, default, *args, **kwargs)
1127         return default
1128
1129     def report_drm(self, video_id, partial=NO_DEFAULT):
1130         if partial is not NO_DEFAULT:
1131             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1132         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1133
1134     def report_extraction(self, id_or_name):
1135         """Report information extraction."""
1136         self.to_screen('%s: Extracting information' % id_or_name)
1137
1138     def report_download_webpage(self, video_id):
1139         """Report webpage download."""
1140         self.to_screen('%s: Downloading webpage' % video_id)
1141
1142     def report_age_confirmation(self):
1143         """Report attempt to confirm age."""
1144         self.to_screen('Confirming age')
1145
1146     def report_login(self):
1147         """Report attempt to log in."""
1148         self.to_screen('Logging in')
1149
1150     def raise_login_required(
1151             self, msg='This video is only available for registered users',
1152             metadata_available=False, method=NO_DEFAULT):
1153         if metadata_available and (
1154                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1155             self.report_warning(msg)
1156             return
1157         msg += format_field(self._login_hint(method), None, '. %s')
1158         raise ExtractorError(msg, expected=True)
1159
1160     def raise_geo_restricted(
1161             self, msg='This video is not available from your location due to geo restriction',
1162             countries=None, metadata_available=False):
1163         if metadata_available and (
1164                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1165             self.report_warning(msg)
1166         else:
1167             raise GeoRestrictedError(msg, countries=countries)
1168
1169     def raise_no_formats(self, msg, expected=False, video_id=None):
1170         if expected and (
1171                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1172             self.report_warning(msg, video_id)
1173         elif isinstance(msg, ExtractorError):
1174             raise msg
1175         else:
1176             raise ExtractorError(msg, expected=expected, video_id=video_id)
1177
1178     # Methods for following #608
1179     @staticmethod
1180     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1181         """Returns a URL that points to a page that should be processed"""
1182         if ie is not None:
1183             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1184         if video_id is not None:
1185             kwargs['id'] = video_id
1186         if video_title is not None:
1187             kwargs['title'] = video_title
1188         return {
1189             **kwargs,
1190             '_type': 'url_transparent' if url_transparent else 'url',
1191             'url': url,
1192         }
1193
1194     @classmethod
1195     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1196                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1197         return cls.playlist_result(
1198             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1199             playlist_id, playlist_title, **kwargs)
1200
1201     @staticmethod
1202     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1203         """Returns a playlist"""
1204         if playlist_id:
1205             kwargs['id'] = playlist_id
1206         if playlist_title:
1207             kwargs['title'] = playlist_title
1208         if playlist_description is not None:
1209             kwargs['description'] = playlist_description
1210         return {
1211             **kwargs,
1212             '_type': 'multi_video' if multi_video else 'playlist',
1213             'entries': entries,
1214         }
1215
1216     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1217         """
1218         Perform a regex search on the given string, using a single or a list of
1219         patterns returning the first matching group.
1220         In case of failure return a default value or raise a WARNING or a
1221         RegexNotFoundError, depending on fatal, specifying the field name.
1222         """
1223         if string is None:
1224             mobj = None
1225         elif isinstance(pattern, (str, re.Pattern)):
1226             mobj = re.search(pattern, string, flags)
1227         else:
1228             for p in pattern:
1229                 mobj = re.search(p, string, flags)
1230                 if mobj:
1231                     break
1232
1233         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1234
1235         if mobj:
1236             if group is None:
1237                 # return the first matching group
1238                 return next(g for g in mobj.groups() if g is not None)
1239             elif isinstance(group, (list, tuple)):
1240                 return tuple(mobj.group(g) for g in group)
1241             else:
1242                 return mobj.group(group)
1243         elif default is not NO_DEFAULT:
1244             return default
1245         elif fatal:
1246             raise RegexNotFoundError('Unable to extract %s' % _name)
1247         else:
1248             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1249             return None
1250
1251     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1252                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1253         """Searches string for the JSON object specified by start_pattern"""
1254         # NB: end_pattern is only used to reduce the size of the initial match
1255         if default is NO_DEFAULT:
1256             default, has_default = {}, False
1257         else:
1258             fatal, has_default = False, True
1259
1260         json_string = self._search_regex(
1261             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1262             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1263         if not json_string:
1264             return default
1265
1266         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1267         try:
1268             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1269         except ExtractorError as e:
1270             if fatal:
1271                 raise ExtractorError(
1272                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1273             elif not has_default:
1274                 self.report_warning(
1275                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1276         return default
1277
1278     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1279         """
1280         Like _search_regex, but strips HTML tags and unescapes entities.
1281         """
1282         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1283         if isinstance(res, tuple):
1284             return tuple(map(clean_html, res))
1285         return clean_html(res)
1286
1287     def _get_netrc_login_info(self, netrc_machine=None):
1288         username = None
1289         password = None
1290         netrc_machine = netrc_machine or self._NETRC_MACHINE
1291
1292         if self.get_param('usenetrc', False):
1293             try:
1294                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1295                 if os.path.isdir(netrc_file):
1296                     netrc_file = os.path.join(netrc_file, '.netrc')
1297                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1298                 if info is not None:
1299                     username = info[0]
1300                     password = info[2]
1301                 else:
1302                     raise netrc.NetrcParseError(
1303                         'No authenticators for %s' % netrc_machine)
1304             except (OSError, netrc.NetrcParseError) as err:
1305                 self.report_warning(
1306                     'parsing .netrc: %s' % error_to_compat_str(err))
1307
1308         return username, password
1309
1310     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1311         """
1312         Get the login info as (username, password)
1313         First look for the manually specified credentials using username_option
1314         and password_option as keys in params dictionary. If no such credentials
1315         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1316         value.
1317         If there's no info available, return (None, None)
1318         """
1319
1320         # Attempt to use provided username and password or .netrc data
1321         username = self.get_param(username_option)
1322         if username is not None:
1323             password = self.get_param(password_option)
1324         else:
1325             username, password = self._get_netrc_login_info(netrc_machine)
1326
1327         return username, password
1328
1329     def _get_tfa_info(self, note='two-factor verification code'):
1330         """
1331         Get the two-factor authentication info
1332         TODO - asking the user will be required for sms/phone verify
1333         currently just uses the command line option
1334         If there's no info available, return None
1335         """
1336
1337         tfa = self.get_param('twofactor')
1338         if tfa is not None:
1339             return tfa
1340
1341         return getpass.getpass('Type %s and press [Return]: ' % note)
1342
1343     # Helper functions for extracting OpenGraph info
1344     @staticmethod
1345     def _og_regexes(prop):
1346         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1347         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1348                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1349         template = r'<meta[^>]+?%s[^>]+?%s'
1350         return [
1351             template % (property_re, content_re),
1352             template % (content_re, property_re),
1353         ]
1354
1355     @staticmethod
1356     def _meta_regex(prop):
1357         return r'''(?isx)<meta
1358                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1359                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1360
1361     def _og_search_property(self, prop, html, name=None, **kargs):
1362         prop = variadic(prop)
1363         if name is None:
1364             name = 'OpenGraph %s' % prop[0]
1365         og_regexes = []
1366         for p in prop:
1367             og_regexes.extend(self._og_regexes(p))
1368         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1369         if escaped is None:
1370             return None
1371         return unescapeHTML(escaped)
1372
1373     def _og_search_thumbnail(self, html, **kargs):
1374         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1375
1376     def _og_search_description(self, html, **kargs):
1377         return self._og_search_property('description', html, fatal=False, **kargs)
1378
1379     def _og_search_title(self, html, *, fatal=False, **kargs):
1380         return self._og_search_property('title', html, fatal=fatal, **kargs)
1381
1382     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1383         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1384         if secure:
1385             regexes = self._og_regexes('video:secure_url') + regexes
1386         return self._html_search_regex(regexes, html, name, **kargs)
1387
1388     def _og_search_url(self, html, **kargs):
1389         return self._og_search_property('url', html, **kargs)
1390
1391     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1392         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1393
1394     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1395         name = variadic(name)
1396         if display_name is None:
1397             display_name = name[0]
1398         return self._html_search_regex(
1399             [self._meta_regex(n) for n in name],
1400             html, display_name, fatal=fatal, group='content', **kwargs)
1401
1402     def _dc_search_uploader(self, html):
1403         return self._html_search_meta('dc.creator', html, 'uploader')
1404
1405     @staticmethod
1406     def _rta_search(html):
1407         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1408         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1409                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1410                      html):
1411             return 18
1412
1413         # And then there are the jokers who advertise that they use RTA, but actually don't.
1414         AGE_LIMIT_MARKERS = [
1415             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1416             r'>[^<]*you acknowledge you are at least (\d+) years old',
1417             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1418         ]
1419
1420         age_limit = 0
1421         for marker in AGE_LIMIT_MARKERS:
1422             mobj = re.search(marker, html)
1423             if mobj:
1424                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1425         return age_limit
1426
1427     def _media_rating_search(self, html):
1428         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1429         rating = self._html_search_meta('rating', html)
1430
1431         if not rating:
1432             return None
1433
1434         RATING_TABLE = {
1435             'safe for kids': 0,
1436             'general': 8,
1437             '14 years': 14,
1438             'mature': 17,
1439             'restricted': 19,
1440         }
1441         return RATING_TABLE.get(rating.lower())
1442
1443     def _family_friendly_search(self, html):
1444         # See http://schema.org/VideoObject
1445         family_friendly = self._html_search_meta(
1446             'isFamilyFriendly', html, default=None)
1447
1448         if not family_friendly:
1449             return None
1450
1451         RATING_TABLE = {
1452             '1': 0,
1453             'true': 0,
1454             '0': 18,
1455             'false': 18,
1456         }
1457         return RATING_TABLE.get(family_friendly.lower())
1458
1459     def _twitter_search_player(self, html):
1460         return self._html_search_meta('twitter:player', html,
1461                                       'twitter card player')
1462
1463     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1464         """Yield all json ld objects in the html"""
1465         if default is not NO_DEFAULT:
1466             fatal = False
1467         for mobj in re.finditer(JSON_LD_RE, html):
1468             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1469             for json_ld in variadic(json_ld_item):
1470                 if isinstance(json_ld, dict):
1471                     yield json_ld
1472
1473     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1474         """Search for a video in any json ld in the html"""
1475         if default is not NO_DEFAULT:
1476             fatal = False
1477         info = self._json_ld(
1478             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1479             video_id, fatal=fatal, expected_type=expected_type)
1480         if info:
1481             return info
1482         if default is not NO_DEFAULT:
1483             return default
1484         elif fatal:
1485             raise RegexNotFoundError('Unable to extract JSON-LD')
1486         else:
1487             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1488             return {}
1489
1490     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1491         if isinstance(json_ld, str):
1492             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1493         if not json_ld:
1494             return {}
1495         info = {}
1496
1497         INTERACTION_TYPE_MAP = {
1498             'CommentAction': 'comment',
1499             'AgreeAction': 'like',
1500             'DisagreeAction': 'dislike',
1501             'LikeAction': 'like',
1502             'DislikeAction': 'dislike',
1503             'ListenAction': 'view',
1504             'WatchAction': 'view',
1505             'ViewAction': 'view',
1506         }
1507
1508         def is_type(e, *expected_types):
1509             type = variadic(traverse_obj(e, '@type'))
1510             return any(x in type for x in expected_types)
1511
1512         def extract_interaction_type(e):
1513             interaction_type = e.get('interactionType')
1514             if isinstance(interaction_type, dict):
1515                 interaction_type = interaction_type.get('@type')
1516             return str_or_none(interaction_type)
1517
1518         def extract_interaction_statistic(e):
1519             interaction_statistic = e.get('interactionStatistic')
1520             if isinstance(interaction_statistic, dict):
1521                 interaction_statistic = [interaction_statistic]
1522             if not isinstance(interaction_statistic, list):
1523                 return
1524             for is_e in interaction_statistic:
1525                 if not is_type(is_e, 'InteractionCounter'):
1526                     continue
1527                 interaction_type = extract_interaction_type(is_e)
1528                 if not interaction_type:
1529                     continue
1530                 # For interaction count some sites provide string instead of
1531                 # an integer (as per spec) with non digit characters (e.g. ",")
1532                 # so extracting count with more relaxed str_to_int
1533                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1534                 if interaction_count is None:
1535                     continue
1536                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1537                 if not count_kind:
1538                     continue
1539                 count_key = '%s_count' % count_kind
1540                 if info.get(count_key) is not None:
1541                     continue
1542                 info[count_key] = interaction_count
1543
1544         def extract_chapter_information(e):
1545             chapters = [{
1546                 'title': part.get('name'),
1547                 'start_time': part.get('startOffset'),
1548                 'end_time': part.get('endOffset'),
1549             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1550             for idx, (last_c, current_c, next_c) in enumerate(zip(
1551                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1552                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1553                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1554                 if None in current_c.values():
1555                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1556                     return
1557             if chapters:
1558                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1559                 info['chapters'] = chapters
1560
1561         def extract_video_object(e):
1562             author = e.get('author')
1563             info.update({
1564                 'url': url_or_none(e.get('contentUrl')),
1565                 'ext': mimetype2ext(e.get('encodingFormat')),
1566                 'title': unescapeHTML(e.get('name')),
1567                 'description': unescapeHTML(e.get('description')),
1568                 'thumbnails': [{'url': unescapeHTML(url)}
1569                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1570                                if url_or_none(url)],
1571                 'duration': parse_duration(e.get('duration')),
1572                 'timestamp': unified_timestamp(e.get('uploadDate')),
1573                 # author can be an instance of 'Organization' or 'Person' types.
1574                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1575                 # however some websites are using 'Text' type instead.
1576                 # 1. https://schema.org/VideoObject
1577                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1578                 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1579                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1580                 'tbr': int_or_none(e.get('bitrate')),
1581                 'width': int_or_none(e.get('width')),
1582                 'height': int_or_none(e.get('height')),
1583                 'view_count': int_or_none(e.get('interactionCount')),
1584                 'tags': try_call(lambda: e.get('keywords').split(',')),
1585             })
1586             if is_type(e, 'AudioObject'):
1587                 info.update({
1588                     'vcodec': 'none',
1589                     'abr': int_or_none(e.get('bitrate')),
1590                 })
1591             extract_interaction_statistic(e)
1592             extract_chapter_information(e)
1593
1594         def traverse_json_ld(json_ld, at_top_level=True):
1595             for e in variadic(json_ld):
1596                 if not isinstance(e, dict):
1597                     continue
1598                 if at_top_level and '@context' not in e:
1599                     continue
1600                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1601                     traverse_json_ld(e['@graph'], at_top_level=False)
1602                     continue
1603                 if expected_type is not None and not is_type(e, expected_type):
1604                     continue
1605                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1606                 if rating is not None:
1607                     info['average_rating'] = rating
1608                 if is_type(e, 'TVEpisode', 'Episode'):
1609                     episode_name = unescapeHTML(e.get('name'))
1610                     info.update({
1611                         'episode': episode_name,
1612                         'episode_number': int_or_none(e.get('episodeNumber')),
1613                         'description': unescapeHTML(e.get('description')),
1614                     })
1615                     if not info.get('title') and episode_name:
1616                         info['title'] = episode_name
1617                     part_of_season = e.get('partOfSeason')
1618                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1619                         info.update({
1620                             'season': unescapeHTML(part_of_season.get('name')),
1621                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1622                         })
1623                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1624                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1625                         info['series'] = unescapeHTML(part_of_series.get('name'))
1626                 elif is_type(e, 'Movie'):
1627                     info.update({
1628                         'title': unescapeHTML(e.get('name')),
1629                         'description': unescapeHTML(e.get('description')),
1630                         'duration': parse_duration(e.get('duration')),
1631                         'timestamp': unified_timestamp(e.get('dateCreated')),
1632                     })
1633                 elif is_type(e, 'Article', 'NewsArticle'):
1634                     info.update({
1635                         'timestamp': parse_iso8601(e.get('datePublished')),
1636                         'title': unescapeHTML(e.get('headline')),
1637                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1638                     })
1639                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1640                         extract_video_object(e['video'][0])
1641                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1642                         extract_video_object(e['subjectOf'][0])
1643                 elif is_type(e, 'VideoObject', 'AudioObject'):
1644                     extract_video_object(e)
1645                     if expected_type is None:
1646                         continue
1647                     else:
1648                         break
1649                 video = e.get('video')
1650                 if is_type(video, 'VideoObject'):
1651                     extract_video_object(video)
1652                 if expected_type is None:
1653                     continue
1654                 else:
1655                     break
1656
1657         traverse_json_ld(json_ld)
1658         return filter_dict(info)
1659
1660     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1661         return self._parse_json(
1662             self._search_regex(
1663                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1664                 webpage, 'next.js data', fatal=fatal, **kw),
1665             video_id, transform_source=transform_source, fatal=fatal)
1666
1667     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1668         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1669         rectx = re.escape(context_name)
1670         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1671         js, arg_keys, arg_vals = self._search_regex(
1672             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1673             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1674             default=NO_DEFAULT if fatal else (None, None, None))
1675         if js is None:
1676             return {}
1677
1678         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1679             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1680
1681         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1682         return traverse_obj(ret, traverse) or {}
1683
1684     @staticmethod
1685     def _hidden_inputs(html):
1686         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1687         hidden_inputs = {}
1688         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1689             attrs = extract_attributes(input)
1690             if not input:
1691                 continue
1692             if attrs.get('type') not in ('hidden', 'submit'):
1693                 continue
1694             name = attrs.get('name') or attrs.get('id')
1695             value = attrs.get('value')
1696             if name and value is not None:
1697                 hidden_inputs[name] = value
1698         return hidden_inputs
1699
1700     def _form_hidden_inputs(self, form_id, html):
1701         form = self._search_regex(
1702             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1703             html, '%s form' % form_id, group='form')
1704         return self._hidden_inputs(form)
1705
1706     @classproperty(cache=True)
1707     def FormatSort(cls):
1708         class FormatSort(FormatSorter):
1709             def __init__(ie, *args, **kwargs):
1710                 super().__init__(ie._downloader, *args, **kwargs)
1711
1712         deprecation_warning(
1713             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1714             'Use yt_dlp.utils.FormatSorter instead')
1715         return FormatSort
1716
1717     def _sort_formats(self, formats, field_preference=[]):
1718         if not field_preference:
1719             self._downloader.deprecation_warning(
1720                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1721             return
1722         self._downloader.deprecation_warning(
1723             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1724             'Return _format_sort_fields in the info_dict instead')
1725         if formats:
1726             formats[0]['__sort_fields'] = field_preference
1727
1728     def _check_formats(self, formats, video_id):
1729         if formats:
1730             formats[:] = filter(
1731                 lambda f: self._is_valid_url(
1732                     f['url'], video_id,
1733                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1734                 formats)
1735
1736     @staticmethod
1737     def _remove_duplicate_formats(formats):
1738         format_urls = set()
1739         unique_formats = []
1740         for f in formats:
1741             if f['url'] not in format_urls:
1742                 format_urls.add(f['url'])
1743                 unique_formats.append(f)
1744         formats[:] = unique_formats
1745
1746     def _is_valid_url(self, url, video_id, item='video', headers={}):
1747         url = self._proto_relative_url(url, scheme='http:')
1748         # For now assume non HTTP(S) URLs always valid
1749         if not (url.startswith('http://') or url.startswith('https://')):
1750             return True
1751         try:
1752             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1753             return True
1754         except ExtractorError as e:
1755             self.to_screen(
1756                 '%s: %s URL is invalid, skipping: %s'
1757                 % (video_id, item, error_to_compat_str(e.cause)))
1758             return False
1759
1760     def http_scheme(self):
1761         """ Either "http:" or "https:", depending on the user's preferences """
1762         return (
1763             'http:'
1764             if self.get_param('prefer_insecure', False)
1765             else 'https:')
1766
1767     def _proto_relative_url(self, url, scheme=None):
1768         scheme = scheme or self.http_scheme()
1769         assert scheme.endswith(':')
1770         return sanitize_url(url, scheme=scheme[:-1])
1771
1772     def _sleep(self, timeout, video_id, msg_template=None):
1773         if msg_template is None:
1774             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1775         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1776         self.to_screen(msg)
1777         time.sleep(timeout)
1778
1779     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1780                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1781                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1782         if self.get_param('ignore_no_formats_error'):
1783             fatal = False
1784
1785         res = self._download_xml_handle(
1786             manifest_url, video_id, 'Downloading f4m manifest',
1787             'Unable to download f4m manifest',
1788             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1789             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1790             transform_source=transform_source,
1791             fatal=fatal, data=data, headers=headers, query=query)
1792         if res is False:
1793             return []
1794
1795         manifest, urlh = res
1796         manifest_url = urlh.geturl()
1797
1798         return self._parse_f4m_formats(
1799             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1800             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1801
1802     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1803                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1804                            fatal=True, m3u8_id=None):
1805         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1806             return []
1807
1808         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1809         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1810         if akamai_pv is not None and ';' in akamai_pv.text:
1811             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1812             if playerVerificationChallenge.strip() != '':
1813                 return []
1814
1815         formats = []
1816         manifest_version = '1.0'
1817         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1818         if not media_nodes:
1819             manifest_version = '2.0'
1820             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1821         # Remove unsupported DRM protected media from final formats
1822         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1823         media_nodes = remove_encrypted_media(media_nodes)
1824         if not media_nodes:
1825             return formats
1826
1827         manifest_base_url = get_base_url(manifest)
1828
1829         bootstrap_info = xpath_element(
1830             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1831             'bootstrap info', default=None)
1832
1833         vcodec = None
1834         mime_type = xpath_text(
1835             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1836             'base URL', default=None)
1837         if mime_type and mime_type.startswith('audio/'):
1838             vcodec = 'none'
1839
1840         for i, media_el in enumerate(media_nodes):
1841             tbr = int_or_none(media_el.attrib.get('bitrate'))
1842             width = int_or_none(media_el.attrib.get('width'))
1843             height = int_or_none(media_el.attrib.get('height'))
1844             format_id = join_nonempty(f4m_id, tbr or i)
1845             # If <bootstrapInfo> is present, the specified f4m is a
1846             # stream-level manifest, and only set-level manifests may refer to
1847             # external resources.  See section 11.4 and section 4 of F4M spec
1848             if bootstrap_info is None:
1849                 media_url = None
1850                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1851                 if manifest_version == '2.0':
1852                     media_url = media_el.attrib.get('href')
1853                 if media_url is None:
1854                     media_url = media_el.attrib.get('url')
1855                 if not media_url:
1856                     continue
1857                 manifest_url = (
1858                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1859                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1860                 # If media_url is itself a f4m manifest do the recursive extraction
1861                 # since bitrates in parent manifest (this one) and media_url manifest
1862                 # may differ leading to inability to resolve the format by requested
1863                 # bitrate in f4m downloader
1864                 ext = determine_ext(manifest_url)
1865                 if ext == 'f4m':
1866                     f4m_formats = self._extract_f4m_formats(
1867                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1868                         transform_source=transform_source, fatal=fatal)
1869                     # Sometimes stream-level manifest contains single media entry that
1870                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1871                     # At the same time parent's media entry in set-level manifest may
1872                     # contain it. We will copy it from parent in such cases.
1873                     if len(f4m_formats) == 1:
1874                         f = f4m_formats[0]
1875                         f.update({
1876                             'tbr': f.get('tbr') or tbr,
1877                             'width': f.get('width') or width,
1878                             'height': f.get('height') or height,
1879                             'format_id': f.get('format_id') if not tbr else format_id,
1880                             'vcodec': vcodec,
1881                         })
1882                     formats.extend(f4m_formats)
1883                     continue
1884                 elif ext == 'm3u8':
1885                     formats.extend(self._extract_m3u8_formats(
1886                         manifest_url, video_id, 'mp4', preference=preference,
1887                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1888                     continue
1889             formats.append({
1890                 'format_id': format_id,
1891                 'url': manifest_url,
1892                 'manifest_url': manifest_url,
1893                 'ext': 'flv' if bootstrap_info is not None else None,
1894                 'protocol': 'f4m',
1895                 'tbr': tbr,
1896                 'width': width,
1897                 'height': height,
1898                 'vcodec': vcodec,
1899                 'preference': preference,
1900                 'quality': quality,
1901             })
1902         return formats
1903
1904     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1905         return {
1906             'format_id': join_nonempty(m3u8_id, 'meta'),
1907             'url': m3u8_url,
1908             'ext': ext,
1909             'protocol': 'm3u8',
1910             'preference': preference - 100 if preference else -100,
1911             'quality': quality,
1912             'resolution': 'multiple',
1913             'format_note': 'Quality selection URL',
1914         }
1915
1916     def _report_ignoring_subs(self, name):
1917         self.report_warning(bug_reports_message(
1918             f'Ignoring subtitle tracks found in the {name} manifest; '
1919             'if any subtitle tracks are missing,'
1920         ), only_once=True)
1921
1922     def _extract_m3u8_formats(self, *args, **kwargs):
1923         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1924         if subs:
1925             self._report_ignoring_subs('HLS')
1926         return fmts
1927
1928     def _extract_m3u8_formats_and_subtitles(
1929             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1930             preference=None, quality=None, m3u8_id=None, note=None,
1931             errnote=None, fatal=True, live=False, data=None, headers={},
1932             query={}):
1933
1934         if self.get_param('ignore_no_formats_error'):
1935             fatal = False
1936
1937         if not m3u8_url:
1938             if errnote is not False:
1939                 errnote = errnote or 'Failed to obtain m3u8 URL'
1940                 if fatal:
1941                     raise ExtractorError(errnote, video_id=video_id)
1942                 self.report_warning(f'{errnote}{bug_reports_message()}')
1943             return [], {}
1944
1945         res = self._download_webpage_handle(
1946             m3u8_url, video_id,
1947             note='Downloading m3u8 information' if note is None else note,
1948             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1949             fatal=fatal, data=data, headers=headers, query=query)
1950
1951         if res is False:
1952             return [], {}
1953
1954         m3u8_doc, urlh = res
1955         m3u8_url = urlh.geturl()
1956
1957         return self._parse_m3u8_formats_and_subtitles(
1958             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1959             preference=preference, quality=quality, m3u8_id=m3u8_id,
1960             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1961             headers=headers, query=query, video_id=video_id)
1962
1963     def _parse_m3u8_formats_and_subtitles(
1964             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1965             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1966             errnote=None, fatal=True, data=None, headers={}, query={},
1967             video_id=None):
1968         formats, subtitles = [], {}
1969
1970         has_drm = re.search('|'.join([
1971             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
1972             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
1973         ]), m3u8_doc)
1974
1975         def format_url(url):
1976             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1977
1978         if self.get_param('hls_split_discontinuity', False):
1979             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1980                 if not m3u8_doc:
1981                     if not manifest_url:
1982                         return []
1983                     m3u8_doc = self._download_webpage(
1984                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1985                         note=False, errnote='Failed to download m3u8 playlist information')
1986                     if m3u8_doc is False:
1987                         return []
1988                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
1989
1990         else:
1991             def _extract_m3u8_playlist_indices(*args, **kwargs):
1992                 return [None]
1993
1994         # References:
1995         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1996         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1997         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1998
1999         # We should try extracting formats only from master playlists [1, 4.3.4],
2000         # i.e. playlists that describe available qualities. On the other hand
2001         # media playlists [1, 4.3.3] should be returned as is since they contain
2002         # just the media without qualities renditions.
2003         # Fortunately, master playlist can be easily distinguished from media
2004         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2005         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2006         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2007         # media playlist and MUST NOT appear in master playlist thus we can
2008         # clearly detect media playlist with this criterion.
2009
2010         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2011             formats = [{
2012                 'format_id': join_nonempty(m3u8_id, idx),
2013                 'format_index': idx,
2014                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2015                 'ext': ext,
2016                 'protocol': entry_protocol,
2017                 'preference': preference,
2018                 'quality': quality,
2019                 'has_drm': has_drm,
2020             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2021
2022             return formats, subtitles
2023
2024         groups = {}
2025         last_stream_inf = {}
2026
2027         def extract_media(x_media_line):
2028             media = parse_m3u8_attributes(x_media_line)
2029             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2030             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2031             if not (media_type and group_id and name):
2032                 return
2033             groups.setdefault(group_id, []).append(media)
2034             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2035             if media_type == 'SUBTITLES':
2036                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2037                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2038                 # However, lack of URI has been spotted in the wild.
2039                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2040                 if not media.get('URI'):
2041                     return
2042                 url = format_url(media['URI'])
2043                 sub_info = {
2044                     'url': url,
2045                     'ext': determine_ext(url),
2046                 }
2047                 if sub_info['ext'] == 'm3u8':
2048                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2049                     # files may contain is WebVTT:
2050                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2051                     sub_info['ext'] = 'vtt'
2052                     sub_info['protocol'] = 'm3u8_native'
2053                 lang = media.get('LANGUAGE') or 'und'
2054                 subtitles.setdefault(lang, []).append(sub_info)
2055             if media_type not in ('VIDEO', 'AUDIO'):
2056                 return
2057             media_url = media.get('URI')
2058             if media_url:
2059                 manifest_url = format_url(media_url)
2060                 formats.extend({
2061                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2062                     'format_note': name,
2063                     'format_index': idx,
2064                     'url': manifest_url,
2065                     'manifest_url': m3u8_url,
2066                     'language': media.get('LANGUAGE'),
2067                     'ext': ext,
2068                     'protocol': entry_protocol,
2069                     'preference': preference,
2070                     'quality': quality,
2071                     'has_drm': has_drm,
2072                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2073                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2074
2075         def build_stream_name():
2076             # Despite specification does not mention NAME attribute for
2077             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2078             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2079             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2080             stream_name = last_stream_inf.get('NAME')
2081             if stream_name:
2082                 return stream_name
2083             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2084             # from corresponding rendition group
2085             stream_group_id = last_stream_inf.get('VIDEO')
2086             if not stream_group_id:
2087                 return
2088             stream_group = groups.get(stream_group_id)
2089             if not stream_group:
2090                 return stream_group_id
2091             rendition = stream_group[0]
2092             return rendition.get('NAME') or stream_group_id
2093
2094         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2095         # chance to detect video only formats when EXT-X-STREAM-INF tags
2096         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2097         for line in m3u8_doc.splitlines():
2098             if line.startswith('#EXT-X-MEDIA:'):
2099                 extract_media(line)
2100
2101         for line in m3u8_doc.splitlines():
2102             if line.startswith('#EXT-X-STREAM-INF:'):
2103                 last_stream_inf = parse_m3u8_attributes(line)
2104             elif line.startswith('#') or not line.strip():
2105                 continue
2106             else:
2107                 tbr = float_or_none(
2108                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2109                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2110                 manifest_url = format_url(line.strip())
2111
2112                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2113                     format_id = [m3u8_id, None, idx]
2114                     # Bandwidth of live streams may differ over time thus making
2115                     # format_id unpredictable. So it's better to keep provided
2116                     # format_id intact.
2117                     if not live:
2118                         stream_name = build_stream_name()
2119                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2120                     f = {
2121                         'format_id': join_nonempty(*format_id),
2122                         'format_index': idx,
2123                         'url': manifest_url,
2124                         'manifest_url': m3u8_url,
2125                         'tbr': tbr,
2126                         'ext': ext,
2127                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2128                         'protocol': entry_protocol,
2129                         'preference': preference,
2130                         'quality': quality,
2131                         'has_drm': has_drm,
2132                     }
2133                     resolution = last_stream_inf.get('RESOLUTION')
2134                     if resolution:
2135                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2136                         if mobj:
2137                             f['width'] = int(mobj.group('width'))
2138                             f['height'] = int(mobj.group('height'))
2139                     # Unified Streaming Platform
2140                     mobj = re.search(
2141                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2142                     if mobj:
2143                         abr, vbr = mobj.groups()
2144                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2145                         f.update({
2146                             'vbr': vbr,
2147                             'abr': abr,
2148                         })
2149                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2150                     f.update(codecs)
2151                     audio_group_id = last_stream_inf.get('AUDIO')
2152                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2153                     # references a rendition group MUST have a CODECS attribute.
2154                     # However, this is not always respected. E.g. [2]
2155                     # contains EXT-X-STREAM-INF tag which references AUDIO
2156                     # rendition group but does not have CODECS and despite
2157                     # referencing an audio group it represents a complete
2158                     # (with audio and video) format. So, for such cases we will
2159                     # ignore references to rendition groups and treat them
2160                     # as complete formats.
2161                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2162                         audio_group = groups.get(audio_group_id)
2163                         if audio_group and audio_group[0].get('URI'):
2164                             # TODO: update acodec for audio only formats with
2165                             # the same GROUP-ID
2166                             f['acodec'] = 'none'
2167                     if not f.get('ext'):
2168                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2169                     formats.append(f)
2170
2171                     # for DailyMotion
2172                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2173                     if progressive_uri:
2174                         http_f = f.copy()
2175                         del http_f['manifest_url']
2176                         http_f.update({
2177                             'format_id': f['format_id'].replace('hls-', 'http-'),
2178                             'protocol': 'http',
2179                             'url': progressive_uri,
2180                         })
2181                         formats.append(http_f)
2182
2183                 last_stream_inf = {}
2184         return formats, subtitles
2185
2186     def _extract_m3u8_vod_duration(
2187             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2188
2189         m3u8_vod = self._download_webpage(
2190             m3u8_vod_url, video_id,
2191             note='Downloading m3u8 VOD manifest' if note is None else note,
2192             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2193             fatal=False, data=data, headers=headers, query=query)
2194
2195         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2196
2197     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2198         if '#EXT-X-ENDLIST' not in m3u8_vod:
2199             return None
2200
2201         return int(sum(
2202             float(line[len('#EXTINF:'):].split(',')[0])
2203             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2204
2205     def _extract_mpd_vod_duration(
2206             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2207
2208         mpd_doc = self._download_xml(
2209             mpd_url, video_id,
2210             note='Downloading MPD VOD manifest' if note is None else note,
2211             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2212             fatal=False, data=data, headers=headers, query=query) or {}
2213         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2214
2215     @staticmethod
2216     def _xpath_ns(path, namespace=None):
2217         if not namespace:
2218             return path
2219         out = []
2220         for c in path.split('/'):
2221             if not c or c == '.':
2222                 out.append(c)
2223             else:
2224                 out.append('{%s}%s' % (namespace, c))
2225         return '/'.join(out)
2226
2227     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2228         if self.get_param('ignore_no_formats_error'):
2229             fatal = False
2230
2231         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2232         if res is False:
2233             assert not fatal
2234             return [], {}
2235
2236         smil, urlh = res
2237         smil_url = urlh.geturl()
2238
2239         namespace = self._parse_smil_namespace(smil)
2240
2241         fmts = self._parse_smil_formats(
2242             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2243         subs = self._parse_smil_subtitles(
2244             smil, namespace=namespace)
2245
2246         return fmts, subs
2247
2248     def _extract_smil_formats(self, *args, **kwargs):
2249         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2250         if subs:
2251             self._report_ignoring_subs('SMIL')
2252         return fmts
2253
2254     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2255         res = self._download_smil(smil_url, video_id, fatal=fatal)
2256         if res is False:
2257             return {}
2258
2259         smil, urlh = res
2260         smil_url = urlh.geturl()
2261
2262         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2263
2264     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2265         return self._download_xml_handle(
2266             smil_url, video_id, 'Downloading SMIL file',
2267             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2268
2269     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2270         namespace = self._parse_smil_namespace(smil)
2271
2272         formats = self._parse_smil_formats(
2273             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2274         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2275
2276         video_id = os.path.splitext(url_basename(smil_url))[0]
2277         title = None
2278         description = None
2279         upload_date = None
2280         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2281             name = meta.attrib.get('name')
2282             content = meta.attrib.get('content')
2283             if not name or not content:
2284                 continue
2285             if not title and name == 'title':
2286                 title = content
2287             elif not description and name in ('description', 'abstract'):
2288                 description = content
2289             elif not upload_date and name == 'date':
2290                 upload_date = unified_strdate(content)
2291
2292         thumbnails = [{
2293             'id': image.get('type'),
2294             'url': image.get('src'),
2295             'width': int_or_none(image.get('width')),
2296             'height': int_or_none(image.get('height')),
2297         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2298
2299         return {
2300             'id': video_id,
2301             'title': title or video_id,
2302             'description': description,
2303             'upload_date': upload_date,
2304             'thumbnails': thumbnails,
2305             'formats': formats,
2306             'subtitles': subtitles,
2307         }
2308
2309     def _parse_smil_namespace(self, smil):
2310         return self._search_regex(
2311             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2312
2313     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2314         base = smil_url
2315         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2316             b = meta.get('base') or meta.get('httpBase')
2317             if b:
2318                 base = b
2319                 break
2320
2321         formats = []
2322         rtmp_count = 0
2323         http_count = 0
2324         m3u8_count = 0
2325         imgs_count = 0
2326
2327         srcs = set()
2328         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2329         for medium in media:
2330             src = medium.get('src')
2331             if not src or src in srcs:
2332                 continue
2333             srcs.add(src)
2334
2335             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2336             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2337             width = int_or_none(medium.get('width'))
2338             height = int_or_none(medium.get('height'))
2339             proto = medium.get('proto')
2340             ext = medium.get('ext')
2341             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2342                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2343             streamer = medium.get('streamer') or base
2344
2345             if proto == 'rtmp' or streamer.startswith('rtmp'):
2346                 rtmp_count += 1
2347                 formats.append({
2348                     'url': streamer,
2349                     'play_path': src,
2350                     'ext': 'flv',
2351                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2352                     'tbr': bitrate,
2353                     'filesize': filesize,
2354                     'width': width,
2355                     'height': height,
2356                 })
2357                 if transform_rtmp_url:
2358                     streamer, src = transform_rtmp_url(streamer, src)
2359                     formats[-1].update({
2360                         'url': streamer,
2361                         'play_path': src,
2362                     })
2363                 continue
2364
2365             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2366             src_url = src_url.strip()
2367
2368             if proto == 'm3u8' or src_ext == 'm3u8':
2369                 m3u8_formats = self._extract_m3u8_formats(
2370                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2371                 if len(m3u8_formats) == 1:
2372                     m3u8_count += 1
2373                     m3u8_formats[0].update({
2374                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2375                         'tbr': bitrate,
2376                         'width': width,
2377                         'height': height,
2378                     })
2379                 formats.extend(m3u8_formats)
2380             elif src_ext == 'f4m':
2381                 f4m_url = src_url
2382                 if not f4m_params:
2383                     f4m_params = {
2384                         'hdcore': '3.2.0',
2385                         'plugin': 'flowplayer-3.2.0.1',
2386                     }
2387                 f4m_url += '&' if '?' in f4m_url else '?'
2388                 f4m_url += urllib.parse.urlencode(f4m_params)
2389                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2390             elif src_ext == 'mpd':
2391                 formats.extend(self._extract_mpd_formats(
2392                     src_url, video_id, mpd_id='dash', fatal=False))
2393             elif re.search(r'\.ism/[Mm]anifest', src_url):
2394                 formats.extend(self._extract_ism_formats(
2395                     src_url, video_id, ism_id='mss', fatal=False))
2396             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2397                 http_count += 1
2398                 formats.append({
2399                     'url': src_url,
2400                     'ext': ext or src_ext or 'flv',
2401                     'format_id': 'http-%d' % (bitrate or http_count),
2402                     'tbr': bitrate,
2403                     'filesize': filesize,
2404                     'width': width,
2405                     'height': height,
2406                 })
2407
2408         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2409             src = medium.get('src')
2410             if not src or src in srcs:
2411                 continue
2412             srcs.add(src)
2413
2414             imgs_count += 1
2415             formats.append({
2416                 'format_id': 'imagestream-%d' % (imgs_count),
2417                 'url': src,
2418                 'ext': mimetype2ext(medium.get('type')),
2419                 'acodec': 'none',
2420                 'vcodec': 'none',
2421                 'width': int_or_none(medium.get('width')),
2422                 'height': int_or_none(medium.get('height')),
2423                 'format_note': 'SMIL storyboards',
2424             })
2425
2426         return formats
2427
2428     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2429         urls = []
2430         subtitles = {}
2431         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2432             src = textstream.get('src')
2433             if not src or src in urls:
2434                 continue
2435             urls.append(src)
2436             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2437             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2438             subtitles.setdefault(lang, []).append({
2439                 'url': src,
2440                 'ext': ext,
2441             })
2442         return subtitles
2443
2444     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2445         res = self._download_xml_handle(
2446             xspf_url, playlist_id, 'Downloading xpsf playlist',
2447             'Unable to download xspf manifest', fatal=fatal)
2448         if res is False:
2449             return []
2450
2451         xspf, urlh = res
2452         xspf_url = urlh.geturl()
2453
2454         return self._parse_xspf(
2455             xspf, playlist_id, xspf_url=xspf_url,
2456             xspf_base_url=base_url(xspf_url))
2457
2458     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2459         NS_MAP = {
2460             'xspf': 'http://xspf.org/ns/0/',
2461             's1': 'http://static.streamone.nl/player/ns/0',
2462         }
2463
2464         entries = []
2465         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2466             title = xpath_text(
2467                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2468             description = xpath_text(
2469                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2470             thumbnail = xpath_text(
2471                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2472             duration = float_or_none(
2473                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2474
2475             formats = []
2476             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2477                 format_url = urljoin(xspf_base_url, location.text)
2478                 if not format_url:
2479                     continue
2480                 formats.append({
2481                     'url': format_url,
2482                     'manifest_url': xspf_url,
2483                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2484                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2485                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2486                 })
2487
2488             entries.append({
2489                 'id': playlist_id,
2490                 'title': title,
2491                 'description': description,
2492                 'thumbnail': thumbnail,
2493                 'duration': duration,
2494                 'formats': formats,
2495             })
2496         return entries
2497
2498     def _extract_mpd_formats(self, *args, **kwargs):
2499         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2500         if subs:
2501             self._report_ignoring_subs('DASH')
2502         return fmts
2503
2504     def _extract_mpd_formats_and_subtitles(
2505             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2506             fatal=True, data=None, headers={}, query={}):
2507
2508         if self.get_param('ignore_no_formats_error'):
2509             fatal = False
2510
2511         res = self._download_xml_handle(
2512             mpd_url, video_id,
2513             note='Downloading MPD manifest' if note is None else note,
2514             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2515             fatal=fatal, data=data, headers=headers, query=query)
2516         if res is False:
2517             return [], {}
2518         mpd_doc, urlh = res
2519         if mpd_doc is None:
2520             return [], {}
2521
2522         # We could have been redirected to a new url when we retrieved our mpd file.
2523         mpd_url = urlh.geturl()
2524         mpd_base_url = base_url(mpd_url)
2525
2526         return self._parse_mpd_formats_and_subtitles(
2527             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2528
2529     def _parse_mpd_formats(self, *args, **kwargs):
2530         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2531         if subs:
2532             self._report_ignoring_subs('DASH')
2533         return fmts
2534
2535     def _parse_mpd_formats_and_subtitles(
2536             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2537         """
2538         Parse formats from MPD manifest.
2539         References:
2540          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2541             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2542          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2543         """
2544         if not self.get_param('dynamic_mpd', True):
2545             if mpd_doc.get('type') == 'dynamic':
2546                 return [], {}
2547
2548         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2549
2550         def _add_ns(path):
2551             return self._xpath_ns(path, namespace)
2552
2553         def is_drm_protected(element):
2554             return element.find(_add_ns('ContentProtection')) is not None
2555
2556         def extract_multisegment_info(element, ms_parent_info):
2557             ms_info = ms_parent_info.copy()
2558
2559             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2560             # common attributes and elements.  We will only extract relevant
2561             # for us.
2562             def extract_common(source):
2563                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2564                 if segment_timeline is not None:
2565                     s_e = segment_timeline.findall(_add_ns('S'))
2566                     if s_e:
2567                         ms_info['total_number'] = 0
2568                         ms_info['s'] = []
2569                         for s in s_e:
2570                             r = int(s.get('r', 0))
2571                             ms_info['total_number'] += 1 + r
2572                             ms_info['s'].append({
2573                                 't': int(s.get('t', 0)),
2574                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2575                                 'd': int(s.attrib['d']),
2576                                 'r': r,
2577                             })
2578                 start_number = source.get('startNumber')
2579                 if start_number:
2580                     ms_info['start_number'] = int(start_number)
2581                 timescale = source.get('timescale')
2582                 if timescale:
2583                     ms_info['timescale'] = int(timescale)
2584                 segment_duration = source.get('duration')
2585                 if segment_duration:
2586                     ms_info['segment_duration'] = float(segment_duration)
2587
2588             def extract_Initialization(source):
2589                 initialization = source.find(_add_ns('Initialization'))
2590                 if initialization is not None:
2591                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2592
2593             segment_list = element.find(_add_ns('SegmentList'))
2594             if segment_list is not None:
2595                 extract_common(segment_list)
2596                 extract_Initialization(segment_list)
2597                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2598                 if segment_urls_e:
2599                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2600             else:
2601                 segment_template = element.find(_add_ns('SegmentTemplate'))
2602                 if segment_template is not None:
2603                     extract_common(segment_template)
2604                     media = segment_template.get('media')
2605                     if media:
2606                         ms_info['media'] = media
2607                     initialization = segment_template.get('initialization')
2608                     if initialization:
2609                         ms_info['initialization'] = initialization
2610                     else:
2611                         extract_Initialization(segment_template)
2612             return ms_info
2613
2614         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2615         formats, subtitles = [], {}
2616         stream_numbers = collections.defaultdict(int)
2617         for period in mpd_doc.findall(_add_ns('Period')):
2618             period_duration = parse_duration(period.get('duration')) or mpd_duration
2619             period_ms_info = extract_multisegment_info(period, {
2620                 'start_number': 1,
2621                 'timescale': 1,
2622             })
2623             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2624                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2625                 for representation in adaptation_set.findall(_add_ns('Representation')):
2626                     representation_attrib = adaptation_set.attrib.copy()
2627                     representation_attrib.update(representation.attrib)
2628                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2629                     mime_type = representation_attrib['mimeType']
2630                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2631
2632                     codec_str = representation_attrib.get('codecs', '')
2633                     # Some kind of binary subtitle found in some youtube livestreams
2634                     if mime_type == 'application/x-rawcc':
2635                         codecs = {'scodec': codec_str}
2636                     else:
2637                         codecs = parse_codecs(codec_str)
2638                     if content_type not in ('video', 'audio', 'text'):
2639                         if mime_type == 'image/jpeg':
2640                             content_type = mime_type
2641                         elif codecs.get('vcodec', 'none') != 'none':
2642                             content_type = 'video'
2643                         elif codecs.get('acodec', 'none') != 'none':
2644                             content_type = 'audio'
2645                         elif codecs.get('scodec', 'none') != 'none':
2646                             content_type = 'text'
2647                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2648                             content_type = 'text'
2649                         else:
2650                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2651                             continue
2652
2653                     base_url = ''
2654                     for element in (representation, adaptation_set, period, mpd_doc):
2655                         base_url_e = element.find(_add_ns('BaseURL'))
2656                         if try_call(lambda: base_url_e.text) is not None:
2657                             base_url = base_url_e.text + base_url
2658                             if re.match(r'^https?://', base_url):
2659                                 break
2660                     if mpd_base_url and base_url.startswith('/'):
2661                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2662                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2663                         if not mpd_base_url.endswith('/'):
2664                             mpd_base_url += '/'
2665                         base_url = mpd_base_url + base_url
2666                     representation_id = representation_attrib.get('id')
2667                     lang = representation_attrib.get('lang')
2668                     url_el = representation.find(_add_ns('BaseURL'))
2669                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2670                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2671                     if representation_id is not None:
2672                         format_id = representation_id
2673                     else:
2674                         format_id = content_type
2675                     if mpd_id:
2676                         format_id = mpd_id + '-' + format_id
2677                     if content_type in ('video', 'audio'):
2678                         f = {
2679                             'format_id': format_id,
2680                             'manifest_url': mpd_url,
2681                             'ext': mimetype2ext(mime_type),
2682                             'width': int_or_none(representation_attrib.get('width')),
2683                             'height': int_or_none(representation_attrib.get('height')),
2684                             'tbr': float_or_none(bandwidth, 1000),
2685                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2686                             'fps': int_or_none(representation_attrib.get('frameRate')),
2687                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2688                             'format_note': 'DASH %s' % content_type,
2689                             'filesize': filesize,
2690                             'container': mimetype2ext(mime_type) + '_dash',
2691                             **codecs
2692                         }
2693                     elif content_type == 'text':
2694                         f = {
2695                             'ext': mimetype2ext(mime_type),
2696                             'manifest_url': mpd_url,
2697                             'filesize': filesize,
2698                         }
2699                     elif content_type == 'image/jpeg':
2700                         # See test case in VikiIE
2701                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2702                         f = {
2703                             'format_id': format_id,
2704                             'ext': 'mhtml',
2705                             'manifest_url': mpd_url,
2706                             'format_note': 'DASH storyboards (jpeg)',
2707                             'acodec': 'none',
2708                             'vcodec': 'none',
2709                         }
2710                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2711                         f['has_drm'] = True
2712                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2713
2714                     def prepare_template(template_name, identifiers):
2715                         tmpl = representation_ms_info[template_name]
2716                         if representation_id is not None:
2717                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2718                         # First of, % characters outside $...$ templates
2719                         # must be escaped by doubling for proper processing
2720                         # by % operator string formatting used further (see
2721                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2722                         t = ''
2723                         in_template = False
2724                         for c in tmpl:
2725                             t += c
2726                             if c == '$':
2727                                 in_template = not in_template
2728                             elif c == '%' and not in_template:
2729                                 t += c
2730                         # Next, $...$ templates are translated to their
2731                         # %(...) counterparts to be used with % operator
2732                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2733                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2734                         t.replace('$$', '$')
2735                         return t
2736
2737                     # @initialization is a regular template like @media one
2738                     # so it should be handled just the same way (see
2739                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2740                     if 'initialization' in representation_ms_info:
2741                         initialization_template = prepare_template(
2742                             'initialization',
2743                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2744                             # $Time$ shall not be included for @initialization thus
2745                             # only $Bandwidth$ remains
2746                             ('Bandwidth', ))
2747                         representation_ms_info['initialization_url'] = initialization_template % {
2748                             'Bandwidth': bandwidth,
2749                         }
2750
2751                     def location_key(location):
2752                         return 'url' if re.match(r'^https?://', location) else 'path'
2753
2754                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2755
2756                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2757                         media_location_key = location_key(media_template)
2758
2759                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2760                         # can't be used at the same time
2761                         if '%(Number' in media_template and 's' not in representation_ms_info:
2762                             segment_duration = None
2763                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2764                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2765                                 representation_ms_info['total_number'] = int(math.ceil(
2766                                     float_or_none(period_duration, segment_duration, default=0)))
2767                             representation_ms_info['fragments'] = [{
2768                                 media_location_key: media_template % {
2769                                     'Number': segment_number,
2770                                     'Bandwidth': bandwidth,
2771                                 },
2772                                 'duration': segment_duration,
2773                             } for segment_number in range(
2774                                 representation_ms_info['start_number'],
2775                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2776                         else:
2777                             # $Number*$ or $Time$ in media template with S list available
2778                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2779                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2780                             representation_ms_info['fragments'] = []
2781                             segment_time = 0
2782                             segment_d = None
2783                             segment_number = representation_ms_info['start_number']
2784
2785                             def add_segment_url():
2786                                 segment_url = media_template % {
2787                                     'Time': segment_time,
2788                                     'Bandwidth': bandwidth,
2789                                     'Number': segment_number,
2790                                 }
2791                                 representation_ms_info['fragments'].append({
2792                                     media_location_key: segment_url,
2793                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2794                                 })
2795
2796                             for num, s in enumerate(representation_ms_info['s']):
2797                                 segment_time = s.get('t') or segment_time
2798                                 segment_d = s['d']
2799                                 add_segment_url()
2800                                 segment_number += 1
2801                                 for r in range(s.get('r', 0)):
2802                                     segment_time += segment_d
2803                                     add_segment_url()
2804                                     segment_number += 1
2805                                 segment_time += segment_d
2806                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2807                         # No media template,
2808                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2809                         # or any YouTube dashsegments video
2810                         fragments = []
2811                         segment_index = 0
2812                         timescale = representation_ms_info['timescale']
2813                         for s in representation_ms_info['s']:
2814                             duration = float_or_none(s['d'], timescale)
2815                             for r in range(s.get('r', 0) + 1):
2816                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2817                                 fragments.append({
2818                                     location_key(segment_uri): segment_uri,
2819                                     'duration': duration,
2820                                 })
2821                                 segment_index += 1
2822                         representation_ms_info['fragments'] = fragments
2823                     elif 'segment_urls' in representation_ms_info:
2824                         # Segment URLs with no SegmentTimeline
2825                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2826                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2827                         fragments = []
2828                         segment_duration = float_or_none(
2829                             representation_ms_info['segment_duration'],
2830                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2831                         for segment_url in representation_ms_info['segment_urls']:
2832                             fragment = {
2833                                 location_key(segment_url): segment_url,
2834                             }
2835                             if segment_duration:
2836                                 fragment['duration'] = segment_duration
2837                             fragments.append(fragment)
2838                         representation_ms_info['fragments'] = fragments
2839                     # If there is a fragments key available then we correctly recognized fragmented media.
2840                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2841                     # assumption is not necessarily correct since we may simply have no support for
2842                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2843                     if 'fragments' in representation_ms_info:
2844                         f.update({
2845                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2846                             'url': mpd_url or base_url,
2847                             'fragment_base_url': base_url,
2848                             'fragments': [],
2849                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2850                         })
2851                         if 'initialization_url' in representation_ms_info:
2852                             initialization_url = representation_ms_info['initialization_url']
2853                             if not f.get('url'):
2854                                 f['url'] = initialization_url
2855                             f['fragments'].append({location_key(initialization_url): initialization_url})
2856                         f['fragments'].extend(representation_ms_info['fragments'])
2857                         if not period_duration:
2858                             period_duration = try_get(
2859                                 representation_ms_info,
2860                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2861                     else:
2862                         # Assuming direct URL to unfragmented media.
2863                         f['url'] = base_url
2864                     if content_type in ('video', 'audio', 'image/jpeg'):
2865                         f['manifest_stream_number'] = stream_numbers[f['url']]
2866                         stream_numbers[f['url']] += 1
2867                         formats.append(f)
2868                     elif content_type == 'text':
2869                         subtitles.setdefault(lang or 'und', []).append(f)
2870
2871         return formats, subtitles
2872
2873     def _extract_ism_formats(self, *args, **kwargs):
2874         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2875         if subs:
2876             self._report_ignoring_subs('ISM')
2877         return fmts
2878
2879     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2880         if self.get_param('ignore_no_formats_error'):
2881             fatal = False
2882
2883         res = self._download_xml_handle(
2884             ism_url, video_id,
2885             note='Downloading ISM manifest' if note is None else note,
2886             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2887             fatal=fatal, data=data, headers=headers, query=query)
2888         if res is False:
2889             return [], {}
2890         ism_doc, urlh = res
2891         if ism_doc is None:
2892             return [], {}
2893
2894         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2895
2896     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2897         """
2898         Parse formats from ISM manifest.
2899         References:
2900          1. [MS-SSTR]: Smooth Streaming Protocol,
2901             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2902         """
2903         if ism_doc.get('IsLive') == 'TRUE':
2904             return [], {}
2905
2906         duration = int(ism_doc.attrib['Duration'])
2907         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2908
2909         formats = []
2910         subtitles = {}
2911         for stream in ism_doc.findall('StreamIndex'):
2912             stream_type = stream.get('Type')
2913             if stream_type not in ('video', 'audio', 'text'):
2914                 continue
2915             url_pattern = stream.attrib['Url']
2916             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2917             stream_name = stream.get('Name')
2918             stream_language = stream.get('Language', 'und')
2919             for track in stream.findall('QualityLevel'):
2920                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2921                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2922                 # TODO: add support for WVC1 and WMAP
2923                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2924                     self.report_warning('%s is not a supported codec' % fourcc)
2925                     continue
2926                 tbr = int(track.attrib['Bitrate']) // 1000
2927                 # [1] does not mention Width and Height attributes. However,
2928                 # they're often present while MaxWidth and MaxHeight are
2929                 # missing, so should be used as fallbacks
2930                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2931                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2932                 sampling_rate = int_or_none(track.get('SamplingRate'))
2933
2934                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2935                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2936
2937                 fragments = []
2938                 fragment_ctx = {
2939                     'time': 0,
2940                 }
2941                 stream_fragments = stream.findall('c')
2942                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2943                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2944                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2945                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2946                     if not fragment_ctx['duration']:
2947                         try:
2948                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2949                         except IndexError:
2950                             next_fragment_time = duration
2951                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2952                     for _ in range(fragment_repeat):
2953                         fragments.append({
2954                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2955                             'duration': fragment_ctx['duration'] / stream_timescale,
2956                         })
2957                         fragment_ctx['time'] += fragment_ctx['duration']
2958
2959                 if stream_type == 'text':
2960                     subtitles.setdefault(stream_language, []).append({
2961                         'ext': 'ismt',
2962                         'protocol': 'ism',
2963                         'url': ism_url,
2964                         'manifest_url': ism_url,
2965                         'fragments': fragments,
2966                         '_download_params': {
2967                             'stream_type': stream_type,
2968                             'duration': duration,
2969                             'timescale': stream_timescale,
2970                             'fourcc': fourcc,
2971                             'language': stream_language,
2972                             'codec_private_data': track.get('CodecPrivateData'),
2973                         }
2974                     })
2975                 elif stream_type in ('video', 'audio'):
2976                     formats.append({
2977                         'format_id': join_nonempty(ism_id, stream_name, tbr),
2978                         'url': ism_url,
2979                         'manifest_url': ism_url,
2980                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2981                         'width': width,
2982                         'height': height,
2983                         'tbr': tbr,
2984                         'asr': sampling_rate,
2985                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2986                         'acodec': 'none' if stream_type == 'video' else fourcc,
2987                         'protocol': 'ism',
2988                         'fragments': fragments,
2989                         'has_drm': ism_doc.find('Protection') is not None,
2990                         'language': stream_language,
2991                         'audio_channels': int_or_none(track.get('Channels')),
2992                         '_download_params': {
2993                             'stream_type': stream_type,
2994                             'duration': duration,
2995                             'timescale': stream_timescale,
2996                             'width': width or 0,
2997                             'height': height or 0,
2998                             'fourcc': fourcc,
2999                             'language': stream_language,
3000                             'codec_private_data': track.get('CodecPrivateData'),
3001                             'sampling_rate': sampling_rate,
3002                             'channels': int_or_none(track.get('Channels', 2)),
3003                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3004                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3005                         },
3006                     })
3007         return formats, subtitles
3008
3009     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3010         def absolute_url(item_url):
3011             return urljoin(base_url, item_url)
3012
3013         def parse_content_type(content_type):
3014             if not content_type:
3015                 return {}
3016             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3017             if ctr:
3018                 mimetype, codecs = ctr.groups()
3019                 f = parse_codecs(codecs)
3020                 f['ext'] = mimetype2ext(mimetype)
3021                 return f
3022             return {}
3023
3024         def _media_formats(src, cur_media_type, type_info=None):
3025             type_info = type_info or {}
3026             full_url = absolute_url(src)
3027             ext = type_info.get('ext') or determine_ext(full_url)
3028             if ext == 'm3u8':
3029                 is_plain_url = False
3030                 formats = self._extract_m3u8_formats(
3031                     full_url, video_id, ext='mp4',
3032                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3033                     preference=preference, quality=quality, fatal=False)
3034             elif ext == 'mpd':
3035                 is_plain_url = False
3036                 formats = self._extract_mpd_formats(
3037                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3038             else:
3039                 is_plain_url = True
3040                 formats = [{
3041                     'url': full_url,
3042                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3043                     'ext': ext,
3044                 }]
3045             return is_plain_url, formats
3046
3047         entries = []
3048         # amp-video and amp-audio are very similar to their HTML5 counterparts
3049         # so we will include them right here (see
3050         # https://www.ampproject.org/docs/reference/components/amp-video)
3051         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3052         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3053         media_tags = [(media_tag, media_tag_name, media_type, '')
3054                       for media_tag, media_tag_name, media_type
3055                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3056         media_tags.extend(re.findall(
3057             # We only allow video|audio followed by a whitespace or '>'.
3058             # Allowing more characters may end up in significant slow down (see
3059             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3060             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3061             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3062         for media_tag, _, media_type, media_content in media_tags:
3063             media_info = {
3064                 'formats': [],
3065                 'subtitles': {},
3066             }
3067             media_attributes = extract_attributes(media_tag)
3068             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3069             if src:
3070                 f = parse_content_type(media_attributes.get('type'))
3071                 _, formats = _media_formats(src, media_type, f)
3072                 media_info['formats'].extend(formats)
3073             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3074             if media_content:
3075                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3076                     s_attr = extract_attributes(source_tag)
3077                     # data-video-src and data-src are non standard but seen
3078                     # several times in the wild
3079                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3080                     if not src:
3081                         continue
3082                     f = parse_content_type(s_attr.get('type'))
3083                     is_plain_url, formats = _media_formats(src, media_type, f)
3084                     if is_plain_url:
3085                         # width, height, res, label and title attributes are
3086                         # all not standard but seen several times in the wild
3087                         labels = [
3088                             s_attr.get(lbl)
3089                             for lbl in ('label', 'title')
3090                             if str_or_none(s_attr.get(lbl))
3091                         ]
3092                         width = int_or_none(s_attr.get('width'))
3093                         height = (int_or_none(s_attr.get('height'))
3094                                   or int_or_none(s_attr.get('res')))
3095                         if not width or not height:
3096                             for lbl in labels:
3097                                 resolution = parse_resolution(lbl)
3098                                 if not resolution:
3099                                     continue
3100                                 width = width or resolution.get('width')
3101                                 height = height or resolution.get('height')
3102                         for lbl in labels:
3103                             tbr = parse_bitrate(lbl)
3104                             if tbr:
3105                                 break
3106                         else:
3107                             tbr = None
3108                         f.update({
3109                             'width': width,
3110                             'height': height,
3111                             'tbr': tbr,
3112                             'format_id': s_attr.get('label') or s_attr.get('title'),
3113                         })
3114                         f.update(formats[0])
3115                         media_info['formats'].append(f)
3116                     else:
3117                         media_info['formats'].extend(formats)
3118                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3119                     track_attributes = extract_attributes(track_tag)
3120                     kind = track_attributes.get('kind')
3121                     if not kind or kind in ('subtitles', 'captions'):
3122                         src = strip_or_none(track_attributes.get('src'))
3123                         if not src:
3124                             continue
3125                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3126                         media_info['subtitles'].setdefault(lang, []).append({
3127                             'url': absolute_url(src),
3128                         })
3129             for f in media_info['formats']:
3130                 f.setdefault('http_headers', {})['Referer'] = base_url
3131             if media_info['formats'] or media_info['subtitles']:
3132                 entries.append(media_info)
3133         return entries
3134
3135     def _extract_akamai_formats(self, *args, **kwargs):
3136         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3137         if subs:
3138             self._report_ignoring_subs('akamai')
3139         return fmts
3140
3141     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3142         signed = 'hdnea=' in manifest_url
3143         if not signed:
3144             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3145             manifest_url = re.sub(
3146                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3147                 '', manifest_url).strip('?')
3148
3149         formats = []
3150         subtitles = {}
3151
3152         hdcore_sign = 'hdcore=3.7.0'
3153         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3154         hds_host = hosts.get('hds')
3155         if hds_host:
3156             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3157         if 'hdcore=' not in f4m_url:
3158             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3159         f4m_formats = self._extract_f4m_formats(
3160             f4m_url, video_id, f4m_id='hds', fatal=False)
3161         for entry in f4m_formats:
3162             entry.update({'extra_param_to_segment_url': hdcore_sign})
3163         formats.extend(f4m_formats)
3164
3165         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3166         hls_host = hosts.get('hls')
3167         if hls_host:
3168             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3169         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3170             m3u8_url, video_id, 'mp4', 'm3u8_native',
3171             m3u8_id='hls', fatal=False)
3172         formats.extend(m3u8_formats)
3173         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3174
3175         http_host = hosts.get('http')
3176         if http_host and m3u8_formats and not signed:
3177             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3178             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3179             qualities_length = len(qualities)
3180             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3181                 i = 0
3182                 for f in m3u8_formats:
3183                     if f['vcodec'] != 'none':
3184                         for protocol in ('http', 'https'):
3185                             http_f = f.copy()
3186                             del http_f['manifest_url']
3187                             http_url = re.sub(
3188                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3189                             http_f.update({
3190                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3191                                 'url': http_url,
3192                                 'protocol': protocol,
3193                             })
3194                             formats.append(http_f)
3195                         i += 1
3196
3197         return formats, subtitles
3198
3199     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3200         query = urllib.parse.urlparse(url).query
3201         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3202         mobj = re.search(
3203             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3204         url_base = mobj.group('url')
3205         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3206         formats = []
3207
3208         def manifest_url(manifest):
3209             m_url = f'{http_base_url}/{manifest}'
3210             if query:
3211                 m_url += '?%s' % query
3212             return m_url
3213
3214         if 'm3u8' not in skip_protocols:
3215             formats.extend(self._extract_m3u8_formats(
3216                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3217                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3218         if 'f4m' not in skip_protocols:
3219             formats.extend(self._extract_f4m_formats(
3220                 manifest_url('manifest.f4m'),
3221                 video_id, f4m_id='hds', fatal=False))
3222         if 'dash' not in skip_protocols:
3223             formats.extend(self._extract_mpd_formats(
3224                 manifest_url('manifest.mpd'),
3225                 video_id, mpd_id='dash', fatal=False))
3226         if re.search(r'(?:/smil:|\.smil)', url_base):
3227             if 'smil' not in skip_protocols:
3228                 rtmp_formats = self._extract_smil_formats(
3229                     manifest_url('jwplayer.smil'),
3230                     video_id, fatal=False)
3231                 for rtmp_format in rtmp_formats:
3232                     rtsp_format = rtmp_format.copy()
3233                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3234                     del rtsp_format['play_path']
3235                     del rtsp_format['ext']
3236                     rtsp_format.update({
3237                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3238                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3239                         'protocol': 'rtsp',
3240                     })
3241                     formats.extend([rtmp_format, rtsp_format])
3242         else:
3243             for protocol in ('rtmp', 'rtsp'):
3244                 if protocol not in skip_protocols:
3245                     formats.append({
3246                         'url': f'{protocol}:{url_base}',
3247                         'format_id': protocol,
3248                         'protocol': protocol,
3249                     })
3250         return formats
3251
3252     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3253         mobj = re.search(
3254             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3255             webpage)
3256         if mobj:
3257             try:
3258                 jwplayer_data = self._parse_json(mobj.group('options'),
3259                                                  video_id=video_id,
3260                                                  transform_source=transform_source)
3261             except ExtractorError:
3262                 pass
3263             else:
3264                 if isinstance(jwplayer_data, dict):
3265                     return jwplayer_data
3266
3267     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3268         jwplayer_data = self._find_jwplayer_data(
3269             webpage, video_id, transform_source=js_to_json)
3270         return self._parse_jwplayer_data(
3271             jwplayer_data, video_id, *args, **kwargs)
3272
3273     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3274                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3275         entries = []
3276         if not isinstance(jwplayer_data, dict):
3277             return entries
3278
3279         playlist_items = jwplayer_data.get('playlist')
3280         # JWPlayer backward compatibility: single playlist item/flattened playlists
3281         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3282         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3283         if not isinstance(playlist_items, list):
3284             playlist_items = (playlist_items or jwplayer_data, )
3285
3286         for video_data in playlist_items:
3287             if not isinstance(video_data, dict):
3288                 continue
3289             # JWPlayer backward compatibility: flattened sources
3290             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3291             if 'sources' not in video_data:
3292                 video_data['sources'] = [video_data]
3293
3294             this_video_id = video_id or video_data['mediaid']
3295
3296             formats = self._parse_jwplayer_formats(
3297                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3298                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3299
3300             subtitles = {}
3301             tracks = video_data.get('tracks')
3302             if tracks and isinstance(tracks, list):
3303                 for track in tracks:
3304                     if not isinstance(track, dict):
3305                         continue
3306                     track_kind = track.get('kind')
3307                     if not track_kind or not isinstance(track_kind, str):
3308                         continue
3309                     if track_kind.lower() not in ('captions', 'subtitles'):
3310                         continue
3311                     track_url = urljoin(base_url, track.get('file'))
3312                     if not track_url:
3313                         continue
3314                     subtitles.setdefault(track.get('label') or 'en', []).append({
3315                         'url': self._proto_relative_url(track_url)
3316                     })
3317
3318             entry = {
3319                 'id': this_video_id,
3320                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3321                 'description': clean_html(video_data.get('description')),
3322                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3323                 'timestamp': int_or_none(video_data.get('pubdate')),
3324                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3325                 'subtitles': subtitles,
3326                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3327                 'genre': clean_html(video_data.get('genre')),
3328                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3329                 'season_number': int_or_none(video_data.get('season')),
3330                 'episode_number': int_or_none(video_data.get('episode')),
3331                 'release_year': int_or_none(video_data.get('releasedate')),
3332                 'age_limit': int_or_none(video_data.get('age_restriction')),
3333             }
3334             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3335             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3336                 entry.update({
3337                     '_type': 'url_transparent',
3338                     'url': formats[0]['url'],
3339                 })
3340             else:
3341                 entry['formats'] = formats
3342             entries.append(entry)
3343         if len(entries) == 1:
3344             return entries[0]
3345         else:
3346             return self.playlist_result(entries)
3347
3348     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3349                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3350         urls = set()
3351         formats = []
3352         for source in jwplayer_sources_data:
3353             if not isinstance(source, dict):
3354                 continue
3355             source_url = urljoin(
3356                 base_url, self._proto_relative_url(source.get('file')))
3357             if not source_url or source_url in urls:
3358                 continue
3359             urls.add(source_url)
3360             source_type = source.get('type') or ''
3361             ext = mimetype2ext(source_type) or determine_ext(source_url)
3362             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3363                 formats.extend(self._extract_m3u8_formats(
3364                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3365                     m3u8_id=m3u8_id, fatal=False))
3366             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3367                 formats.extend(self._extract_mpd_formats(
3368                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3369             elif ext == 'smil':
3370                 formats.extend(self._extract_smil_formats(
3371                     source_url, video_id, fatal=False))
3372             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3373             elif source_type.startswith('audio') or ext in (
3374                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3375                 formats.append({
3376                     'url': source_url,
3377                     'vcodec': 'none',
3378                     'ext': ext,
3379                 })
3380             else:
3381                 format_id = str_or_none(source.get('label'))
3382                 height = int_or_none(source.get('height'))
3383                 if height is None and format_id:
3384                     # Often no height is provided but there is a label in
3385                     # format like "1080p", "720p SD", or 1080.
3386                     height = parse_resolution(format_id).get('height')
3387                 a_format = {
3388                     'url': source_url,
3389                     'width': int_or_none(source.get('width')),
3390                     'height': height,
3391                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3392                     'filesize': int_or_none(source.get('filesize')),
3393                     'ext': ext,
3394                     'format_id': format_id
3395                 }
3396                 if source_url.startswith('rtmp'):
3397                     a_format['ext'] = 'flv'
3398                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3399                     # of jwplayer.flash.swf
3400                     rtmp_url_parts = re.split(
3401                         r'((?:mp4|mp3|flv):)', source_url, 1)
3402                     if len(rtmp_url_parts) == 3:
3403                         rtmp_url, prefix, play_path = rtmp_url_parts
3404                         a_format.update({
3405                             'url': rtmp_url,
3406                             'play_path': prefix + play_path,
3407                         })
3408                     if rtmp_params:
3409                         a_format.update(rtmp_params)
3410                 formats.append(a_format)
3411         return formats
3412
3413     def _live_title(self, name):
3414         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3415         return name
3416
3417     def _int(self, v, name, fatal=False, **kwargs):
3418         res = int_or_none(v, **kwargs)
3419         if res is None:
3420             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3421             if fatal:
3422                 raise ExtractorError(msg)
3423             else:
3424                 self.report_warning(msg)
3425         return res
3426
3427     def _float(self, v, name, fatal=False, **kwargs):
3428         res = float_or_none(v, **kwargs)
3429         if res is None:
3430             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3431             if fatal:
3432                 raise ExtractorError(msg)
3433             else:
3434                 self.report_warning(msg)
3435         return res
3436
3437     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3438                     path='/', secure=False, discard=False, rest={}, **kwargs):
3439         cookie = http.cookiejar.Cookie(
3440             0, name, value, port, port is not None, domain, True,
3441             domain.startswith('.'), path, True, secure, expire_time,
3442             discard, None, None, rest)
3443         self.cookiejar.set_cookie(cookie)
3444
3445     def _get_cookies(self, url):
3446         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3447         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3448
3449     def _apply_first_set_cookie_header(self, url_handle, cookie):
3450         """
3451         Apply first Set-Cookie header instead of the last. Experimental.
3452
3453         Some sites (e.g. [1-3]) may serve two cookies under the same name
3454         in Set-Cookie header and expect the first (old) one to be set rather
3455         than second (new). However, as of RFC6265 the newer one cookie
3456         should be set into cookie store what actually happens.
3457         We will workaround this issue by resetting the cookie to
3458         the first one manually.
3459         1. https://new.vk.com/
3460         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3461         3. https://learning.oreilly.com/
3462         """
3463         for header, cookies in url_handle.headers.items():
3464             if header.lower() != 'set-cookie':
3465                 continue
3466             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3467             cookie_value = re.search(
3468                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3469             if cookie_value:
3470                 value, domain = cookie_value.groups()
3471                 self._set_cookie(domain, cookie, value)
3472                 break
3473
3474     @classmethod
3475     def get_testcases(cls, include_onlymatching=False):
3476         # Do not look in super classes
3477         t = vars(cls).get('_TEST')
3478         if t:
3479             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3480             tests = [t]
3481         else:
3482             tests = vars(cls).get('_TESTS', [])
3483         for t in tests:
3484             if not include_onlymatching and t.get('only_matching', False):
3485                 continue
3486             t['name'] = cls.ie_key()
3487             yield t
3488         if getattr(cls, '__wrapped__', None):
3489             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3490
3491     @classmethod
3492     def get_webpage_testcases(cls):
3493         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3494         for t in tests:
3495             t['name'] = cls.ie_key()
3496             yield t
3497         if getattr(cls, '__wrapped__', None):
3498             yield from cls.__wrapped__.get_webpage_testcases()
3499
3500     @classproperty(cache=True)
3501     def age_limit(cls):
3502         """Get age limit from the testcases"""
3503         return max(traverse_obj(
3504             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3505             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3506
3507     @classproperty(cache=True)
3508     def _RETURN_TYPE(cls):
3509         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3510         tests = tuple(cls.get_testcases(include_onlymatching=False))
3511         if not tests:
3512             return None
3513         elif not any(k.startswith('playlist') for test in tests for k in test):
3514             return 'video'
3515         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3516             return 'playlist'
3517         return 'any'
3518
3519     @classmethod
3520     def is_single_video(cls, url):
3521         """Returns whether the URL is of a single video, None if unknown"""
3522         if cls.suitable(url):
3523             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3524
3525     @classmethod
3526     def is_suitable(cls, age_limit):
3527         """Test whether the extractor is generally suitable for the given age limit"""
3528         return not age_restricted(cls.age_limit, age_limit)
3529
3530     @classmethod
3531     def description(cls, *, markdown=True, search_examples=None):
3532         """Description of the extractor"""
3533         desc = ''
3534         if cls._NETRC_MACHINE:
3535             if markdown:
3536                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3537             else:
3538                 desc += f' [{cls._NETRC_MACHINE}]'
3539         if cls.IE_DESC is False:
3540             desc += ' [HIDDEN]'
3541         elif cls.IE_DESC:
3542             desc += f' {cls.IE_DESC}'
3543         if cls.SEARCH_KEY:
3544             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3545             if search_examples:
3546                 _COUNTS = ('', '5', '10', 'all')
3547                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3548         if not cls.working():
3549             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3550
3551         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3552         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3553         return f'{name}:{desc}' if desc else name
3554
3555     def extract_subtitles(self, *args, **kwargs):
3556         if (self.get_param('writesubtitles', False)
3557                 or self.get_param('listsubtitles')):
3558             return self._get_subtitles(*args, **kwargs)
3559         return {}
3560
3561     def _get_subtitles(self, *args, **kwargs):
3562         raise NotImplementedError('This method must be implemented by subclasses')
3563
3564     class CommentsDisabled(Exception):
3565         """Raise in _get_comments if comments are disabled for the video"""
3566
3567     def extract_comments(self, *args, **kwargs):
3568         if not self.get_param('getcomments'):
3569             return None
3570         generator = self._get_comments(*args, **kwargs)
3571
3572         def extractor():
3573             comments = []
3574             interrupted = True
3575             try:
3576                 while True:
3577                     comments.append(next(generator))
3578             except StopIteration:
3579                 interrupted = False
3580             except KeyboardInterrupt:
3581                 self.to_screen('Interrupted by user')
3582             except self.CommentsDisabled:
3583                 return {'comments': None, 'comment_count': None}
3584             except Exception as e:
3585                 if self.get_param('ignoreerrors') is not True:
3586                     raise
3587                 self._downloader.report_error(e)
3588             comment_count = len(comments)
3589             self.to_screen(f'Extracted {comment_count} comments')
3590             return {
3591                 'comments': comments,
3592                 'comment_count': None if interrupted else comment_count
3593             }
3594         return extractor
3595
3596     def _get_comments(self, *args, **kwargs):
3597         raise NotImplementedError('This method must be implemented by subclasses')
3598
3599     @staticmethod
3600     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3601         """ Merge subtitle items for one language. Items with duplicated URLs/data
3602         will be dropped. """
3603         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3604         ret = list(subtitle_list1)
3605         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3606         return ret
3607
3608     @classmethod
3609     def _merge_subtitles(cls, *dicts, target=None):
3610         """ Merge subtitle dictionaries, language by language. """
3611         if target is None:
3612             target = {}
3613         for d in dicts:
3614             for lang, subs in d.items():
3615                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3616         return target
3617
3618     def extract_automatic_captions(self, *args, **kwargs):
3619         if (self.get_param('writeautomaticsub', False)
3620                 or self.get_param('listsubtitles')):
3621             return self._get_automatic_captions(*args, **kwargs)
3622         return {}
3623
3624     def _get_automatic_captions(self, *args, **kwargs):
3625         raise NotImplementedError('This method must be implemented by subclasses')
3626
3627     @functools.cached_property
3628     def _cookies_passed(self):
3629         """Whether cookies have been passed to YoutubeDL"""
3630         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3631
3632     def mark_watched(self, *args, **kwargs):
3633         if not self.get_param('mark_watched', False):
3634             return
3635         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3636             self._mark_watched(*args, **kwargs)
3637
3638     def _mark_watched(self, *args, **kwargs):
3639         raise NotImplementedError('This method must be implemented by subclasses')
3640
3641     def geo_verification_headers(self):
3642         headers = {}
3643         geo_verification_proxy = self.get_param('geo_verification_proxy')
3644         if geo_verification_proxy:
3645             headers['Ytdl-request-proxy'] = geo_verification_proxy
3646         return headers
3647
3648     @staticmethod
3649     def _generic_id(url):
3650         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3651
3652     def _generic_title(self, url='', webpage='', *, default=None):
3653         return (self._og_search_title(webpage, default=None)
3654                 or self._html_extract_title(webpage, default=None)
3655                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3656                 or default)
3657
3658     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3659         if not duration:
3660             return
3661         chapter_list = [{
3662             'start_time': start_function(chapter),
3663             'title': title_function(chapter),
3664         } for chapter in chapter_list or []]
3665         if strict:
3666             warn = self.report_warning
3667         else:
3668             warn = self.write_debug
3669             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3670
3671         chapters = [{'start_time': 0}]
3672         for idx, chapter in enumerate(chapter_list):
3673             if chapter['start_time'] is None:
3674                 warn(f'Incomplete chapter {idx}')
3675             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3676                 chapters.append(chapter)
3677             elif chapter not in chapters:
3678                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3679                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3680                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3681         return chapters[1:]
3682
3683     def _extract_chapters_from_description(self, description, duration):
3684         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3685         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3686         return self._extract_chapters_helper(
3687             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3688             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3689             duration=duration, strict=False) or self._extract_chapters_helper(
3690             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3691             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3692             duration=duration, strict=False)
3693
3694     @staticmethod
3695     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3696         all_known = all(map(
3697             lambda x: x is not None,
3698             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3699         return (
3700             'private' if is_private
3701             else 'premium_only' if needs_premium
3702             else 'subscriber_only' if needs_subscription
3703             else 'needs_auth' if needs_auth
3704             else 'unlisted' if is_unlisted
3705             else 'public' if all_known
3706             else None)
3707
3708     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3709         '''
3710         @returns            A list of values for the extractor argument given by "key"
3711                             or "default" if no such key is present
3712         @param default      The default value to return when the key is not present (default: [])
3713         @param casesense    When false, the values are converted to lower case
3714         '''
3715         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3716         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3717         if val is None:
3718             return [] if default is NO_DEFAULT else default
3719         return list(val) if casesense else [x.lower() for x in val]
3720
3721     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3722         if not playlist_id or not video_id:
3723             return not video_id
3724
3725         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3726         if no_playlist is not None:
3727             return not no_playlist
3728
3729         video_id = '' if video_id is True else f' {video_id}'
3730         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3731         if self.get_param('noplaylist'):
3732             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3733             return False
3734         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3735         return True
3736
3737     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3738         RetryManager.report_retry(
3739             err, _count or int(fatal), _retries,
3740             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3741             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3742
3743     def RetryManager(self, **kwargs):
3744         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3745
3746     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3747         display_id = traverse_obj(info_dict, 'display_id', 'id')
3748         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3749         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3750             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3751
3752     @classmethod
3753     def extract_from_webpage(cls, ydl, url, webpage):
3754         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3755               else ydl.get_info_extractor(cls.ie_key()))
3756         for info in ie._extract_from_webpage(url, webpage) or []:
3757             # url = None since we do not want to set (webpage/original)_url
3758             ydl.add_default_extra_info(info, ie, None)
3759             yield info
3760
3761     @classmethod
3762     def _extract_from_webpage(cls, url, webpage):
3763         for embed_url in orderedSet(
3764                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3765             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3766
3767     @classmethod
3768     def _extract_embed_urls(cls, url, webpage):
3769         """@returns all the embed urls on the webpage"""
3770         if '_EMBED_URL_RE' not in cls.__dict__:
3771             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3772             for idx, regex in enumerate(cls._EMBED_REGEX):
3773                 assert regex.count('(?P<url>') == 1, \
3774                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3775             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3776
3777         for regex in cls._EMBED_URL_RE:
3778             for mobj in regex.finditer(webpage):
3779                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3780                 if cls._VALID_URL is False or cls.suitable(embed_url):
3781                     yield embed_url
3782
3783     class StopExtraction(Exception):
3784         pass
3785
3786     @classmethod
3787     def _extract_url(cls, webpage):  # TODO: Remove
3788         """Only for compatibility with some older extractors"""
3789         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3790
3791     @classmethod
3792     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3793         if plugin_name:
3794             mro = inspect.getmro(cls)
3795             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3796             cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3797             cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3798             while getattr(super_class, '__wrapped__', None):
3799                 super_class = super_class.__wrapped__
3800             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3801             _PLUGIN_OVERRIDES[super_class].append(cls)
3802
3803         return super().__init_subclass__(**kwargs)
3804
3805
3806 class SearchInfoExtractor(InfoExtractor):
3807     """
3808     Base class for paged search queries extractors.
3809     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3810     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3811     """
3812
3813     _MAX_RESULTS = float('inf')
3814     _RETURN_TYPE = 'playlist'
3815
3816     @classproperty
3817     def _VALID_URL(cls):
3818         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3819
3820     def _real_extract(self, query):
3821         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3822         if prefix == '':
3823             return self._get_n_results(query, 1)
3824         elif prefix == 'all':
3825             return self._get_n_results(query, self._MAX_RESULTS)
3826         else:
3827             n = int(prefix)
3828             if n <= 0:
3829                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3830             elif n > self._MAX_RESULTS:
3831                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3832                 n = self._MAX_RESULTS
3833             return self._get_n_results(query, n)
3834
3835     def _get_n_results(self, query, n):
3836         """Get a specified number of results for a query.
3837         Either this function or _search_results must be overridden by subclasses """
3838         return self.playlist_result(
3839             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3840             query, query)
3841
3842     def _search_results(self, query):
3843         """Returns an iterator of search results"""
3844         raise NotImplementedError('This method must be implemented by subclasses')
3845
3846     @classproperty
3847     def SEARCH_KEY(cls):
3848         return cls._SEARCH_KEY
3849
3850
3851 class UnsupportedURLIE(InfoExtractor):
3852     _VALID_URL = '.*'
3853     _ENABLED = False
3854     IE_DESC = False
3855
3856     def _real_extract(self, url):
3857         raise UnsupportedError(url)
3858
3859
3860 _PLUGIN_OVERRIDES = collections.defaultdict(list)