yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader.f4m import get_base_url, remove_encrypted_media
  27 from ..utils import (
  28     IDENTITY,
  29     JSON_LD_RE,
  30     NO_DEFAULT,
  31     ExtractorError,
  32     FormatSorter,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     HEADRequest,
  36     LenientJSONDecoder,
  37     RegexNotFoundError,
  38     RetryManager,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     classproperty,
  44     clean_html,
  45     deprecation_warning,
  46     determine_ext,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     filter_dict,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     int_or_none,
  56     join_nonempty,
  57     js_to_json,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     sanitize_filename,
  68     sanitize_url,
  69     sanitized_Request,
  70     smuggle_url,
  71     str_or_none,
  72     str_to_int,
  73     strip_or_none,
  74     traverse_obj,
  75     truncate_string,
  76     try_call,
  77     try_get,
  78     unescapeHTML,
  79     unified_strdate,
  80     unified_timestamp,
  81     update_Request,
  82     update_url_query,
  83     url_basename,
  84     url_or_none,
  85     urlhandle_detect_ext,
  86     urljoin,
  87     variadic,
  88     xpath_element,
  89     xpath_text,
  90     xpath_with_ns,
  91 )
  92
  93
  94 class InfoExtractor:
  95     """Information Extractor class.
  96
  97     Information extractors are the classes that, given a URL, extract
  98     information about the video (or videos) the URL refers to. This
  99     information includes the real video URL, the video title, author and
 100     others. The information is stored in a dictionary which is then
 101     passed to the YoutubeDL. The YoutubeDL processes this
 102     information possibly downloading the video to the file system, among
 103     other possible outcomes.
 104
 105     The type field determines the type of the result.
 106     By far the most common value (and the default if _type is missing) is
 107     "video", which indicates a single video.
 108
 109     For a video, the dictionaries must include the following fields:
 110
 111     id:             Video identifier.
 112     title:          Video title, unescaped. Set to an empty string if video has
 113                     no title as opposed to "None" which signifies that the
 114                     extractor failed to obtain a title
 115
 116     Additionally, it must contain either a formats entry or a url one:
 117
 118     formats:        A list of dictionaries for each format available, ordered
 119                     from worst to best quality.
 120
 121                     Potential fields:
 122                     * url        The mandatory URL representing the media:
 123                                    for plain file media - HTTP URL of this file,
 124                                    for RTMP - RTMP URL,
 125                                    for HLS - URL of the M3U8 media playlist,
 126                                    for HDS - URL of the F4M manifest,
 127                                    for DASH
 128                                      - HTTP URL to plain file media (in case of
 129                                        unfragmented media)
 130                                      - URL of the MPD manifest or base URL
 131                                        representing the media if MPD manifest
 132                                        is parsed from a string (in case of
 133                                        fragmented media)
 134                                    for MSS - URL of the ISM manifest.
 135                     * request_data  Data to send in POST request to the URL
 136                     * manifest_url
 137                                  The URL of the manifest file in case of
 138                                  fragmented media:
 139                                    for HLS - URL of the M3U8 master playlist,
 140                                    for HDS - URL of the F4M manifest,
 141                                    for DASH - URL of the MPD manifest,
 142                                    for MSS - URL of the ISM manifest.
 143                     * manifest_stream_number  (For internal use only)
 144                                  The index of the stream in the manifest file
 145                     * ext        Will be calculated from URL if missing
 146                     * format     A human-readable description of the format
 147                                  ("mp4 container with h264/opus").
 148                                  Calculated from the format_id, width, height.
 149                                  and format_note fields if missing.
 150                     * format_id  A short description of the format
 151                                  ("mp4_h264_opus" or "19").
 152                                 Technically optional, but strongly recommended.
 153                     * format_note Additional info about the format
 154                                  ("3D" or "DASH video")
 155                     * width      Width of the video, if known
 156                     * height     Height of the video, if known
 157                     * aspect_ratio  Aspect ratio of the video, if known
 158                                  Automatically calculated from width and height
 159                     * resolution Textual description of width and height
 160                                  Automatically calculated from width and height
 161                     * dynamic_range The dynamic range of the video. One of:
 162                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 163                     * tbr        Average bitrate of audio and video in KBit/s
 164                     * abr        Average audio bitrate in KBit/s
 165                     * acodec     Name of the audio codec in use
 166                     * asr        Audio sampling rate in Hertz
 167                     * audio_channels  Number of audio channels
 168                     * vbr        Average video bitrate in KBit/s
 169                     * fps        Frame rate
 170                     * vcodec     Name of the video codec in use
 171                     * container  Name of the container format
 172                     * filesize   The number of bytes, if known in advance
 173                     * filesize_approx  An estimate for the number of bytes
 174                     * player_url SWF Player URL (used for rtmpdump).
 175                     * protocol   The protocol that will be used for the actual
 176                                  download, lower-case. One of "http", "https" or
 177                                  one of the protocols defined in downloader.PROTOCOL_MAP
 178                     * fragment_base_url
 179                                  Base URL for fragments. Each fragment's path
 180                                  value (if present) will be relative to
 181                                  this URL.
 182                     * fragments  A list of fragments of a fragmented media.
 183                                  Each fragment entry must contain either an url
 184                                  or a path. If an url is present it should be
 185                                  considered by a client. Otherwise both path and
 186                                  fragment_base_url must be present. Here is
 187                                  the list of all potential fields:
 188                                  * "url" - fragment's URL
 189                                  * "path" - fragment's path relative to
 190                                             fragment_base_url
 191                                  * "duration" (optional, int or float)
 192                                  * "filesize" (optional, int)
 193                     * is_from_start  Is a live format that can be downloaded
 194                                 from the start. Boolean
 195                     * preference Order number of this format. If this field is
 196                                  present and not None, the formats get sorted
 197                                  by this field, regardless of all other values.
 198                                  -1 for default (order by other properties),
 199                                  -2 or smaller for less than default.
 200                                  < -1000 to hide the format (if there is
 201                                     another one which is strictly better)
 202                     * language   Language code, e.g. "de" or "en-US".
 203                     * language_preference  Is this in the language mentioned in
 204                                  the URL?
 205                                  10 if it's what the URL is about,
 206                                  -1 for default (don't know),
 207                                  -10 otherwise, other values reserved for now.
 208                     * quality    Order number of the video quality of this
 209                                  format, irrespective of the file format.
 210                                  -1 for default (order by other properties),
 211                                  -2 or smaller for less than default.
 212                     * source_preference  Order number for this video source
 213                                   (quality takes higher priority)
 214                                  -1 for default (order by other properties),
 215                                  -2 or smaller for less than default.
 216                     * http_headers  A dictionary of additional HTTP headers
 217                                  to add to the request.
 218                     * stretched_ratio  If given and not 1, indicates that the
 219                                  video's pixels are not square.
 220                                  width : height ratio as float.
 221                     * no_resume  The server does not support resuming the
 222                                  (HTTP or RTMP) download. Boolean.
 223                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 224                     * extra_param_to_segment_url  A query string to append to each
 225                                  fragment's URL, or to update each existing query string
 226                                  with. Only applied by the native HLS/DASH downloaders.
 227                     * hls_aes    A dictionary of HLS AES-128 decryption information
 228                                  used by the native HLS downloader to override the
 229                                  values in the media playlist when an '#EXT-X-KEY' tag
 230                                  is present in the playlist:
 231                                  * uri  The URI from which the key will be downloaded
 232                                  * key  The key (as hex) used to decrypt fragments.
 233                                         If `key` is given, any key URI will be ignored
 234                                  * iv   The IV (as hex) used to decrypt fragments
 235                     * downloader_options  A dictionary of downloader options
 236                                  (For internal use only)
 237                                  * http_chunk_size Chunk size for HTTP downloads
 238                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 239                     RTMP formats can also have the additional fields: page_url,
 240                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 241                     rtmp_protocol, rtmp_real_time
 242
 243     url:            Final video URL.
 244     ext:            Video filename extension.
 245     format:         The video format, defaults to ext (used for --get-format)
 246     player_url:     SWF Player URL (used for rtmpdump).
 247
 248     The following fields are optional:
 249
 250     direct:         True if a direct video file was given (must only be set by GenericIE)
 251     alt_title:      A secondary title of the video.
 252     display_id      An alternative identifier for the video, not necessarily
 253                     unique, but available before title. Typically, id is
 254                     something like "4234987", title "Dancing naked mole rats",
 255                     and display_id "dancing-naked-mole-rats"
 256     thumbnails:     A list of dictionaries, with the following entries:
 257                         * "id" (optional, string) - Thumbnail format ID
 258                         * "url"
 259                         * "preference" (optional, int) - quality of the image
 260                         * "width" (optional, int)
 261                         * "height" (optional, int)
 262                         * "resolution" (optional, string "{width}x{height}",
 263                                         deprecated)
 264                         * "filesize" (optional, int)
 265                         * "http_headers" (dict) - HTTP headers for the request
 266     thumbnail:      Full URL to a video thumbnail image.
 267     description:    Full video description.
 268     uploader:       Full name of the video uploader.
 269     license:        License name the video is licensed under.
 270     creator:        The creator of the video.
 271     timestamp:      UNIX timestamp of the moment the video was uploaded
 272     upload_date:    Video upload date in UTC (YYYYMMDD).
 273                     If not explicitly set, calculated from timestamp
 274     release_timestamp: UNIX timestamp of the moment the video was released.
 275                     If it is not clear whether to use timestamp or this, use the former
 276     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 277                     If not explicitly set, calculated from release_timestamp
 278     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 279     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 280                     If not explicitly set, calculated from modified_timestamp
 281     uploader_id:    Nickname or id of the video uploader.
 282     uploader_url:   Full URL to a personal webpage of the video uploader.
 283     channel:        Full name of the channel the video is uploaded on.
 284                     Note that channel fields may or may not repeat uploader
 285                     fields. This depends on a particular extractor.
 286     channel_id:     Id of the channel.
 287     channel_url:    Full URL to a channel webpage.
 288     channel_follower_count: Number of followers of the channel.
 289     location:       Physical location where the video was filmed.
 290     subtitles:      The available subtitles as a dictionary in the format
 291                     {tag: subformats}. "tag" is usually a language code, and
 292                     "subformats" is a list sorted from lower to higher
 293                     preference, each element is a dictionary with the "ext"
 294                     entry and one of:
 295                         * "data": The subtitles file contents
 296                         * "url": A URL pointing to the subtitles file
 297                     It can optionally also have:
 298                         * "name": Name or description of the subtitles
 299                         * "http_headers": A dictionary of additional HTTP headers
 300                                   to add to the request.
 301                     "ext" will be calculated from URL if missing
 302     automatic_captions: Like 'subtitles'; contains automatically generated
 303                     captions instead of normal subtitles
 304     duration:       Length of the video in seconds, as an integer or float.
 305     view_count:     How many users have watched the video on the platform.
 306     concurrent_view_count: How many users are currently watching the video on the platform.
 307     like_count:     Number of positive ratings of the video
 308     dislike_count:  Number of negative ratings of the video
 309     repost_count:   Number of reposts of the video
 310     average_rating: Average rating give by users, the scale used depends on the webpage
 311     comment_count:  Number of comments on the video
 312     comments:       A list of comments, each with one or more of the following
 313                     properties (all but one of text or html optional):
 314                         * "author" - human-readable name of the comment author
 315                         * "author_id" - user ID of the comment author
 316                         * "author_thumbnail" - The thumbnail of the comment author
 317                         * "author_url" - The url to the comment author's page
 318                         * "author_is_verified" - Whether the author is verified
 319                                                  on the platform
 320                         * "author_is_uploader" - Whether the comment is made by
 321                                                  the video uploader
 322                         * "id" - Comment ID
 323                         * "html" - Comment as HTML
 324                         * "text" - Plain text of the comment
 325                         * "timestamp" - UNIX timestamp of comment
 326                         * "parent" - ID of the comment this one is replying to.
 327                                      Set to "root" to indicate that this is a
 328                                      comment to the original video.
 329                         * "like_count" - Number of positive ratings of the comment
 330                         * "dislike_count" - Number of negative ratings of the comment
 331                         * "is_favorited" - Whether the comment is marked as
 332                                            favorite by the video uploader
 333                         * "is_pinned" - Whether the comment is pinned to
 334                                         the top of the comments
 335     age_limit:      Age restriction for the video, as an integer (years)
 336     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 337                     should allow to get the same result again. (It will be set
 338                     by YoutubeDL if it's missing)
 339     categories:     A list of categories that the video falls in, for example
 340                     ["Sports", "Berlin"]
 341     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 342     cast:           A list of the video cast
 343     is_live:        True, False, or None (=unknown). Whether this video is a
 344                     live stream that goes on instead of a fixed-length video.
 345     was_live:       True, False, or None (=unknown). Whether this video was
 346                     originally a live stream.
 347     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 348                     or 'post_live' (was live, but VOD is not yet processed)
 349                     If absent, automatically set from is_live, was_live
 350     start_time:     Time in seconds where the reproduction should start, as
 351                     specified in the URL.
 352     end_time:       Time in seconds where the reproduction should end, as
 353                     specified in the URL.
 354     chapters:       A list of dictionaries, with the following entries:
 355                         * "start_time" - The start time of the chapter in seconds
 356                         * "end_time" - The end time of the chapter in seconds
 357                         * "title" (optional, string)
 358     heatmap:        A list of dictionaries, with the following entries:
 359                         * "start_time" - The start time of the data point in seconds
 360                         * "end_time" - The end time of the data point in seconds
 361                         * "value" - The normalized value of the data point (float between 0 and 1)
 362     playable_in_embed: Whether this video is allowed to play in embedded
 363                     players on other sites. Can be True (=always allowed),
 364                     False (=never allowed), None (=unknown), or a string
 365                     specifying the criteria for embedability; e.g. 'whitelist'
 366     availability:   Under what condition the video is available. One of
 367                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 368                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 369                     to set it
 370     _old_archive_ids: A list of old archive ids needed for backward compatibility
 371     _format_sort_fields: A list of fields to use for sorting formats
 372     __post_extractor: A function to be called just before the metadata is
 373                     written to either disk, logger or console. The function
 374                     must return a dict which will be added to the info_dict.
 375                     This is usefull for additional information that is
 376                     time-consuming to extract. Note that the fields thus
 377                     extracted will not be available to output template and
 378                     match_filter. So, only "comments" and "comment_count" are
 379                     currently allowed to be extracted via this method.
 380
 381     The following fields should only be used when the video belongs to some logical
 382     chapter or section:
 383
 384     chapter:        Name or title of the chapter the video belongs to.
 385     chapter_number: Number of the chapter the video belongs to, as an integer.
 386     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 387
 388     The following fields should only be used when the video is an episode of some
 389     series, programme or podcast:
 390
 391     series:         Title of the series or programme the video episode belongs to.
 392     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 393     season:         Title of the season the video episode belongs to.
 394     season_number:  Number of the season the video episode belongs to, as an integer.
 395     season_id:      Id of the season the video episode belongs to, as a unicode string.
 396     episode:        Title of the video episode. Unlike mandatory video title field,
 397                     this field should denote the exact title of the video episode
 398                     without any kind of decoration.
 399     episode_number: Number of the video episode within a season, as an integer.
 400     episode_id:     Id of the video episode, as a unicode string.
 401
 402     The following fields should only be used when the media is a track or a part of
 403     a music album:
 404
 405     track:          Title of the track.
 406     track_number:   Number of the track within an album or a disc, as an integer.
 407     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 408                     as a unicode string.
 409     artist:         Artist(s) of the track.
 410     genre:          Genre(s) of the track.
 411     album:          Title of the album the track belongs to.
 412     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 413     album_artist:   List of all artists appeared on the album (e.g.
 414                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 415                     and compilations).
 416     disc_number:    Number of the disc or other physical medium the track belongs to,
 417                     as an integer.
 418     release_year:   Year (YYYY) when the album was released.
 419     composer:       Composer of the piece
 420
 421     The following fields should only be set for clips that should be cut from the original video:
 422
 423     section_start:  Start time of the section in seconds
 424     section_end:    End time of the section in seconds
 425
 426     The following fields should only be set for storyboards:
 427     rows:           Number of rows in each storyboard fragment, as an integer
 428     columns:        Number of columns in each storyboard fragment, as an integer
 429
 430     Unless mentioned otherwise, the fields should be Unicode strings.
 431
 432     Unless mentioned otherwise, None is equivalent to absence of information.
 433
 434
 435     _type "playlist" indicates multiple videos.
 436     There must be a key "entries", which is a list, an iterable, or a PagedList
 437     object, each element of which is a valid dictionary by this specification.
 438
 439     Additionally, playlists can have "id", "title", and any other relevant
 440     attributes with the same semantics as videos (see above).
 441
 442     It can also have the following optional fields:
 443
 444     playlist_count: The total number of videos in a playlist. If not given,
 445                     YoutubeDL tries to calculate it from "entries"
 446
 447
 448     _type "multi_video" indicates that there are multiple videos that
 449     form a single show, for examples multiple acts of an opera or TV episode.
 450     It must have an entries key like a playlist and contain all the keys
 451     required for a video at the same time.
 452
 453
 454     _type "url" indicates that the video must be extracted from another
 455     location, possibly by a different extractor. Its only required key is:
 456     "url" - the next URL to extract.
 457     The key "ie_key" can be set to the class name (minus the trailing "IE",
 458     e.g. "Youtube") if the extractor class is known in advance.
 459     Additionally, the dictionary may have any properties of the resolved entity
 460     known in advance, for example "title" if the title of the referred video is
 461     known ahead of time.
 462
 463
 464     _type "url_transparent" entities have the same specification as "url", but
 465     indicate that the given additional information is more precise than the one
 466     associated with the resolved URL.
 467     This is useful when a site employs a video service that hosts the video and
 468     its technical metadata, but that video service does not embed a useful
 469     title, description etc.
 470
 471
 472     Subclasses of this should also be added to the list of extractors and
 473     should define a _VALID_URL regexp and, re-define the _real_extract() and
 474     (optionally) _real_initialize() methods.
 475
 476     Subclasses may also override suitable() if necessary, but ensure the function
 477     signature is preserved and that this function imports everything it needs
 478     (except other extractors), so that lazy_extractors works correctly.
 479
 480     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 481     the HTML of Generic webpages. It may also override _extract_embed_urls
 482     or _extract_from_webpage as necessary. While these are normally classmethods,
 483     _extract_from_webpage is allowed to be an instance method.
 484
 485     _extract_from_webpage may raise self.StopExtraction() to stop further
 486     processing of the webpage and obtain exclusive rights to it. This is useful
 487     when the extractor cannot reliably be matched using just the URL,
 488     e.g. invidious/peertube instances
 489
 490     Embed-only extractors can be defined by setting _VALID_URL = False.
 491
 492     To support username + password (or netrc) login, the extractor must define a
 493     _NETRC_MACHINE and re-define _perform_login(username, password) and
 494     (optionally) _initialize_pre_login() methods. The _perform_login method will
 495     be called between _initialize_pre_login and _real_initialize if credentials
 496     are passed by the user. In cases where it is necessary to have the login
 497     process as part of the extraction rather than initialization, _perform_login
 498     can be left undefined.
 499
 500     _GEO_BYPASS attribute may be set to False in order to disable
 501     geo restriction bypass mechanisms for a particular extractor.
 502     Though it won't disable explicit geo restriction bypass based on
 503     country code provided with geo_bypass_country.
 504
 505     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 506     countries for this extractor. One of these countries will be used by
 507     geo restriction bypass mechanism right away in order to bypass
 508     geo restriction, of course, if the mechanism is not disabled.
 509
 510     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 511     IP blocks in CIDR notation for this extractor. One of these IP blocks
 512     will be used by geo restriction bypass mechanism similarly
 513     to _GEO_COUNTRIES.
 514
 515     The _ENABLED attribute should be set to False for IEs that
 516     are disabled by default and must be explicitly enabled.
 517
 518     The _WORKING attribute should be set to False for broken IEs
 519     in order to warn the users and skip the tests.
 520     """
 521
 522     _ready = False
 523     _downloader = None
 524     _x_forwarded_for_ip = None
 525     _GEO_BYPASS = True
 526     _GEO_COUNTRIES = None
 527     _GEO_IP_BLOCKS = None
 528     _WORKING = True
 529     _ENABLED = True
 530     _NETRC_MACHINE = None
 531     IE_DESC = None
 532     SEARCH_KEY = None
 533     _VALID_URL = None
 534     _EMBED_REGEX = []
 535
 536     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 537         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 538         return {
 539             None: '',
 540             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 541             'password': f'Use {password_hint}',
 542             'cookies': (
 543                 'Use --cookies-from-browser or --cookies for the authentication. '
 544                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 545         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 546
 547     def __init__(self, downloader=None):
 548         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 549         If a downloader is not passed during initialization,
 550         it must be set using "set_downloader()" before "extract()" is called"""
 551         self._ready = False
 552         self._x_forwarded_for_ip = None
 553         self._printed_messages = set()
 554         self.set_downloader(downloader)
 555
 556     @classmethod
 557     def _match_valid_url(cls, url):
 558         if cls._VALID_URL is False:
 559             return None
 560         # This does not use has/getattr intentionally - we want to know whether
 561         # we have cached the regexp for *this* class, whereas getattr would also
 562         # match the superclass
 563         if '_VALID_URL_RE' not in cls.__dict__:
 564             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 565         return cls._VALID_URL_RE.match(url)
 566
 567     @classmethod
 568     def suitable(cls, url):
 569         """Receives a URL and returns True if suitable for this IE."""
 570         # This function must import everything it needs (except other extractors),
 571         # so that lazy_extractors works correctly
 572         return cls._match_valid_url(url) is not None
 573
 574     @classmethod
 575     def _match_id(cls, url):
 576         return cls._match_valid_url(url).group('id')
 577
 578     @classmethod
 579     def get_temp_id(cls, url):
 580         try:
 581             return cls._match_id(url)
 582         except (IndexError, AttributeError):
 583             return None
 584
 585     @classmethod
 586     def working(cls):
 587         """Getter method for _WORKING."""
 588         return cls._WORKING
 589
 590     @classmethod
 591     def supports_login(cls):
 592         return bool(cls._NETRC_MACHINE)
 593
 594     def initialize(self):
 595         """Initializes an instance (authentication, etc)."""
 596         self._printed_messages = set()
 597         self._initialize_geo_bypass({
 598             'countries': self._GEO_COUNTRIES,
 599             'ip_blocks': self._GEO_IP_BLOCKS,
 600         })
 601         if not self._ready:
 602             self._initialize_pre_login()
 603             if self.supports_login():
 604                 username, password = self._get_login_info()
 605                 if username:
 606                     self._perform_login(username, password)
 607             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 608                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 609             self._real_initialize()
 610             self._ready = True
 611
 612     def _initialize_geo_bypass(self, geo_bypass_context):
 613         """
 614         Initialize geo restriction bypass mechanism.
 615
 616         This method is used to initialize geo bypass mechanism based on faking
 617         X-Forwarded-For HTTP header. A random country from provided country list
 618         is selected and a random IP belonging to this country is generated. This
 619         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 620         HTTP requests.
 621
 622         This method will be used for initial geo bypass mechanism initialization
 623         during the instance initialization with _GEO_COUNTRIES and
 624         _GEO_IP_BLOCKS.
 625
 626         You may also manually call it from extractor's code if geo bypass
 627         information is not available beforehand (e.g. obtained during
 628         extraction) or due to some other reason. In this case you should pass
 629         this information in geo bypass context passed as first argument. It may
 630         contain following fields:
 631
 632         countries:  List of geo unrestricted countries (similar
 633                     to _GEO_COUNTRIES)
 634         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 635                     (similar to _GEO_IP_BLOCKS)
 636
 637         """
 638         if not self._x_forwarded_for_ip:
 639
 640             # Geo bypass mechanism is explicitly disabled by user
 641             if not self.get_param('geo_bypass', True):
 642                 return
 643
 644             if not geo_bypass_context:
 645                 geo_bypass_context = {}
 646
 647             # Backward compatibility: previously _initialize_geo_bypass
 648             # expected a list of countries, some 3rd party code may still use
 649             # it this way
 650             if isinstance(geo_bypass_context, (list, tuple)):
 651                 geo_bypass_context = {
 652                     'countries': geo_bypass_context,
 653                 }
 654
 655             # The whole point of geo bypass mechanism is to fake IP
 656             # as X-Forwarded-For HTTP header based on some IP block or
 657             # country code.
 658
 659             # Path 1: bypassing based on IP block in CIDR notation
 660
 661             # Explicit IP block specified by user, use it right away
 662             # regardless of whether extractor is geo bypassable or not
 663             ip_block = self.get_param('geo_bypass_ip_block', None)
 664
 665             # Otherwise use random IP block from geo bypass context but only
 666             # if extractor is known as geo bypassable
 667             if not ip_block:
 668                 ip_blocks = geo_bypass_context.get('ip_blocks')
 669                 if self._GEO_BYPASS and ip_blocks:
 670                     ip_block = random.choice(ip_blocks)
 671
 672             if ip_block:
 673                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 674                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 675                 return
 676
 677             # Path 2: bypassing based on country code
 678
 679             # Explicit country code specified by user, use it right away
 680             # regardless of whether extractor is geo bypassable or not
 681             country = self.get_param('geo_bypass_country', None)
 682
 683             # Otherwise use random country code from geo bypass context but
 684             # only if extractor is known as geo bypassable
 685             if not country:
 686                 countries = geo_bypass_context.get('countries')
 687                 if self._GEO_BYPASS and countries:
 688                     country = random.choice(countries)
 689
 690             if country:
 691                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 692                 self._downloader.write_debug(
 693                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 694
 695     def extract(self, url):
 696         """Extracts URL information and returns it in list of dicts."""
 697         try:
 698             for _ in range(2):
 699                 try:
 700                     self.initialize()
 701                     self.to_screen('Extracting URL: %s' % (
 702                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 703                     ie_result = self._real_extract(url)
 704                     if ie_result is None:
 705                         return None
 706                     if self._x_forwarded_for_ip:
 707                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 708                     subtitles = ie_result.get('subtitles') or {}
 709                     if 'no-live-chat' in self.get_param('compat_opts'):
 710                         for lang in ('live_chat', 'comments', 'danmaku'):
 711                             subtitles.pop(lang, None)
 712                     return ie_result
 713                 except GeoRestrictedError as e:
 714                     if self.__maybe_fake_ip_and_retry(e.countries):
 715                         continue
 716                     raise
 717         except UnsupportedError:
 718             raise
 719         except ExtractorError as e:
 720             e.video_id = e.video_id or self.get_temp_id(url),
 721             e.ie = e.ie or self.IE_NAME,
 722             e.traceback = e.traceback or sys.exc_info()[2]
 723             raise
 724         except http.client.IncompleteRead as e:
 725             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 726         except (KeyError, StopIteration) as e:
 727             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 728
 729     def __maybe_fake_ip_and_retry(self, countries):
 730         if (not self.get_param('geo_bypass_country', None)
 731                 and self._GEO_BYPASS
 732                 and self.get_param('geo_bypass', True)
 733                 and not self._x_forwarded_for_ip
 734                 and countries):
 735             country_code = random.choice(countries)
 736             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 737             if self._x_forwarded_for_ip:
 738                 self.report_warning(
 739                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 740                     % (self._x_forwarded_for_ip, country_code.upper()))
 741                 return True
 742         return False
 743
 744     def set_downloader(self, downloader):
 745         """Sets a YoutubeDL instance as the downloader for this IE."""
 746         self._downloader = downloader
 747
 748     @property
 749     def cache(self):
 750         return self._downloader.cache
 751
 752     @property
 753     def cookiejar(self):
 754         return self._downloader.cookiejar
 755
 756     def _initialize_pre_login(self):
 757         """ Initialization before login. Redefine in subclasses."""
 758         pass
 759
 760     def _perform_login(self, username, password):
 761         """ Login with username and password. Redefine in subclasses."""
 762         pass
 763
 764     def _real_initialize(self):
 765         """Real initialization process. Redefine in subclasses."""
 766         pass
 767
 768     def _real_extract(self, url):
 769         """Real extraction process. Redefine in subclasses."""
 770         raise NotImplementedError('This method must be implemented by subclasses')
 771
 772     @classmethod
 773     def ie_key(cls):
 774         """A string for getting the InfoExtractor with get_info_extractor"""
 775         return cls.__name__[:-2]
 776
 777     @classproperty
 778     def IE_NAME(cls):
 779         return cls.__name__[:-2]
 780
 781     @staticmethod
 782     def __can_accept_status_code(err, expected_status):
 783         assert isinstance(err, urllib.error.HTTPError)
 784         if expected_status is None:
 785             return False
 786         elif callable(expected_status):
 787             return expected_status(err.code) is True
 788         else:
 789             return err.code in variadic(expected_status)
 790
 791     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 792         if isinstance(url_or_request, urllib.request.Request):
 793             return update_Request(url_or_request, data=data, headers=headers, query=query)
 794         if query:
 795             url_or_request = update_url_query(url_or_request, query)
 796         return sanitized_Request(url_or_request, data, headers or {})
 797
 798     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 799         """
 800         Return the response handle.
 801
 802         See _download_webpage docstring for arguments specification.
 803         """
 804         if not self._downloader._first_webpage_request:
 805             sleep_interval = self.get_param('sleep_interval_requests') or 0
 806             if sleep_interval > 0:
 807                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 808                 time.sleep(sleep_interval)
 809         else:
 810             self._downloader._first_webpage_request = False
 811
 812         if note is None:
 813             self.report_download_webpage(video_id)
 814         elif note is not False:
 815             if video_id is None:
 816                 self.to_screen(str(note))
 817             else:
 818                 self.to_screen(f'{video_id}: {note}')
 819
 820         # Some sites check X-Forwarded-For HTTP header in order to figure out
 821         # the origin of the client behind proxy. This allows bypassing geo
 822         # restriction by faking this header's value to IP that belongs to some
 823         # geo unrestricted country. We will do so once we encounter any
 824         # geo restriction error.
 825         if self._x_forwarded_for_ip:
 826             headers = (headers or {}).copy()
 827             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 828
 829         try:
 830             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 831         except network_exceptions as err:
 832             if isinstance(err, urllib.error.HTTPError):
 833                 if self.__can_accept_status_code(err, expected_status):
 834                     # Retain reference to error to prevent file object from
 835                     # being closed before it can be read. Works around the
 836                     # effects of <https://bugs.python.org/issue15002>
 837                     # introduced in Python 3.4.1.
 838                     err.fp._error = err
 839                     return err.fp
 840
 841             if errnote is False:
 842                 return False
 843             if errnote is None:
 844                 errnote = 'Unable to download webpage'
 845
 846             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 847             if fatal:
 848                 raise ExtractorError(errmsg, cause=err)
 849             else:
 850                 self.report_warning(errmsg)
 851                 return False
 852
 853     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 854                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 855         """
 856         Return a tuple (page content as string, URL handle).
 857
 858         Arguments:
 859         url_or_request -- plain text URL as a string or
 860             a urllib.request.Request object
 861         video_id -- Video/playlist/item identifier (string)
 862
 863         Keyword arguments:
 864         note -- note printed before downloading (string)
 865         errnote -- note printed in case of an error (string)
 866         fatal -- flag denoting whether error should be considered fatal,
 867             i.e. whether it should cause ExtractionError to be raised,
 868             otherwise a warning will be reported and extraction continued
 869         encoding -- encoding for a page content decoding, guessed automatically
 870             when not explicitly specified
 871         data -- POST data (bytes)
 872         headers -- HTTP headers (dict)
 873         query -- URL query (dict)
 874         expected_status -- allows to accept failed HTTP requests (non 2xx
 875             status code) by explicitly specifying a set of accepted status
 876             codes. Can be any of the following entities:
 877                 - an integer type specifying an exact failed status code to
 878                   accept
 879                 - a list or a tuple of integer types specifying a list of
 880                   failed status codes to accept
 881                 - a callable accepting an actual failed status code and
 882                   returning True if it should be accepted
 883             Note that this argument does not affect success status codes (2xx)
 884             which are always accepted.
 885         """
 886
 887         # Strip hashes from the URL (#1038)
 888         if isinstance(url_or_request, str):
 889             url_or_request = url_or_request.partition('#')[0]
 890
 891         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 892         if urlh is False:
 893             assert not fatal
 894             return False
 895         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 896         return (content, urlh)
 897
 898     @staticmethod
 899     def _guess_encoding_from_content(content_type, webpage_bytes):
 900         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 901         if m:
 902             encoding = m.group(1)
 903         else:
 904             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 905                           webpage_bytes[:1024])
 906             if m:
 907                 encoding = m.group(1).decode('ascii')
 908             elif webpage_bytes.startswith(b'\xff\xfe'):
 909                 encoding = 'utf-16'
 910             else:
 911                 encoding = 'utf-8'
 912
 913         return encoding
 914
 915     def __check_blocked(self, content):
 916         first_block = content[:512]
 917         if ('<title>Access to this site is blocked</title>' in content
 918                 and 'Websense' in first_block):
 919             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 920             blocked_iframe = self._html_search_regex(
 921                 r'<iframe src="([^"]+)"', content,
 922                 'Websense information URL', default=None)
 923             if blocked_iframe:
 924                 msg += ' Visit %s for more details' % blocked_iframe
 925             raise ExtractorError(msg, expected=True)
 926         if '<title>The URL you requested has been blocked</title>' in first_block:
 927             msg = (
 928                 'Access to this webpage has been blocked by Indian censorship. '
 929                 'Use a VPN or proxy server (with --proxy) to route around it.')
 930             block_msg = self._html_search_regex(
 931                 r'</h1><p>(.*?)</p>',
 932                 content, 'block message', default=None)
 933             if block_msg:
 934                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 935             raise ExtractorError(msg, expected=True)
 936         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 937                 and 'blocklist.rkn.gov.ru' in content):
 938             raise ExtractorError(
 939                 'Access to this webpage has been blocked by decision of the Russian government. '
 940                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 941                 expected=True)
 942
 943     def _request_dump_filename(self, url, video_id):
 944         basen = f'{video_id}_{url}'
 945         trim_length = self.get_param('trim_file_name') or 240
 946         if len(basen) > trim_length:
 947             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 948             basen = basen[:trim_length - len(h)] + h
 949         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 950         # Working around MAX_PATH limitation on Windows (see
 951         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 952         if compat_os_name == 'nt':
 953             absfilepath = os.path.abspath(filename)
 954             if len(absfilepath) > 259:
 955                 filename = fR'\\?\{absfilepath}'
 956         return filename
 957
 958     def __decode_webpage(self, webpage_bytes, encoding, headers):
 959         if not encoding:
 960             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 961         try:
 962             return webpage_bytes.decode(encoding, 'replace')
 963         except LookupError:
 964             return webpage_bytes.decode('utf-8', 'replace')
 965
 966     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 967         webpage_bytes = urlh.read()
 968         if prefix is not None:
 969             webpage_bytes = prefix + webpage_bytes
 970         if self.get_param('dump_intermediate_pages', False):
 971             self.to_screen('Dumping request to ' + urlh.geturl())
 972             dump = base64.b64encode(webpage_bytes).decode('ascii')
 973             self._downloader.to_screen(dump)
 974         if self.get_param('write_pages'):
 975             filename = self._request_dump_filename(urlh.geturl(), video_id)
 976             self.to_screen(f'Saving request to {filename}')
 977             with open(filename, 'wb') as outf:
 978                 outf.write(webpage_bytes)
 979
 980         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 981         self.__check_blocked(content)
 982
 983         return content
 984
 985     def __print_error(self, errnote, fatal, video_id, err):
 986         if fatal:
 987             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 988         elif errnote:
 989             self.report_warning(f'{video_id}: {errnote}: {err}')
 990
 991     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 992         if transform_source:
 993             xml_string = transform_source(xml_string)
 994         try:
 995             return compat_etree_fromstring(xml_string.encode('utf-8'))
 996         except xml.etree.ElementTree.ParseError as ve:
 997             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 998
 999     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1000         try:
1001             return json.loads(
1002                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1003         except ValueError as ve:
1004             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1005
1006     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1007         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1008
1009     def __create_download_methods(name, parser, note, errnote, return_value):
1010
1011         def parse(ie, content, *args, errnote=errnote, **kwargs):
1012             if parser is None:
1013                 return content
1014             if errnote is False:
1015                 kwargs['errnote'] = errnote
1016             # parser is fetched by name so subclasses can override it
1017             return getattr(ie, parser)(content, *args, **kwargs)
1018
1019         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1020                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1021             res = self._download_webpage_handle(
1022                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1023                 data=data, headers=headers, query=query, expected_status=expected_status)
1024             if res is False:
1025                 return res
1026             content, urlh = res
1027             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1028
1029         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1030                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1031             if self.get_param('load_pages'):
1032                 url_or_request = self._create_request(url_or_request, data, headers, query)
1033                 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1034                 self.to_screen(f'Loading request from {filename}')
1035                 try:
1036                     with open(filename, 'rb') as dumpf:
1037                         webpage_bytes = dumpf.read()
1038                 except OSError as e:
1039                     self.report_warning(f'Unable to load request from disk: {e}')
1040                 else:
1041                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1042                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1043             kwargs = {
1044                 'note': note,
1045                 'errnote': errnote,
1046                 'transform_source': transform_source,
1047                 'fatal': fatal,
1048                 'encoding': encoding,
1049                 'data': data,
1050                 'headers': headers,
1051                 'query': query,
1052                 'expected_status': expected_status,
1053             }
1054             if parser is None:
1055                 kwargs.pop('transform_source')
1056             # The method is fetched by name so subclasses can override _download_..._handle
1057             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1058             return res if res is False else res[0]
1059
1060         def impersonate(func, name, return_value):
1061             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1062             func.__doc__ = f'''
1063                 @param transform_source     Apply this transformation before parsing
1064                 @returns                    {return_value}
1065
1066                 See _download_webpage_handle docstring for other arguments specification
1067             '''
1068
1069         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1070         impersonate(download_content, f'_download_{name}', f'{return_value}')
1071         return download_handle, download_content
1072
1073     _download_xml_handle, _download_xml = __create_download_methods(
1074         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1075     _download_json_handle, _download_json = __create_download_methods(
1076         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1077     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1078         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1079     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1080
1081     def _download_webpage(
1082             self, url_or_request, video_id, note=None, errnote=None,
1083             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1084         """
1085         Return the data of the page as a string.
1086
1087         Keyword arguments:
1088         tries -- number of tries
1089         timeout -- sleep interval between tries
1090
1091         See _download_webpage_handle docstring for other arguments specification.
1092         """
1093
1094         R''' # NB: These are unused; should they be deprecated?
1095         if tries != 1:
1096             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1097         if timeout is NO_DEFAULT:
1098             timeout = 5
1099         else:
1100             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1101         '''
1102
1103         try_count = 0
1104         while True:
1105             try:
1106                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1107             except http.client.IncompleteRead as e:
1108                 try_count += 1
1109                 if try_count >= tries:
1110                     raise e
1111                 self._sleep(timeout, video_id)
1112
1113     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1114         idstr = format_field(video_id, None, '%s: ')
1115         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1116         if only_once:
1117             if f'WARNING: {msg}' in self._printed_messages:
1118                 return
1119             self._printed_messages.add(f'WARNING: {msg}')
1120         self._downloader.report_warning(msg, *args, **kwargs)
1121
1122     def to_screen(self, msg, *args, **kwargs):
1123         """Print msg to screen, prefixing it with '[ie_name]'"""
1124         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1125
1126     def write_debug(self, msg, *args, **kwargs):
1127         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1128
1129     def get_param(self, name, default=None, *args, **kwargs):
1130         if self._downloader:
1131             return self._downloader.params.get(name, default, *args, **kwargs)
1132         return default
1133
1134     def report_drm(self, video_id, partial=NO_DEFAULT):
1135         if partial is not NO_DEFAULT:
1136             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1137         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1138
1139     def report_extraction(self, id_or_name):
1140         """Report information extraction."""
1141         self.to_screen('%s: Extracting information' % id_or_name)
1142
1143     def report_download_webpage(self, video_id):
1144         """Report webpage download."""
1145         self.to_screen('%s: Downloading webpage' % video_id)
1146
1147     def report_age_confirmation(self):
1148         """Report attempt to confirm age."""
1149         self.to_screen('Confirming age')
1150
1151     def report_login(self):
1152         """Report attempt to log in."""
1153         self.to_screen('Logging in')
1154
1155     def raise_login_required(
1156             self, msg='This video is only available for registered users',
1157             metadata_available=False, method=NO_DEFAULT):
1158         if metadata_available and (
1159                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1160             self.report_warning(msg)
1161             return
1162         msg += format_field(self._login_hint(method), None, '. %s')
1163         raise ExtractorError(msg, expected=True)
1164
1165     def raise_geo_restricted(
1166             self, msg='This video is not available from your location due to geo restriction',
1167             countries=None, metadata_available=False):
1168         if metadata_available and (
1169                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1170             self.report_warning(msg)
1171         else:
1172             raise GeoRestrictedError(msg, countries=countries)
1173
1174     def raise_no_formats(self, msg, expected=False, video_id=None):
1175         if expected and (
1176                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1177             self.report_warning(msg, video_id)
1178         elif isinstance(msg, ExtractorError):
1179             raise msg
1180         else:
1181             raise ExtractorError(msg, expected=expected, video_id=video_id)
1182
1183     # Methods for following #608
1184     @staticmethod
1185     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1186         """Returns a URL that points to a page that should be processed"""
1187         if ie is not None:
1188             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1189         if video_id is not None:
1190             kwargs['id'] = video_id
1191         if video_title is not None:
1192             kwargs['title'] = video_title
1193         return {
1194             **kwargs,
1195             '_type': 'url_transparent' if url_transparent else 'url',
1196             'url': url,
1197         }
1198
1199     @classmethod
1200     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1201                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1202         return cls.playlist_result(
1203             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1204             playlist_id, playlist_title, **kwargs)
1205
1206     @staticmethod
1207     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1208         """Returns a playlist"""
1209         if playlist_id:
1210             kwargs['id'] = playlist_id
1211         if playlist_title:
1212             kwargs['title'] = playlist_title
1213         if playlist_description is not None:
1214             kwargs['description'] = playlist_description
1215         return {
1216             **kwargs,
1217             '_type': 'multi_video' if multi_video else 'playlist',
1218             'entries': entries,
1219         }
1220
1221     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1222         """
1223         Perform a regex search on the given string, using a single or a list of
1224         patterns returning the first matching group.
1225         In case of failure return a default value or raise a WARNING or a
1226         RegexNotFoundError, depending on fatal, specifying the field name.
1227         """
1228         if string is None:
1229             mobj = None
1230         elif isinstance(pattern, (str, re.Pattern)):
1231             mobj = re.search(pattern, string, flags)
1232         else:
1233             for p in pattern:
1234                 mobj = re.search(p, string, flags)
1235                 if mobj:
1236                     break
1237
1238         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1239
1240         if mobj:
1241             if group is None:
1242                 # return the first matching group
1243                 return next(g for g in mobj.groups() if g is not None)
1244             elif isinstance(group, (list, tuple)):
1245                 return tuple(mobj.group(g) for g in group)
1246             else:
1247                 return mobj.group(group)
1248         elif default is not NO_DEFAULT:
1249             return default
1250         elif fatal:
1251             raise RegexNotFoundError('Unable to extract %s' % _name)
1252         else:
1253             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1254             return None
1255
1256     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1257                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1258         """Searches string for the JSON object specified by start_pattern"""
1259         # NB: end_pattern is only used to reduce the size of the initial match
1260         if default is NO_DEFAULT:
1261             default, has_default = {}, False
1262         else:
1263             fatal, has_default = False, True
1264
1265         json_string = self._search_regex(
1266             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1267             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1268         if not json_string:
1269             return default
1270
1271         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1272         try:
1273             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1274         except ExtractorError as e:
1275             if fatal:
1276                 raise ExtractorError(
1277                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1278             elif not has_default:
1279                 self.report_warning(
1280                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1281         return default
1282
1283     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1284         """
1285         Like _search_regex, but strips HTML tags and unescapes entities.
1286         """
1287         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1288         if isinstance(res, tuple):
1289             return tuple(map(clean_html, res))
1290         return clean_html(res)
1291
1292     def _get_netrc_login_info(self, netrc_machine=None):
1293         username = None
1294         password = None
1295         netrc_machine = netrc_machine or self._NETRC_MACHINE
1296
1297         if self.get_param('usenetrc', False):
1298             try:
1299                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1300                 if os.path.isdir(netrc_file):
1301                     netrc_file = os.path.join(netrc_file, '.netrc')
1302                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1303                 if info is not None:
1304                     username = info[0]
1305                     password = info[2]
1306                 else:
1307                     raise netrc.NetrcParseError(
1308                         'No authenticators for %s' % netrc_machine)
1309             except (OSError, netrc.NetrcParseError) as err:
1310                 self.report_warning(
1311                     'parsing .netrc: %s' % error_to_compat_str(err))
1312
1313         return username, password
1314
1315     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1316         """
1317         Get the login info as (username, password)
1318         First look for the manually specified credentials using username_option
1319         and password_option as keys in params dictionary. If no such credentials
1320         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1321         value.
1322         If there's no info available, return (None, None)
1323         """
1324
1325         # Attempt to use provided username and password or .netrc data
1326         username = self.get_param(username_option)
1327         if username is not None:
1328             password = self.get_param(password_option)
1329         else:
1330             username, password = self._get_netrc_login_info(netrc_machine)
1331
1332         return username, password
1333
1334     def _get_tfa_info(self, note='two-factor verification code'):
1335         """
1336         Get the two-factor authentication info
1337         TODO - asking the user will be required for sms/phone verify
1338         currently just uses the command line option
1339         If there's no info available, return None
1340         """
1341
1342         tfa = self.get_param('twofactor')
1343         if tfa is not None:
1344             return tfa
1345
1346         return getpass.getpass('Type %s and press [Return]: ' % note)
1347
1348     # Helper functions for extracting OpenGraph info
1349     @staticmethod
1350     def _og_regexes(prop):
1351         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1352         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1353                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1354         template = r'<meta[^>]+?%s[^>]+?%s'
1355         return [
1356             template % (property_re, content_re),
1357             template % (content_re, property_re),
1358         ]
1359
1360     @staticmethod
1361     def _meta_regex(prop):
1362         return r'''(?isx)<meta
1363                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1364                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1365
1366     def _og_search_property(self, prop, html, name=None, **kargs):
1367         prop = variadic(prop)
1368         if name is None:
1369             name = 'OpenGraph %s' % prop[0]
1370         og_regexes = []
1371         for p in prop:
1372             og_regexes.extend(self._og_regexes(p))
1373         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1374         if escaped is None:
1375             return None
1376         return unescapeHTML(escaped)
1377
1378     def _og_search_thumbnail(self, html, **kargs):
1379         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1380
1381     def _og_search_description(self, html, **kargs):
1382         return self._og_search_property('description', html, fatal=False, **kargs)
1383
1384     def _og_search_title(self, html, *, fatal=False, **kargs):
1385         return self._og_search_property('title', html, fatal=fatal, **kargs)
1386
1387     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1388         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1389         if secure:
1390             regexes = self._og_regexes('video:secure_url') + regexes
1391         return self._html_search_regex(regexes, html, name, **kargs)
1392
1393     def _og_search_url(self, html, **kargs):
1394         return self._og_search_property('url', html, **kargs)
1395
1396     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1397         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1398
1399     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1400         name = variadic(name)
1401         if display_name is None:
1402             display_name = name[0]
1403         return self._html_search_regex(
1404             [self._meta_regex(n) for n in name],
1405             html, display_name, fatal=fatal, group='content', **kwargs)
1406
1407     def _dc_search_uploader(self, html):
1408         return self._html_search_meta('dc.creator', html, 'uploader')
1409
1410     @staticmethod
1411     def _rta_search(html):
1412         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1413         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1414                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1415                      html):
1416             return 18
1417
1418         # And then there are the jokers who advertise that they use RTA, but actually don't.
1419         AGE_LIMIT_MARKERS = [
1420             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1421             r'>[^<]*you acknowledge you are at least (\d+) years old',
1422             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1423         ]
1424
1425         age_limit = 0
1426         for marker in AGE_LIMIT_MARKERS:
1427             mobj = re.search(marker, html)
1428             if mobj:
1429                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1430         return age_limit
1431
1432     def _media_rating_search(self, html):
1433         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1434         rating = self._html_search_meta('rating', html)
1435
1436         if not rating:
1437             return None
1438
1439         RATING_TABLE = {
1440             'safe for kids': 0,
1441             'general': 8,
1442             '14 years': 14,
1443             'mature': 17,
1444             'restricted': 19,
1445         }
1446         return RATING_TABLE.get(rating.lower())
1447
1448     def _family_friendly_search(self, html):
1449         # See http://schema.org/VideoObject
1450         family_friendly = self._html_search_meta(
1451             'isFamilyFriendly', html, default=None)
1452
1453         if not family_friendly:
1454             return None
1455
1456         RATING_TABLE = {
1457             '1': 0,
1458             'true': 0,
1459             '0': 18,
1460             'false': 18,
1461         }
1462         return RATING_TABLE.get(family_friendly.lower())
1463
1464     def _twitter_search_player(self, html):
1465         return self._html_search_meta('twitter:player', html,
1466                                       'twitter card player')
1467
1468     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1469         """Yield all json ld objects in the html"""
1470         if default is not NO_DEFAULT:
1471             fatal = False
1472         for mobj in re.finditer(JSON_LD_RE, html):
1473             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1474             for json_ld in variadic(json_ld_item):
1475                 if isinstance(json_ld, dict):
1476                     yield json_ld
1477
1478     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1479         """Search for a video in any json ld in the html"""
1480         if default is not NO_DEFAULT:
1481             fatal = False
1482         info = self._json_ld(
1483             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1484             video_id, fatal=fatal, expected_type=expected_type)
1485         if info:
1486             return info
1487         if default is not NO_DEFAULT:
1488             return default
1489         elif fatal:
1490             raise RegexNotFoundError('Unable to extract JSON-LD')
1491         else:
1492             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1493             return {}
1494
1495     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1496         if isinstance(json_ld, str):
1497             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1498         if not json_ld:
1499             return {}
1500         info = {}
1501
1502         INTERACTION_TYPE_MAP = {
1503             'CommentAction': 'comment',
1504             'AgreeAction': 'like',
1505             'DisagreeAction': 'dislike',
1506             'LikeAction': 'like',
1507             'DislikeAction': 'dislike',
1508             'ListenAction': 'view',
1509             'WatchAction': 'view',
1510             'ViewAction': 'view',
1511         }
1512
1513         def is_type(e, *expected_types):
1514             type = variadic(traverse_obj(e, '@type'))
1515             return any(x in type for x in expected_types)
1516
1517         def extract_interaction_type(e):
1518             interaction_type = e.get('interactionType')
1519             if isinstance(interaction_type, dict):
1520                 interaction_type = interaction_type.get('@type')
1521             return str_or_none(interaction_type)
1522
1523         def extract_interaction_statistic(e):
1524             interaction_statistic = e.get('interactionStatistic')
1525             if isinstance(interaction_statistic, dict):
1526                 interaction_statistic = [interaction_statistic]
1527             if not isinstance(interaction_statistic, list):
1528                 return
1529             for is_e in interaction_statistic:
1530                 if not is_type(is_e, 'InteractionCounter'):
1531                     continue
1532                 interaction_type = extract_interaction_type(is_e)
1533                 if not interaction_type:
1534                     continue
1535                 # For interaction count some sites provide string instead of
1536                 # an integer (as per spec) with non digit characters (e.g. ",")
1537                 # so extracting count with more relaxed str_to_int
1538                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1539                 if interaction_count is None:
1540                     continue
1541                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1542                 if not count_kind:
1543                     continue
1544                 count_key = '%s_count' % count_kind
1545                 if info.get(count_key) is not None:
1546                     continue
1547                 info[count_key] = interaction_count
1548
1549         def extract_chapter_information(e):
1550             chapters = [{
1551                 'title': part.get('name'),
1552                 'start_time': part.get('startOffset'),
1553                 'end_time': part.get('endOffset'),
1554             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1555             for idx, (last_c, current_c, next_c) in enumerate(zip(
1556                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1557                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1558                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1559                 if None in current_c.values():
1560                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1561                     return
1562             if chapters:
1563                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1564                 info['chapters'] = chapters
1565
1566         def extract_video_object(e):
1567             author = e.get('author')
1568             info.update({
1569                 'url': url_or_none(e.get('contentUrl')),
1570                 'ext': mimetype2ext(e.get('encodingFormat')),
1571                 'title': unescapeHTML(e.get('name')),
1572                 'description': unescapeHTML(e.get('description')),
1573                 'thumbnails': [{'url': unescapeHTML(url)}
1574                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1575                                if url_or_none(url)],
1576                 'duration': parse_duration(e.get('duration')),
1577                 'timestamp': unified_timestamp(e.get('uploadDate')),
1578                 # author can be an instance of 'Organization' or 'Person' types.
1579                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1580                 # however some websites are using 'Text' type instead.
1581                 # 1. https://schema.org/VideoObject
1582                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1583                 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1584                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1585                 'tbr': int_or_none(e.get('bitrate')),
1586                 'width': int_or_none(e.get('width')),
1587                 'height': int_or_none(e.get('height')),
1588                 'view_count': int_or_none(e.get('interactionCount')),
1589                 'tags': try_call(lambda: e.get('keywords').split(',')),
1590             })
1591             if is_type(e, 'AudioObject'):
1592                 info.update({
1593                     'vcodec': 'none',
1594                     'abr': int_or_none(e.get('bitrate')),
1595                 })
1596             extract_interaction_statistic(e)
1597             extract_chapter_information(e)
1598
1599         def traverse_json_ld(json_ld, at_top_level=True):
1600             for e in variadic(json_ld):
1601                 if not isinstance(e, dict):
1602                     continue
1603                 if at_top_level and '@context' not in e:
1604                     continue
1605                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1606                     traverse_json_ld(e['@graph'], at_top_level=False)
1607                     continue
1608                 if expected_type is not None and not is_type(e, expected_type):
1609                     continue
1610                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1611                 if rating is not None:
1612                     info['average_rating'] = rating
1613                 if is_type(e, 'TVEpisode', 'Episode'):
1614                     episode_name = unescapeHTML(e.get('name'))
1615                     info.update({
1616                         'episode': episode_name,
1617                         'episode_number': int_or_none(e.get('episodeNumber')),
1618                         'description': unescapeHTML(e.get('description')),
1619                     })
1620                     if not info.get('title') and episode_name:
1621                         info['title'] = episode_name
1622                     part_of_season = e.get('partOfSeason')
1623                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1624                         info.update({
1625                             'season': unescapeHTML(part_of_season.get('name')),
1626                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1627                         })
1628                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1629                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1630                         info['series'] = unescapeHTML(part_of_series.get('name'))
1631                 elif is_type(e, 'Movie'):
1632                     info.update({
1633                         'title': unescapeHTML(e.get('name')),
1634                         'description': unescapeHTML(e.get('description')),
1635                         'duration': parse_duration(e.get('duration')),
1636                         'timestamp': unified_timestamp(e.get('dateCreated')),
1637                     })
1638                 elif is_type(e, 'Article', 'NewsArticle'):
1639                     info.update({
1640                         'timestamp': parse_iso8601(e.get('datePublished')),
1641                         'title': unescapeHTML(e.get('headline')),
1642                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1643                     })
1644                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1645                         extract_video_object(e['video'][0])
1646                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1647                         extract_video_object(e['subjectOf'][0])
1648                 elif is_type(e, 'VideoObject', 'AudioObject'):
1649                     extract_video_object(e)
1650                     if expected_type is None:
1651                         continue
1652                     else:
1653                         break
1654                 video = e.get('video')
1655                 if is_type(video, 'VideoObject'):
1656                     extract_video_object(video)
1657                 if expected_type is None:
1658                     continue
1659                 else:
1660                     break
1661
1662         traverse_json_ld(json_ld)
1663         return filter_dict(info)
1664
1665     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1666         return self._parse_json(
1667             self._search_regex(
1668                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1669                 webpage, 'next.js data', fatal=fatal, **kw),
1670             video_id, transform_source=transform_source, fatal=fatal)
1671
1672     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1673         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1674         rectx = re.escape(context_name)
1675         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1676         js, arg_keys, arg_vals = self._search_regex(
1677             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1678             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1679             default=NO_DEFAULT if fatal else (None, None, None))
1680         if js is None:
1681             return {}
1682
1683         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1684             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1685
1686         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1687         return traverse_obj(ret, traverse) or {}
1688
1689     @staticmethod
1690     def _hidden_inputs(html):
1691         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1692         hidden_inputs = {}
1693         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1694             attrs = extract_attributes(input)
1695             if not input:
1696                 continue
1697             if attrs.get('type') not in ('hidden', 'submit'):
1698                 continue
1699             name = attrs.get('name') or attrs.get('id')
1700             value = attrs.get('value')
1701             if name and value is not None:
1702                 hidden_inputs[name] = value
1703         return hidden_inputs
1704
1705     def _form_hidden_inputs(self, form_id, html):
1706         form = self._search_regex(
1707             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1708             html, '%s form' % form_id, group='form')
1709         return self._hidden_inputs(form)
1710
1711     @classproperty(cache=True)
1712     def FormatSort(cls):
1713         class FormatSort(FormatSorter):
1714             def __init__(ie, *args, **kwargs):
1715                 super().__init__(ie._downloader, *args, **kwargs)
1716
1717         deprecation_warning(
1718             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1719             'Use yt_dlp.utils.FormatSorter instead')
1720         return FormatSort
1721
1722     def _sort_formats(self, formats, field_preference=[]):
1723         if not field_preference:
1724             self._downloader.deprecation_warning(
1725                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1726             return
1727         self._downloader.deprecation_warning(
1728             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1729             'Return _format_sort_fields in the info_dict instead')
1730         if formats:
1731             formats[0]['__sort_fields'] = field_preference
1732
1733     def _check_formats(self, formats, video_id):
1734         if formats:
1735             formats[:] = filter(
1736                 lambda f: self._is_valid_url(
1737                     f['url'], video_id,
1738                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1739                 formats)
1740
1741     @staticmethod
1742     def _remove_duplicate_formats(formats):
1743         format_urls = set()
1744         unique_formats = []
1745         for f in formats:
1746             if f['url'] not in format_urls:
1747                 format_urls.add(f['url'])
1748                 unique_formats.append(f)
1749         formats[:] = unique_formats
1750
1751     def _is_valid_url(self, url, video_id, item='video', headers={}):
1752         url = self._proto_relative_url(url, scheme='http:')
1753         # For now assume non HTTP(S) URLs always valid
1754         if not (url.startswith('http://') or url.startswith('https://')):
1755             return True
1756         try:
1757             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1758             return True
1759         except ExtractorError as e:
1760             self.to_screen(
1761                 '%s: %s URL is invalid, skipping: %s'
1762                 % (video_id, item, error_to_compat_str(e.cause)))
1763             return False
1764
1765     def http_scheme(self):
1766         """ Either "http:" or "https:", depending on the user's preferences """
1767         return (
1768             'http:'
1769             if self.get_param('prefer_insecure', False)
1770             else 'https:')
1771
1772     def _proto_relative_url(self, url, scheme=None):
1773         scheme = scheme or self.http_scheme()
1774         assert scheme.endswith(':')
1775         return sanitize_url(url, scheme=scheme[:-1])
1776
1777     def _sleep(self, timeout, video_id, msg_template=None):
1778         if msg_template is None:
1779             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1780         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1781         self.to_screen(msg)
1782         time.sleep(timeout)
1783
1784     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1785                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1786                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1787         if self.get_param('ignore_no_formats_error'):
1788             fatal = False
1789
1790         res = self._download_xml_handle(
1791             manifest_url, video_id, 'Downloading f4m manifest',
1792             'Unable to download f4m manifest',
1793             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1794             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1795             transform_source=transform_source,
1796             fatal=fatal, data=data, headers=headers, query=query)
1797         if res is False:
1798             return []
1799
1800         manifest, urlh = res
1801         manifest_url = urlh.geturl()
1802
1803         return self._parse_f4m_formats(
1804             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1805             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1806
1807     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1808                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1809                            fatal=True, m3u8_id=None):
1810         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1811             return []
1812
1813         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1814         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1815         if akamai_pv is not None and ';' in akamai_pv.text:
1816             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1817             if playerVerificationChallenge.strip() != '':
1818                 return []
1819
1820         formats = []
1821         manifest_version = '1.0'
1822         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1823         if not media_nodes:
1824             manifest_version = '2.0'
1825             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1826         # Remove unsupported DRM protected media from final formats
1827         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1828         media_nodes = remove_encrypted_media(media_nodes)
1829         if not media_nodes:
1830             return formats
1831
1832         manifest_base_url = get_base_url(manifest)
1833
1834         bootstrap_info = xpath_element(
1835             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1836             'bootstrap info', default=None)
1837
1838         vcodec = None
1839         mime_type = xpath_text(
1840             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1841             'base URL', default=None)
1842         if mime_type and mime_type.startswith('audio/'):
1843             vcodec = 'none'
1844
1845         for i, media_el in enumerate(media_nodes):
1846             tbr = int_or_none(media_el.attrib.get('bitrate'))
1847             width = int_or_none(media_el.attrib.get('width'))
1848             height = int_or_none(media_el.attrib.get('height'))
1849             format_id = join_nonempty(f4m_id, tbr or i)
1850             # If <bootstrapInfo> is present, the specified f4m is a
1851             # stream-level manifest, and only set-level manifests may refer to
1852             # external resources.  See section 11.4 and section 4 of F4M spec
1853             if bootstrap_info is None:
1854                 media_url = None
1855                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1856                 if manifest_version == '2.0':
1857                     media_url = media_el.attrib.get('href')
1858                 if media_url is None:
1859                     media_url = media_el.attrib.get('url')
1860                 if not media_url:
1861                     continue
1862                 manifest_url = (
1863                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1864                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1865                 # If media_url is itself a f4m manifest do the recursive extraction
1866                 # since bitrates in parent manifest (this one) and media_url manifest
1867                 # may differ leading to inability to resolve the format by requested
1868                 # bitrate in f4m downloader
1869                 ext = determine_ext(manifest_url)
1870                 if ext == 'f4m':
1871                     f4m_formats = self._extract_f4m_formats(
1872                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1873                         transform_source=transform_source, fatal=fatal)
1874                     # Sometimes stream-level manifest contains single media entry that
1875                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1876                     # At the same time parent's media entry in set-level manifest may
1877                     # contain it. We will copy it from parent in such cases.
1878                     if len(f4m_formats) == 1:
1879                         f = f4m_formats[0]
1880                         f.update({
1881                             'tbr': f.get('tbr') or tbr,
1882                             'width': f.get('width') or width,
1883                             'height': f.get('height') or height,
1884                             'format_id': f.get('format_id') if not tbr else format_id,
1885                             'vcodec': vcodec,
1886                         })
1887                     formats.extend(f4m_formats)
1888                     continue
1889                 elif ext == 'm3u8':
1890                     formats.extend(self._extract_m3u8_formats(
1891                         manifest_url, video_id, 'mp4', preference=preference,
1892                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1893                     continue
1894             formats.append({
1895                 'format_id': format_id,
1896                 'url': manifest_url,
1897                 'manifest_url': manifest_url,
1898                 'ext': 'flv' if bootstrap_info is not None else None,
1899                 'protocol': 'f4m',
1900                 'tbr': tbr,
1901                 'width': width,
1902                 'height': height,
1903                 'vcodec': vcodec,
1904                 'preference': preference,
1905                 'quality': quality,
1906             })
1907         return formats
1908
1909     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1910         return {
1911             'format_id': join_nonempty(m3u8_id, 'meta'),
1912             'url': m3u8_url,
1913             'ext': ext,
1914             'protocol': 'm3u8',
1915             'preference': preference - 100 if preference else -100,
1916             'quality': quality,
1917             'resolution': 'multiple',
1918             'format_note': 'Quality selection URL',
1919         }
1920
1921     def _report_ignoring_subs(self, name):
1922         self.report_warning(bug_reports_message(
1923             f'Ignoring subtitle tracks found in the {name} manifest; '
1924             'if any subtitle tracks are missing,'
1925         ), only_once=True)
1926
1927     def _extract_m3u8_formats(self, *args, **kwargs):
1928         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1929         if subs:
1930             self._report_ignoring_subs('HLS')
1931         return fmts
1932
1933     def _extract_m3u8_formats_and_subtitles(
1934             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1935             preference=None, quality=None, m3u8_id=None, note=None,
1936             errnote=None, fatal=True, live=False, data=None, headers={},
1937             query={}):
1938
1939         if self.get_param('ignore_no_formats_error'):
1940             fatal = False
1941
1942         if not m3u8_url:
1943             if errnote is not False:
1944                 errnote = errnote or 'Failed to obtain m3u8 URL'
1945                 if fatal:
1946                     raise ExtractorError(errnote, video_id=video_id)
1947                 self.report_warning(f'{errnote}{bug_reports_message()}')
1948             return [], {}
1949
1950         res = self._download_webpage_handle(
1951             m3u8_url, video_id,
1952             note='Downloading m3u8 information' if note is None else note,
1953             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1954             fatal=fatal, data=data, headers=headers, query=query)
1955
1956         if res is False:
1957             return [], {}
1958
1959         m3u8_doc, urlh = res
1960         m3u8_url = urlh.geturl()
1961
1962         return self._parse_m3u8_formats_and_subtitles(
1963             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1964             preference=preference, quality=quality, m3u8_id=m3u8_id,
1965             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1966             headers=headers, query=query, video_id=video_id)
1967
1968     def _parse_m3u8_formats_and_subtitles(
1969             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1970             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1971             errnote=None, fatal=True, data=None, headers={}, query={},
1972             video_id=None):
1973         formats, subtitles = [], {}
1974
1975         has_drm = re.search('|'.join([
1976             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
1977             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
1978         ]), m3u8_doc)
1979
1980         def format_url(url):
1981             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1982
1983         if self.get_param('hls_split_discontinuity', False):
1984             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1985                 if not m3u8_doc:
1986                     if not manifest_url:
1987                         return []
1988                     m3u8_doc = self._download_webpage(
1989                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1990                         note=False, errnote='Failed to download m3u8 playlist information')
1991                     if m3u8_doc is False:
1992                         return []
1993                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
1994
1995         else:
1996             def _extract_m3u8_playlist_indices(*args, **kwargs):
1997                 return [None]
1998
1999         # References:
2000         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2001         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2002         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2003
2004         # We should try extracting formats only from master playlists [1, 4.3.4],
2005         # i.e. playlists that describe available qualities. On the other hand
2006         # media playlists [1, 4.3.3] should be returned as is since they contain
2007         # just the media without qualities renditions.
2008         # Fortunately, master playlist can be easily distinguished from media
2009         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2010         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2011         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2012         # media playlist and MUST NOT appear in master playlist thus we can
2013         # clearly detect media playlist with this criterion.
2014
2015         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2016             formats = [{
2017                 'format_id': join_nonempty(m3u8_id, idx),
2018                 'format_index': idx,
2019                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2020                 'ext': ext,
2021                 'protocol': entry_protocol,
2022                 'preference': preference,
2023                 'quality': quality,
2024                 'has_drm': has_drm,
2025             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2026
2027             return formats, subtitles
2028
2029         groups = {}
2030         last_stream_inf = {}
2031
2032         def extract_media(x_media_line):
2033             media = parse_m3u8_attributes(x_media_line)
2034             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2035             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2036             if not (media_type and group_id and name):
2037                 return
2038             groups.setdefault(group_id, []).append(media)
2039             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2040             if media_type == 'SUBTITLES':
2041                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2042                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2043                 # However, lack of URI has been spotted in the wild.
2044                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2045                 if not media.get('URI'):
2046                     return
2047                 url = format_url(media['URI'])
2048                 sub_info = {
2049                     'url': url,
2050                     'ext': determine_ext(url),
2051                 }
2052                 if sub_info['ext'] == 'm3u8':
2053                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2054                     # files may contain is WebVTT:
2055                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2056                     sub_info['ext'] = 'vtt'
2057                     sub_info['protocol'] = 'm3u8_native'
2058                 lang = media.get('LANGUAGE') or 'und'
2059                 subtitles.setdefault(lang, []).append(sub_info)
2060             if media_type not in ('VIDEO', 'AUDIO'):
2061                 return
2062             media_url = media.get('URI')
2063             if media_url:
2064                 manifest_url = format_url(media_url)
2065                 formats.extend({
2066                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2067                     'format_note': name,
2068                     'format_index': idx,
2069                     'url': manifest_url,
2070                     'manifest_url': m3u8_url,
2071                     'language': media.get('LANGUAGE'),
2072                     'ext': ext,
2073                     'protocol': entry_protocol,
2074                     'preference': preference,
2075                     'quality': quality,
2076                     'has_drm': has_drm,
2077                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2078                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2079
2080         def build_stream_name():
2081             # Despite specification does not mention NAME attribute for
2082             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2083             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2084             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2085             stream_name = last_stream_inf.get('NAME')
2086             if stream_name:
2087                 return stream_name
2088             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2089             # from corresponding rendition group
2090             stream_group_id = last_stream_inf.get('VIDEO')
2091             if not stream_group_id:
2092                 return
2093             stream_group = groups.get(stream_group_id)
2094             if not stream_group:
2095                 return stream_group_id
2096             rendition = stream_group[0]
2097             return rendition.get('NAME') or stream_group_id
2098
2099         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2100         # chance to detect video only formats when EXT-X-STREAM-INF tags
2101         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2102         for line in m3u8_doc.splitlines():
2103             if line.startswith('#EXT-X-MEDIA:'):
2104                 extract_media(line)
2105
2106         for line in m3u8_doc.splitlines():
2107             if line.startswith('#EXT-X-STREAM-INF:'):
2108                 last_stream_inf = parse_m3u8_attributes(line)
2109             elif line.startswith('#') or not line.strip():
2110                 continue
2111             else:
2112                 tbr = float_or_none(
2113                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2114                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2115                 manifest_url = format_url(line.strip())
2116
2117                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2118                     format_id = [m3u8_id, None, idx]
2119                     # Bandwidth of live streams may differ over time thus making
2120                     # format_id unpredictable. So it's better to keep provided
2121                     # format_id intact.
2122                     if not live:
2123                         stream_name = build_stream_name()
2124                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2125                     f = {
2126                         'format_id': join_nonempty(*format_id),
2127                         'format_index': idx,
2128                         'url': manifest_url,
2129                         'manifest_url': m3u8_url,
2130                         'tbr': tbr,
2131                         'ext': ext,
2132                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2133                         'protocol': entry_protocol,
2134                         'preference': preference,
2135                         'quality': quality,
2136                         'has_drm': has_drm,
2137                     }
2138                     resolution = last_stream_inf.get('RESOLUTION')
2139                     if resolution:
2140                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2141                         if mobj:
2142                             f['width'] = int(mobj.group('width'))
2143                             f['height'] = int(mobj.group('height'))
2144                     # Unified Streaming Platform
2145                     mobj = re.search(
2146                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2147                     if mobj:
2148                         abr, vbr = mobj.groups()
2149                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2150                         f.update({
2151                             'vbr': vbr,
2152                             'abr': abr,
2153                         })
2154                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2155                     f.update(codecs)
2156                     audio_group_id = last_stream_inf.get('AUDIO')
2157                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2158                     # references a rendition group MUST have a CODECS attribute.
2159                     # However, this is not always respected. E.g. [2]
2160                     # contains EXT-X-STREAM-INF tag which references AUDIO
2161                     # rendition group but does not have CODECS and despite
2162                     # referencing an audio group it represents a complete
2163                     # (with audio and video) format. So, for such cases we will
2164                     # ignore references to rendition groups and treat them
2165                     # as complete formats.
2166                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2167                         audio_group = groups.get(audio_group_id)
2168                         if audio_group and audio_group[0].get('URI'):
2169                             # TODO: update acodec for audio only formats with
2170                             # the same GROUP-ID
2171                             f['acodec'] = 'none'
2172                     if not f.get('ext'):
2173                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2174                     formats.append(f)
2175
2176                     # for DailyMotion
2177                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2178                     if progressive_uri:
2179                         http_f = f.copy()
2180                         del http_f['manifest_url']
2181                         http_f.update({
2182                             'format_id': f['format_id'].replace('hls-', 'http-'),
2183                             'protocol': 'http',
2184                             'url': progressive_uri,
2185                         })
2186                         formats.append(http_f)
2187
2188                 last_stream_inf = {}
2189         return formats, subtitles
2190
2191     def _extract_m3u8_vod_duration(
2192             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2193
2194         m3u8_vod = self._download_webpage(
2195             m3u8_vod_url, video_id,
2196             note='Downloading m3u8 VOD manifest' if note is None else note,
2197             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2198             fatal=False, data=data, headers=headers, query=query)
2199
2200         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2201
2202     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2203         if '#EXT-X-ENDLIST' not in m3u8_vod:
2204             return None
2205
2206         return int(sum(
2207             float(line[len('#EXTINF:'):].split(',')[0])
2208             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2209
2210     def _extract_mpd_vod_duration(
2211             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2212
2213         mpd_doc = self._download_xml(
2214             mpd_url, video_id,
2215             note='Downloading MPD VOD manifest' if note is None else note,
2216             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2217             fatal=False, data=data, headers=headers, query=query) or {}
2218         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2219
2220     @staticmethod
2221     def _xpath_ns(path, namespace=None):
2222         if not namespace:
2223             return path
2224         out = []
2225         for c in path.split('/'):
2226             if not c or c == '.':
2227                 out.append(c)
2228             else:
2229                 out.append('{%s}%s' % (namespace, c))
2230         return '/'.join(out)
2231
2232     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2233         if self.get_param('ignore_no_formats_error'):
2234             fatal = False
2235
2236         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2237         if res is False:
2238             assert not fatal
2239             return [], {}
2240
2241         smil, urlh = res
2242         smil_url = urlh.geturl()
2243
2244         namespace = self._parse_smil_namespace(smil)
2245
2246         fmts = self._parse_smil_formats(
2247             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2248         subs = self._parse_smil_subtitles(
2249             smil, namespace=namespace)
2250
2251         return fmts, subs
2252
2253     def _extract_smil_formats(self, *args, **kwargs):
2254         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2255         if subs:
2256             self._report_ignoring_subs('SMIL')
2257         return fmts
2258
2259     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2260         res = self._download_smil(smil_url, video_id, fatal=fatal)
2261         if res is False:
2262             return {}
2263
2264         smil, urlh = res
2265         smil_url = urlh.geturl()
2266
2267         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2268
2269     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2270         return self._download_xml_handle(
2271             smil_url, video_id, 'Downloading SMIL file',
2272             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2273
2274     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2275         namespace = self._parse_smil_namespace(smil)
2276
2277         formats = self._parse_smil_formats(
2278             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2279         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2280
2281         video_id = os.path.splitext(url_basename(smil_url))[0]
2282         title = None
2283         description = None
2284         upload_date = None
2285         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2286             name = meta.attrib.get('name')
2287             content = meta.attrib.get('content')
2288             if not name or not content:
2289                 continue
2290             if not title and name == 'title':
2291                 title = content
2292             elif not description and name in ('description', 'abstract'):
2293                 description = content
2294             elif not upload_date and name == 'date':
2295                 upload_date = unified_strdate(content)
2296
2297         thumbnails = [{
2298             'id': image.get('type'),
2299             'url': image.get('src'),
2300             'width': int_or_none(image.get('width')),
2301             'height': int_or_none(image.get('height')),
2302         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2303
2304         return {
2305             'id': video_id,
2306             'title': title or video_id,
2307             'description': description,
2308             'upload_date': upload_date,
2309             'thumbnails': thumbnails,
2310             'formats': formats,
2311             'subtitles': subtitles,
2312         }
2313
2314     def _parse_smil_namespace(self, smil):
2315         return self._search_regex(
2316             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2317
2318     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2319         base = smil_url
2320         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2321             b = meta.get('base') or meta.get('httpBase')
2322             if b:
2323                 base = b
2324                 break
2325
2326         formats = []
2327         rtmp_count = 0
2328         http_count = 0
2329         m3u8_count = 0
2330         imgs_count = 0
2331
2332         srcs = set()
2333         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2334         for medium in media:
2335             src = medium.get('src')
2336             if not src or src in srcs:
2337                 continue
2338             srcs.add(src)
2339
2340             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2341             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2342             width = int_or_none(medium.get('width'))
2343             height = int_or_none(medium.get('height'))
2344             proto = medium.get('proto')
2345             ext = medium.get('ext')
2346             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2347                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2348             streamer = medium.get('streamer') or base
2349
2350             if proto == 'rtmp' or streamer.startswith('rtmp'):
2351                 rtmp_count += 1
2352                 formats.append({
2353                     'url': streamer,
2354                     'play_path': src,
2355                     'ext': 'flv',
2356                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2357                     'tbr': bitrate,
2358                     'filesize': filesize,
2359                     'width': width,
2360                     'height': height,
2361                 })
2362                 if transform_rtmp_url:
2363                     streamer, src = transform_rtmp_url(streamer, src)
2364                     formats[-1].update({
2365                         'url': streamer,
2366                         'play_path': src,
2367                     })
2368                 continue
2369
2370             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2371             src_url = src_url.strip()
2372
2373             if proto == 'm3u8' or src_ext == 'm3u8':
2374                 m3u8_formats = self._extract_m3u8_formats(
2375                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2376                 if len(m3u8_formats) == 1:
2377                     m3u8_count += 1
2378                     m3u8_formats[0].update({
2379                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2380                         'tbr': bitrate,
2381                         'width': width,
2382                         'height': height,
2383                     })
2384                 formats.extend(m3u8_formats)
2385             elif src_ext == 'f4m':
2386                 f4m_url = src_url
2387                 if not f4m_params:
2388                     f4m_params = {
2389                         'hdcore': '3.2.0',
2390                         'plugin': 'flowplayer-3.2.0.1',
2391                     }
2392                 f4m_url += '&' if '?' in f4m_url else '?'
2393                 f4m_url += urllib.parse.urlencode(f4m_params)
2394                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2395             elif src_ext == 'mpd':
2396                 formats.extend(self._extract_mpd_formats(
2397                     src_url, video_id, mpd_id='dash', fatal=False))
2398             elif re.search(r'\.ism/[Mm]anifest', src_url):
2399                 formats.extend(self._extract_ism_formats(
2400                     src_url, video_id, ism_id='mss', fatal=False))
2401             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2402                 http_count += 1
2403                 formats.append({
2404                     'url': src_url,
2405                     'ext': ext or src_ext or 'flv',
2406                     'format_id': 'http-%d' % (bitrate or http_count),
2407                     'tbr': bitrate,
2408                     'filesize': filesize,
2409                     'width': width,
2410                     'height': height,
2411                 })
2412
2413         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2414             src = medium.get('src')
2415             if not src or src in srcs:
2416                 continue
2417             srcs.add(src)
2418
2419             imgs_count += 1
2420             formats.append({
2421                 'format_id': 'imagestream-%d' % (imgs_count),
2422                 'url': src,
2423                 'ext': mimetype2ext(medium.get('type')),
2424                 'acodec': 'none',
2425                 'vcodec': 'none',
2426                 'width': int_or_none(medium.get('width')),
2427                 'height': int_or_none(medium.get('height')),
2428                 'format_note': 'SMIL storyboards',
2429             })
2430
2431         return formats
2432
2433     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2434         urls = []
2435         subtitles = {}
2436         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2437             src = textstream.get('src')
2438             if not src or src in urls:
2439                 continue
2440             urls.append(src)
2441             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2442             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2443             subtitles.setdefault(lang, []).append({
2444                 'url': src,
2445                 'ext': ext,
2446             })
2447         return subtitles
2448
2449     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2450         res = self._download_xml_handle(
2451             xspf_url, playlist_id, 'Downloading xpsf playlist',
2452             'Unable to download xspf manifest', fatal=fatal)
2453         if res is False:
2454             return []
2455
2456         xspf, urlh = res
2457         xspf_url = urlh.geturl()
2458
2459         return self._parse_xspf(
2460             xspf, playlist_id, xspf_url=xspf_url,
2461             xspf_base_url=base_url(xspf_url))
2462
2463     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2464         NS_MAP = {
2465             'xspf': 'http://xspf.org/ns/0/',
2466             's1': 'http://static.streamone.nl/player/ns/0',
2467         }
2468
2469         entries = []
2470         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2471             title = xpath_text(
2472                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2473             description = xpath_text(
2474                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2475             thumbnail = xpath_text(
2476                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2477             duration = float_or_none(
2478                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2479
2480             formats = []
2481             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2482                 format_url = urljoin(xspf_base_url, location.text)
2483                 if not format_url:
2484                     continue
2485                 formats.append({
2486                     'url': format_url,
2487                     'manifest_url': xspf_url,
2488                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2489                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2490                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2491                 })
2492
2493             entries.append({
2494                 'id': playlist_id,
2495                 'title': title,
2496                 'description': description,
2497                 'thumbnail': thumbnail,
2498                 'duration': duration,
2499                 'formats': formats,
2500             })
2501         return entries
2502
2503     def _extract_mpd_formats(self, *args, **kwargs):
2504         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2505         if subs:
2506             self._report_ignoring_subs('DASH')
2507         return fmts
2508
2509     def _extract_mpd_formats_and_subtitles(
2510             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2511             fatal=True, data=None, headers={}, query={}):
2512
2513         if self.get_param('ignore_no_formats_error'):
2514             fatal = False
2515
2516         res = self._download_xml_handle(
2517             mpd_url, video_id,
2518             note='Downloading MPD manifest' if note is None else note,
2519             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2520             fatal=fatal, data=data, headers=headers, query=query)
2521         if res is False:
2522             return [], {}
2523         mpd_doc, urlh = res
2524         if mpd_doc is None:
2525             return [], {}
2526
2527         # We could have been redirected to a new url when we retrieved our mpd file.
2528         mpd_url = urlh.geturl()
2529         mpd_base_url = base_url(mpd_url)
2530
2531         return self._parse_mpd_formats_and_subtitles(
2532             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2533
2534     def _parse_mpd_formats(self, *args, **kwargs):
2535         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2536         if subs:
2537             self._report_ignoring_subs('DASH')
2538         return fmts
2539
2540     def _parse_mpd_formats_and_subtitles(
2541             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2542         """
2543         Parse formats from MPD manifest.
2544         References:
2545          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2546             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2547          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2548         """
2549         if not self.get_param('dynamic_mpd', True):
2550             if mpd_doc.get('type') == 'dynamic':
2551                 return [], {}
2552
2553         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2554
2555         def _add_ns(path):
2556             return self._xpath_ns(path, namespace)
2557
2558         def is_drm_protected(element):
2559             return element.find(_add_ns('ContentProtection')) is not None
2560
2561         def extract_multisegment_info(element, ms_parent_info):
2562             ms_info = ms_parent_info.copy()
2563
2564             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2565             # common attributes and elements.  We will only extract relevant
2566             # for us.
2567             def extract_common(source):
2568                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2569                 if segment_timeline is not None:
2570                     s_e = segment_timeline.findall(_add_ns('S'))
2571                     if s_e:
2572                         ms_info['total_number'] = 0
2573                         ms_info['s'] = []
2574                         for s in s_e:
2575                             r = int(s.get('r', 0))
2576                             ms_info['total_number'] += 1 + r
2577                             ms_info['s'].append({
2578                                 't': int(s.get('t', 0)),
2579                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2580                                 'd': int(s.attrib['d']),
2581                                 'r': r,
2582                             })
2583                 start_number = source.get('startNumber')
2584                 if start_number:
2585                     ms_info['start_number'] = int(start_number)
2586                 timescale = source.get('timescale')
2587                 if timescale:
2588                     ms_info['timescale'] = int(timescale)
2589                 segment_duration = source.get('duration')
2590                 if segment_duration:
2591                     ms_info['segment_duration'] = float(segment_duration)
2592
2593             def extract_Initialization(source):
2594                 initialization = source.find(_add_ns('Initialization'))
2595                 if initialization is not None:
2596                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2597
2598             segment_list = element.find(_add_ns('SegmentList'))
2599             if segment_list is not None:
2600                 extract_common(segment_list)
2601                 extract_Initialization(segment_list)
2602                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2603                 if segment_urls_e:
2604                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2605             else:
2606                 segment_template = element.find(_add_ns('SegmentTemplate'))
2607                 if segment_template is not None:
2608                     extract_common(segment_template)
2609                     media = segment_template.get('media')
2610                     if media:
2611                         ms_info['media'] = media
2612                     initialization = segment_template.get('initialization')
2613                     if initialization:
2614                         ms_info['initialization'] = initialization
2615                     else:
2616                         extract_Initialization(segment_template)
2617             return ms_info
2618
2619         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2620         formats, subtitles = [], {}
2621         stream_numbers = collections.defaultdict(int)
2622         for period in mpd_doc.findall(_add_ns('Period')):
2623             period_duration = parse_duration(period.get('duration')) or mpd_duration
2624             period_ms_info = extract_multisegment_info(period, {
2625                 'start_number': 1,
2626                 'timescale': 1,
2627             })
2628             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2629                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2630                 for representation in adaptation_set.findall(_add_ns('Representation')):
2631                     representation_attrib = adaptation_set.attrib.copy()
2632                     representation_attrib.update(representation.attrib)
2633                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2634                     mime_type = representation_attrib['mimeType']
2635                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2636
2637                     codec_str = representation_attrib.get('codecs', '')
2638                     # Some kind of binary subtitle found in some youtube livestreams
2639                     if mime_type == 'application/x-rawcc':
2640                         codecs = {'scodec': codec_str}
2641                     else:
2642                         codecs = parse_codecs(codec_str)
2643                     if content_type not in ('video', 'audio', 'text'):
2644                         if mime_type == 'image/jpeg':
2645                             content_type = mime_type
2646                         elif codecs.get('vcodec', 'none') != 'none':
2647                             content_type = 'video'
2648                         elif codecs.get('acodec', 'none') != 'none':
2649                             content_type = 'audio'
2650                         elif codecs.get('scodec', 'none') != 'none':
2651                             content_type = 'text'
2652                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2653                             content_type = 'text'
2654                         else:
2655                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2656                             continue
2657
2658                     base_url = ''
2659                     for element in (representation, adaptation_set, period, mpd_doc):
2660                         base_url_e = element.find(_add_ns('BaseURL'))
2661                         if try_call(lambda: base_url_e.text) is not None:
2662                             base_url = base_url_e.text + base_url
2663                             if re.match(r'^https?://', base_url):
2664                                 break
2665                     if mpd_base_url and base_url.startswith('/'):
2666                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2667                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2668                         if not mpd_base_url.endswith('/'):
2669                             mpd_base_url += '/'
2670                         base_url = mpd_base_url + base_url
2671                     representation_id = representation_attrib.get('id')
2672                     lang = representation_attrib.get('lang')
2673                     url_el = representation.find(_add_ns('BaseURL'))
2674                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2675                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2676                     if representation_id is not None:
2677                         format_id = representation_id
2678                     else:
2679                         format_id = content_type
2680                     if mpd_id:
2681                         format_id = mpd_id + '-' + format_id
2682                     if content_type in ('video', 'audio'):
2683                         f = {
2684                             'format_id': format_id,
2685                             'manifest_url': mpd_url,
2686                             'ext': mimetype2ext(mime_type),
2687                             'width': int_or_none(representation_attrib.get('width')),
2688                             'height': int_or_none(representation_attrib.get('height')),
2689                             'tbr': float_or_none(bandwidth, 1000),
2690                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2691                             'fps': int_or_none(representation_attrib.get('frameRate')),
2692                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2693                             'format_note': 'DASH %s' % content_type,
2694                             'filesize': filesize,
2695                             'container': mimetype2ext(mime_type) + '_dash',
2696                             **codecs
2697                         }
2698                     elif content_type == 'text':
2699                         f = {
2700                             'ext': mimetype2ext(mime_type),
2701                             'manifest_url': mpd_url,
2702                             'filesize': filesize,
2703                         }
2704                     elif content_type == 'image/jpeg':
2705                         # See test case in VikiIE
2706                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2707                         f = {
2708                             'format_id': format_id,
2709                             'ext': 'mhtml',
2710                             'manifest_url': mpd_url,
2711                             'format_note': 'DASH storyboards (jpeg)',
2712                             'acodec': 'none',
2713                             'vcodec': 'none',
2714                         }
2715                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2716                         f['has_drm'] = True
2717                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2718
2719                     def prepare_template(template_name, identifiers):
2720                         tmpl = representation_ms_info[template_name]
2721                         if representation_id is not None:
2722                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2723                         # First of, % characters outside $...$ templates
2724                         # must be escaped by doubling for proper processing
2725                         # by % operator string formatting used further (see
2726                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2727                         t = ''
2728                         in_template = False
2729                         for c in tmpl:
2730                             t += c
2731                             if c == '$':
2732                                 in_template = not in_template
2733                             elif c == '%' and not in_template:
2734                                 t += c
2735                         # Next, $...$ templates are translated to their
2736                         # %(...) counterparts to be used with % operator
2737                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2738                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2739                         t.replace('$$', '$')
2740                         return t
2741
2742                     # @initialization is a regular template like @media one
2743                     # so it should be handled just the same way (see
2744                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2745                     if 'initialization' in representation_ms_info:
2746                         initialization_template = prepare_template(
2747                             'initialization',
2748                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2749                             # $Time$ shall not be included for @initialization thus
2750                             # only $Bandwidth$ remains
2751                             ('Bandwidth', ))
2752                         representation_ms_info['initialization_url'] = initialization_template % {
2753                             'Bandwidth': bandwidth,
2754                         }
2755
2756                     def location_key(location):
2757                         return 'url' if re.match(r'^https?://', location) else 'path'
2758
2759                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2760
2761                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2762                         media_location_key = location_key(media_template)
2763
2764                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2765                         # can't be used at the same time
2766                         if '%(Number' in media_template and 's' not in representation_ms_info:
2767                             segment_duration = None
2768                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2769                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2770                                 representation_ms_info['total_number'] = int(math.ceil(
2771                                     float_or_none(period_duration, segment_duration, default=0)))
2772                             representation_ms_info['fragments'] = [{
2773                                 media_location_key: media_template % {
2774                                     'Number': segment_number,
2775                                     'Bandwidth': bandwidth,
2776                                 },
2777                                 'duration': segment_duration,
2778                             } for segment_number in range(
2779                                 representation_ms_info['start_number'],
2780                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2781                         else:
2782                             # $Number*$ or $Time$ in media template with S list available
2783                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2784                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2785                             representation_ms_info['fragments'] = []
2786                             segment_time = 0
2787                             segment_d = None
2788                             segment_number = representation_ms_info['start_number']
2789
2790                             def add_segment_url():
2791                                 segment_url = media_template % {
2792                                     'Time': segment_time,
2793                                     'Bandwidth': bandwidth,
2794                                     'Number': segment_number,
2795                                 }
2796                                 representation_ms_info['fragments'].append({
2797                                     media_location_key: segment_url,
2798                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2799                                 })
2800
2801                             for num, s in enumerate(representation_ms_info['s']):
2802                                 segment_time = s.get('t') or segment_time
2803                                 segment_d = s['d']
2804                                 add_segment_url()
2805                                 segment_number += 1
2806                                 for r in range(s.get('r', 0)):
2807                                     segment_time += segment_d
2808                                     add_segment_url()
2809                                     segment_number += 1
2810                                 segment_time += segment_d
2811                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2812                         # No media template,
2813                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2814                         # or any YouTube dashsegments video
2815                         fragments = []
2816                         segment_index = 0
2817                         timescale = representation_ms_info['timescale']
2818                         for s in representation_ms_info['s']:
2819                             duration = float_or_none(s['d'], timescale)
2820                             for r in range(s.get('r', 0) + 1):
2821                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2822                                 fragments.append({
2823                                     location_key(segment_uri): segment_uri,
2824                                     'duration': duration,
2825                                 })
2826                                 segment_index += 1
2827                         representation_ms_info['fragments'] = fragments
2828                     elif 'segment_urls' in representation_ms_info:
2829                         # Segment URLs with no SegmentTimeline
2830                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2831                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2832                         fragments = []
2833                         segment_duration = float_or_none(
2834                             representation_ms_info['segment_duration'],
2835                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2836                         for segment_url in representation_ms_info['segment_urls']:
2837                             fragment = {
2838                                 location_key(segment_url): segment_url,
2839                             }
2840                             if segment_duration:
2841                                 fragment['duration'] = segment_duration
2842                             fragments.append(fragment)
2843                         representation_ms_info['fragments'] = fragments
2844                     # If there is a fragments key available then we correctly recognized fragmented media.
2845                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2846                     # assumption is not necessarily correct since we may simply have no support for
2847                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2848                     if 'fragments' in representation_ms_info:
2849                         f.update({
2850                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2851                             'url': mpd_url or base_url,
2852                             'fragment_base_url': base_url,
2853                             'fragments': [],
2854                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2855                         })
2856                         if 'initialization_url' in representation_ms_info:
2857                             initialization_url = representation_ms_info['initialization_url']
2858                             if not f.get('url'):
2859                                 f['url'] = initialization_url
2860                             f['fragments'].append({location_key(initialization_url): initialization_url})
2861                         f['fragments'].extend(representation_ms_info['fragments'])
2862                         if not period_duration:
2863                             period_duration = try_get(
2864                                 representation_ms_info,
2865                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2866                     else:
2867                         # Assuming direct URL to unfragmented media.
2868                         f['url'] = base_url
2869                     if content_type in ('video', 'audio', 'image/jpeg'):
2870                         f['manifest_stream_number'] = stream_numbers[f['url']]
2871                         stream_numbers[f['url']] += 1
2872                         formats.append(f)
2873                     elif content_type == 'text':
2874                         subtitles.setdefault(lang or 'und', []).append(f)
2875
2876         return formats, subtitles
2877
2878     def _extract_ism_formats(self, *args, **kwargs):
2879         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2880         if subs:
2881             self._report_ignoring_subs('ISM')
2882         return fmts
2883
2884     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2885         if self.get_param('ignore_no_formats_error'):
2886             fatal = False
2887
2888         res = self._download_xml_handle(
2889             ism_url, video_id,
2890             note='Downloading ISM manifest' if note is None else note,
2891             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2892             fatal=fatal, data=data, headers=headers, query=query)
2893         if res is False:
2894             return [], {}
2895         ism_doc, urlh = res
2896         if ism_doc is None:
2897             return [], {}
2898
2899         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2900
2901     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2902         """
2903         Parse formats from ISM manifest.
2904         References:
2905          1. [MS-SSTR]: Smooth Streaming Protocol,
2906             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2907         """
2908         if ism_doc.get('IsLive') == 'TRUE':
2909             return [], {}
2910
2911         duration = int(ism_doc.attrib['Duration'])
2912         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2913
2914         formats = []
2915         subtitles = {}
2916         for stream in ism_doc.findall('StreamIndex'):
2917             stream_type = stream.get('Type')
2918             if stream_type not in ('video', 'audio', 'text'):
2919                 continue
2920             url_pattern = stream.attrib['Url']
2921             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2922             stream_name = stream.get('Name')
2923             stream_language = stream.get('Language', 'und')
2924             for track in stream.findall('QualityLevel'):
2925                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2926                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2927                 # TODO: add support for WVC1 and WMAP
2928                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2929                     self.report_warning('%s is not a supported codec' % fourcc)
2930                     continue
2931                 tbr = int(track.attrib['Bitrate']) // 1000
2932                 # [1] does not mention Width and Height attributes. However,
2933                 # they're often present while MaxWidth and MaxHeight are
2934                 # missing, so should be used as fallbacks
2935                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2936                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2937                 sampling_rate = int_or_none(track.get('SamplingRate'))
2938
2939                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2940                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2941
2942                 fragments = []
2943                 fragment_ctx = {
2944                     'time': 0,
2945                 }
2946                 stream_fragments = stream.findall('c')
2947                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2948                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2949                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2950                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2951                     if not fragment_ctx['duration']:
2952                         try:
2953                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2954                         except IndexError:
2955                             next_fragment_time = duration
2956                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2957                     for _ in range(fragment_repeat):
2958                         fragments.append({
2959                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2960                             'duration': fragment_ctx['duration'] / stream_timescale,
2961                         })
2962                         fragment_ctx['time'] += fragment_ctx['duration']
2963
2964                 if stream_type == 'text':
2965                     subtitles.setdefault(stream_language, []).append({
2966                         'ext': 'ismt',
2967                         'protocol': 'ism',
2968                         'url': ism_url,
2969                         'manifest_url': ism_url,
2970                         'fragments': fragments,
2971                         '_download_params': {
2972                             'stream_type': stream_type,
2973                             'duration': duration,
2974                             'timescale': stream_timescale,
2975                             'fourcc': fourcc,
2976                             'language': stream_language,
2977                             'codec_private_data': track.get('CodecPrivateData'),
2978                         }
2979                     })
2980                 elif stream_type in ('video', 'audio'):
2981                     formats.append({
2982                         'format_id': join_nonempty(ism_id, stream_name, tbr),
2983                         'url': ism_url,
2984                         'manifest_url': ism_url,
2985                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2986                         'width': width,
2987                         'height': height,
2988                         'tbr': tbr,
2989                         'asr': sampling_rate,
2990                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2991                         'acodec': 'none' if stream_type == 'video' else fourcc,
2992                         'protocol': 'ism',
2993                         'fragments': fragments,
2994                         'has_drm': ism_doc.find('Protection') is not None,
2995                         'language': stream_language,
2996                         'audio_channels': int_or_none(track.get('Channels')),
2997                         '_download_params': {
2998                             'stream_type': stream_type,
2999                             'duration': duration,
3000                             'timescale': stream_timescale,
3001                             'width': width or 0,
3002                             'height': height or 0,
3003                             'fourcc': fourcc,
3004                             'language': stream_language,
3005                             'codec_private_data': track.get('CodecPrivateData'),
3006                             'sampling_rate': sampling_rate,
3007                             'channels': int_or_none(track.get('Channels', 2)),
3008                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3009                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3010                         },
3011                     })
3012         return formats, subtitles
3013
3014     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3015         def absolute_url(item_url):
3016             return urljoin(base_url, item_url)
3017
3018         def parse_content_type(content_type):
3019             if not content_type:
3020                 return {}
3021             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3022             if ctr:
3023                 mimetype, codecs = ctr.groups()
3024                 f = parse_codecs(codecs)
3025                 f['ext'] = mimetype2ext(mimetype)
3026                 return f
3027             return {}
3028
3029         def _media_formats(src, cur_media_type, type_info=None):
3030             type_info = type_info or {}
3031             full_url = absolute_url(src)
3032             ext = type_info.get('ext') or determine_ext(full_url)
3033             if ext == 'm3u8':
3034                 is_plain_url = False
3035                 formats = self._extract_m3u8_formats(
3036                     full_url, video_id, ext='mp4',
3037                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3038                     preference=preference, quality=quality, fatal=False)
3039             elif ext == 'mpd':
3040                 is_plain_url = False
3041                 formats = self._extract_mpd_formats(
3042                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3043             else:
3044                 is_plain_url = True
3045                 formats = [{
3046                     'url': full_url,
3047                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3048                     'ext': ext,
3049                 }]
3050             return is_plain_url, formats
3051
3052         entries = []
3053         # amp-video and amp-audio are very similar to their HTML5 counterparts
3054         # so we will include them right here (see
3055         # https://www.ampproject.org/docs/reference/components/amp-video)
3056         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3057         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3058         media_tags = [(media_tag, media_tag_name, media_type, '')
3059                       for media_tag, media_tag_name, media_type
3060                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3061         media_tags.extend(re.findall(
3062             # We only allow video|audio followed by a whitespace or '>'.
3063             # Allowing more characters may end up in significant slow down (see
3064             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3065             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3066             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3067         for media_tag, _, media_type, media_content in media_tags:
3068             media_info = {
3069                 'formats': [],
3070                 'subtitles': {},
3071             }
3072             media_attributes = extract_attributes(media_tag)
3073             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3074             if src:
3075                 f = parse_content_type(media_attributes.get('type'))
3076                 _, formats = _media_formats(src, media_type, f)
3077                 media_info['formats'].extend(formats)
3078             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3079             if media_content:
3080                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3081                     s_attr = extract_attributes(source_tag)
3082                     # data-video-src and data-src are non standard but seen
3083                     # several times in the wild
3084                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3085                     if not src:
3086                         continue
3087                     f = parse_content_type(s_attr.get('type'))
3088                     is_plain_url, formats = _media_formats(src, media_type, f)
3089                     if is_plain_url:
3090                         # width, height, res, label and title attributes are
3091                         # all not standard but seen several times in the wild
3092                         labels = [
3093                             s_attr.get(lbl)
3094                             for lbl in ('label', 'title')
3095                             if str_or_none(s_attr.get(lbl))
3096                         ]
3097                         width = int_or_none(s_attr.get('width'))
3098                         height = (int_or_none(s_attr.get('height'))
3099                                   or int_or_none(s_attr.get('res')))
3100                         if not width or not height:
3101                             for lbl in labels:
3102                                 resolution = parse_resolution(lbl)
3103                                 if not resolution:
3104                                     continue
3105                                 width = width or resolution.get('width')
3106                                 height = height or resolution.get('height')
3107                         for lbl in labels:
3108                             tbr = parse_bitrate(lbl)
3109                             if tbr:
3110                                 break
3111                         else:
3112                             tbr = None
3113                         f.update({
3114                             'width': width,
3115                             'height': height,
3116                             'tbr': tbr,
3117                             'format_id': s_attr.get('label') or s_attr.get('title'),
3118                         })
3119                         f.update(formats[0])
3120                         media_info['formats'].append(f)
3121                     else:
3122                         media_info['formats'].extend(formats)
3123                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3124                     track_attributes = extract_attributes(track_tag)
3125                     kind = track_attributes.get('kind')
3126                     if not kind or kind in ('subtitles', 'captions'):
3127                         src = strip_or_none(track_attributes.get('src'))
3128                         if not src:
3129                             continue
3130                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3131                         media_info['subtitles'].setdefault(lang, []).append({
3132                             'url': absolute_url(src),
3133                         })
3134             for f in media_info['formats']:
3135                 f.setdefault('http_headers', {})['Referer'] = base_url
3136             if media_info['formats'] or media_info['subtitles']:
3137                 entries.append(media_info)
3138         return entries
3139
3140     def _extract_akamai_formats(self, *args, **kwargs):
3141         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3142         if subs:
3143             self._report_ignoring_subs('akamai')
3144         return fmts
3145
3146     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3147         signed = 'hdnea=' in manifest_url
3148         if not signed:
3149             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3150             manifest_url = re.sub(
3151                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3152                 '', manifest_url).strip('?')
3153
3154         formats = []
3155         subtitles = {}
3156
3157         hdcore_sign = 'hdcore=3.7.0'
3158         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3159         hds_host = hosts.get('hds')
3160         if hds_host:
3161             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3162         if 'hdcore=' not in f4m_url:
3163             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3164         f4m_formats = self._extract_f4m_formats(
3165             f4m_url, video_id, f4m_id='hds', fatal=False)
3166         for entry in f4m_formats:
3167             entry.update({'extra_param_to_segment_url': hdcore_sign})
3168         formats.extend(f4m_formats)
3169
3170         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3171         hls_host = hosts.get('hls')
3172         if hls_host:
3173             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3174         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3175             m3u8_url, video_id, 'mp4', 'm3u8_native',
3176             m3u8_id='hls', fatal=False)
3177         formats.extend(m3u8_formats)
3178         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3179
3180         http_host = hosts.get('http')
3181         if http_host and m3u8_formats and not signed:
3182             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3183             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3184             qualities_length = len(qualities)
3185             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3186                 i = 0
3187                 for f in m3u8_formats:
3188                     if f['vcodec'] != 'none':
3189                         for protocol in ('http', 'https'):
3190                             http_f = f.copy()
3191                             del http_f['manifest_url']
3192                             http_url = re.sub(
3193                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3194                             http_f.update({
3195                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3196                                 'url': http_url,
3197                                 'protocol': protocol,
3198                             })
3199                             formats.append(http_f)
3200                         i += 1
3201
3202         return formats, subtitles
3203
3204     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3205         query = urllib.parse.urlparse(url).query
3206         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3207         mobj = re.search(
3208             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3209         url_base = mobj.group('url')
3210         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3211         formats = []
3212
3213         def manifest_url(manifest):
3214             m_url = f'{http_base_url}/{manifest}'
3215             if query:
3216                 m_url += '?%s' % query
3217             return m_url
3218
3219         if 'm3u8' not in skip_protocols:
3220             formats.extend(self._extract_m3u8_formats(
3221                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3222                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3223         if 'f4m' not in skip_protocols:
3224             formats.extend(self._extract_f4m_formats(
3225                 manifest_url('manifest.f4m'),
3226                 video_id, f4m_id='hds', fatal=False))
3227         if 'dash' not in skip_protocols:
3228             formats.extend(self._extract_mpd_formats(
3229                 manifest_url('manifest.mpd'),
3230                 video_id, mpd_id='dash', fatal=False))
3231         if re.search(r'(?:/smil:|\.smil)', url_base):
3232             if 'smil' not in skip_protocols:
3233                 rtmp_formats = self._extract_smil_formats(
3234                     manifest_url('jwplayer.smil'),
3235                     video_id, fatal=False)
3236                 for rtmp_format in rtmp_formats:
3237                     rtsp_format = rtmp_format.copy()
3238                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3239                     del rtsp_format['play_path']
3240                     del rtsp_format['ext']
3241                     rtsp_format.update({
3242                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3243                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3244                         'protocol': 'rtsp',
3245                     })
3246                     formats.extend([rtmp_format, rtsp_format])
3247         else:
3248             for protocol in ('rtmp', 'rtsp'):
3249                 if protocol not in skip_protocols:
3250                     formats.append({
3251                         'url': f'{protocol}:{url_base}',
3252                         'format_id': protocol,
3253                         'protocol': protocol,
3254                     })
3255         return formats
3256
3257     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3258         mobj = re.search(
3259             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3260             webpage)
3261         if mobj:
3262             try:
3263                 jwplayer_data = self._parse_json(mobj.group('options'),
3264                                                  video_id=video_id,
3265                                                  transform_source=transform_source)
3266             except ExtractorError:
3267                 pass
3268             else:
3269                 if isinstance(jwplayer_data, dict):
3270                     return jwplayer_data
3271
3272     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3273         jwplayer_data = self._find_jwplayer_data(
3274             webpage, video_id, transform_source=js_to_json)
3275         return self._parse_jwplayer_data(
3276             jwplayer_data, video_id, *args, **kwargs)
3277
3278     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3279                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3280         entries = []
3281         if not isinstance(jwplayer_data, dict):
3282             return entries
3283
3284         playlist_items = jwplayer_data.get('playlist')
3285         # JWPlayer backward compatibility: single playlist item/flattened playlists
3286         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3287         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3288         if not isinstance(playlist_items, list):
3289             playlist_items = (playlist_items or jwplayer_data, )
3290
3291         for video_data in playlist_items:
3292             if not isinstance(video_data, dict):
3293                 continue
3294             # JWPlayer backward compatibility: flattened sources
3295             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3296             if 'sources' not in video_data:
3297                 video_data['sources'] = [video_data]
3298
3299             this_video_id = video_id or video_data['mediaid']
3300
3301             formats = self._parse_jwplayer_formats(
3302                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3303                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3304
3305             subtitles = {}
3306             tracks = video_data.get('tracks')
3307             if tracks and isinstance(tracks, list):
3308                 for track in tracks:
3309                     if not isinstance(track, dict):
3310                         continue
3311                     track_kind = track.get('kind')
3312                     if not track_kind or not isinstance(track_kind, str):
3313                         continue
3314                     if track_kind.lower() not in ('captions', 'subtitles'):
3315                         continue
3316                     track_url = urljoin(base_url, track.get('file'))
3317                     if not track_url:
3318                         continue
3319                     subtitles.setdefault(track.get('label') or 'en', []).append({
3320                         'url': self._proto_relative_url(track_url)
3321                     })
3322
3323             entry = {
3324                 'id': this_video_id,
3325                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3326                 'description': clean_html(video_data.get('description')),
3327                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3328                 'timestamp': int_or_none(video_data.get('pubdate')),
3329                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3330                 'subtitles': subtitles,
3331                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3332                 'genre': clean_html(video_data.get('genre')),
3333                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3334                 'season_number': int_or_none(video_data.get('season')),
3335                 'episode_number': int_or_none(video_data.get('episode')),
3336                 'release_year': int_or_none(video_data.get('releasedate')),
3337                 'age_limit': int_or_none(video_data.get('age_restriction')),
3338             }
3339             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3340             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3341                 entry.update({
3342                     '_type': 'url_transparent',
3343                     'url': formats[0]['url'],
3344                 })
3345             else:
3346                 entry['formats'] = formats
3347             entries.append(entry)
3348         if len(entries) == 1:
3349             return entries[0]
3350         else:
3351             return self.playlist_result(entries)
3352
3353     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3354                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3355         urls = set()
3356         formats = []
3357         for source in jwplayer_sources_data:
3358             if not isinstance(source, dict):
3359                 continue
3360             source_url = urljoin(
3361                 base_url, self._proto_relative_url(source.get('file')))
3362             if not source_url or source_url in urls:
3363                 continue
3364             urls.add(source_url)
3365             source_type = source.get('type') or ''
3366             ext = mimetype2ext(source_type) or determine_ext(source_url)
3367             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3368                 formats.extend(self._extract_m3u8_formats(
3369                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3370                     m3u8_id=m3u8_id, fatal=False))
3371             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3372                 formats.extend(self._extract_mpd_formats(
3373                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3374             elif ext == 'smil':
3375                 formats.extend(self._extract_smil_formats(
3376                     source_url, video_id, fatal=False))
3377             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3378             elif source_type.startswith('audio') or ext in (
3379                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3380                 formats.append({
3381                     'url': source_url,
3382                     'vcodec': 'none',
3383                     'ext': ext,
3384                 })
3385             else:
3386                 format_id = str_or_none(source.get('label'))
3387                 height = int_or_none(source.get('height'))
3388                 if height is None and format_id:
3389                     # Often no height is provided but there is a label in
3390                     # format like "1080p", "720p SD", or 1080.
3391                     height = parse_resolution(format_id).get('height')
3392                 a_format = {
3393                     'url': source_url,
3394                     'width': int_or_none(source.get('width')),
3395                     'height': height,
3396                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3397                     'filesize': int_or_none(source.get('filesize')),
3398                     'ext': ext,
3399                     'format_id': format_id
3400                 }
3401                 if source_url.startswith('rtmp'):
3402                     a_format['ext'] = 'flv'
3403                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3404                     # of jwplayer.flash.swf
3405                     rtmp_url_parts = re.split(
3406                         r'((?:mp4|mp3|flv):)', source_url, 1)
3407                     if len(rtmp_url_parts) == 3:
3408                         rtmp_url, prefix, play_path = rtmp_url_parts
3409                         a_format.update({
3410                             'url': rtmp_url,
3411                             'play_path': prefix + play_path,
3412                         })
3413                     if rtmp_params:
3414                         a_format.update(rtmp_params)
3415                 formats.append(a_format)
3416         return formats
3417
3418     def _live_title(self, name):
3419         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3420         return name
3421
3422     def _int(self, v, name, fatal=False, **kwargs):
3423         res = int_or_none(v, **kwargs)
3424         if res is None:
3425             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3426             if fatal:
3427                 raise ExtractorError(msg)
3428             else:
3429                 self.report_warning(msg)
3430         return res
3431
3432     def _float(self, v, name, fatal=False, **kwargs):
3433         res = float_or_none(v, **kwargs)
3434         if res is None:
3435             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3436             if fatal:
3437                 raise ExtractorError(msg)
3438             else:
3439                 self.report_warning(msg)
3440         return res
3441
3442     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3443                     path='/', secure=False, discard=False, rest={}, **kwargs):
3444         cookie = http.cookiejar.Cookie(
3445             0, name, value, port, port is not None, domain, True,
3446             domain.startswith('.'), path, True, secure, expire_time,
3447             discard, None, None, rest)
3448         self.cookiejar.set_cookie(cookie)
3449
3450     def _get_cookies(self, url):
3451         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3452         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3453
3454     def _apply_first_set_cookie_header(self, url_handle, cookie):
3455         """
3456         Apply first Set-Cookie header instead of the last. Experimental.
3457
3458         Some sites (e.g. [1-3]) may serve two cookies under the same name
3459         in Set-Cookie header and expect the first (old) one to be set rather
3460         than second (new). However, as of RFC6265 the newer one cookie
3461         should be set into cookie store what actually happens.
3462         We will workaround this issue by resetting the cookie to
3463         the first one manually.
3464         1. https://new.vk.com/
3465         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3466         3. https://learning.oreilly.com/
3467         """
3468         for header, cookies in url_handle.headers.items():
3469             if header.lower() != 'set-cookie':
3470                 continue
3471             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3472             cookie_value = re.search(
3473                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3474             if cookie_value:
3475                 value, domain = cookie_value.groups()
3476                 self._set_cookie(domain, cookie, value)
3477                 break
3478
3479     @classmethod
3480     def get_testcases(cls, include_onlymatching=False):
3481         # Do not look in super classes
3482         t = vars(cls).get('_TEST')
3483         if t:
3484             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3485             tests = [t]
3486         else:
3487             tests = vars(cls).get('_TESTS', [])
3488         for t in tests:
3489             if not include_onlymatching and t.get('only_matching', False):
3490                 continue
3491             t['name'] = cls.ie_key()
3492             yield t
3493         if getattr(cls, '__wrapped__', None):
3494             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3495
3496     @classmethod
3497     def get_webpage_testcases(cls):
3498         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3499         for t in tests:
3500             t['name'] = cls.ie_key()
3501             yield t
3502         if getattr(cls, '__wrapped__', None):
3503             yield from cls.__wrapped__.get_webpage_testcases()
3504
3505     @classproperty(cache=True)
3506     def age_limit(cls):
3507         """Get age limit from the testcases"""
3508         return max(traverse_obj(
3509             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3510             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3511
3512     @classproperty(cache=True)
3513     def _RETURN_TYPE(cls):
3514         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3515         tests = tuple(cls.get_testcases(include_onlymatching=False))
3516         if not tests:
3517             return None
3518         elif not any(k.startswith('playlist') for test in tests for k in test):
3519             return 'video'
3520         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3521             return 'playlist'
3522         return 'any'
3523
3524     @classmethod
3525     def is_single_video(cls, url):
3526         """Returns whether the URL is of a single video, None if unknown"""
3527         if cls.suitable(url):
3528             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3529
3530     @classmethod
3531     def is_suitable(cls, age_limit):
3532         """Test whether the extractor is generally suitable for the given age limit"""
3533         return not age_restricted(cls.age_limit, age_limit)
3534
3535     @classmethod
3536     def description(cls, *, markdown=True, search_examples=None):
3537         """Description of the extractor"""
3538         desc = ''
3539         if cls._NETRC_MACHINE:
3540             if markdown:
3541                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3542             else:
3543                 desc += f' [{cls._NETRC_MACHINE}]'
3544         if cls.IE_DESC is False:
3545             desc += ' [HIDDEN]'
3546         elif cls.IE_DESC:
3547             desc += f' {cls.IE_DESC}'
3548         if cls.SEARCH_KEY:
3549             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3550             if search_examples:
3551                 _COUNTS = ('', '5', '10', 'all')
3552                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3553         if not cls.working():
3554             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3555
3556         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3557         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3558         return f'{name}:{desc}' if desc else name
3559
3560     def extract_subtitles(self, *args, **kwargs):
3561         if (self.get_param('writesubtitles', False)
3562                 or self.get_param('listsubtitles')):
3563             return self._get_subtitles(*args, **kwargs)
3564         return {}
3565
3566     def _get_subtitles(self, *args, **kwargs):
3567         raise NotImplementedError('This method must be implemented by subclasses')
3568
3569     class CommentsDisabled(Exception):
3570         """Raise in _get_comments if comments are disabled for the video"""
3571
3572     def extract_comments(self, *args, **kwargs):
3573         if not self.get_param('getcomments'):
3574             return None
3575         generator = self._get_comments(*args, **kwargs)
3576
3577         def extractor():
3578             comments = []
3579             interrupted = True
3580             try:
3581                 while True:
3582                     comments.append(next(generator))
3583             except StopIteration:
3584                 interrupted = False
3585             except KeyboardInterrupt:
3586                 self.to_screen('Interrupted by user')
3587             except self.CommentsDisabled:
3588                 return {'comments': None, 'comment_count': None}
3589             except Exception as e:
3590                 if self.get_param('ignoreerrors') is not True:
3591                     raise
3592                 self._downloader.report_error(e)
3593             comment_count = len(comments)
3594             self.to_screen(f'Extracted {comment_count} comments')
3595             return {
3596                 'comments': comments,
3597                 'comment_count': None if interrupted else comment_count
3598             }
3599         return extractor
3600
3601     def _get_comments(self, *args, **kwargs):
3602         raise NotImplementedError('This method must be implemented by subclasses')
3603
3604     @staticmethod
3605     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3606         """ Merge subtitle items for one language. Items with duplicated URLs/data
3607         will be dropped. """
3608         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3609         ret = list(subtitle_list1)
3610         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3611         return ret
3612
3613     @classmethod
3614     def _merge_subtitles(cls, *dicts, target=None):
3615         """ Merge subtitle dictionaries, language by language. """
3616         if target is None:
3617             target = {}
3618         for d in dicts:
3619             for lang, subs in d.items():
3620                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3621         return target
3622
3623     def extract_automatic_captions(self, *args, **kwargs):
3624         if (self.get_param('writeautomaticsub', False)
3625                 or self.get_param('listsubtitles')):
3626             return self._get_automatic_captions(*args, **kwargs)
3627         return {}
3628
3629     def _get_automatic_captions(self, *args, **kwargs):
3630         raise NotImplementedError('This method must be implemented by subclasses')
3631
3632     @functools.cached_property
3633     def _cookies_passed(self):
3634         """Whether cookies have been passed to YoutubeDL"""
3635         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3636
3637     def mark_watched(self, *args, **kwargs):
3638         if not self.get_param('mark_watched', False):
3639             return
3640         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3641             self._mark_watched(*args, **kwargs)
3642
3643     def _mark_watched(self, *args, **kwargs):
3644         raise NotImplementedError('This method must be implemented by subclasses')
3645
3646     def geo_verification_headers(self):
3647         headers = {}
3648         geo_verification_proxy = self.get_param('geo_verification_proxy')
3649         if geo_verification_proxy:
3650             headers['Ytdl-request-proxy'] = geo_verification_proxy
3651         return headers
3652
3653     @staticmethod
3654     def _generic_id(url):
3655         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3656
3657     def _generic_title(self, url='', webpage='', *, default=None):
3658         return (self._og_search_title(webpage, default=None)
3659                 or self._html_extract_title(webpage, default=None)
3660                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3661                 or default)
3662
3663     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3664         if not duration:
3665             return
3666         chapter_list = [{
3667             'start_time': start_function(chapter),
3668             'title': title_function(chapter),
3669         } for chapter in chapter_list or []]
3670         if strict:
3671             warn = self.report_warning
3672         else:
3673             warn = self.write_debug
3674             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3675
3676         chapters = [{'start_time': 0}]
3677         for idx, chapter in enumerate(chapter_list):
3678             if chapter['start_time'] is None:
3679                 warn(f'Incomplete chapter {idx}')
3680             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3681                 chapters.append(chapter)
3682             elif chapter not in chapters:
3683                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3684                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3685                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3686         return chapters[1:]
3687
3688     def _extract_chapters_from_description(self, description, duration):
3689         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3690         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3691         return self._extract_chapters_helper(
3692             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3693             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3694             duration=duration, strict=False) or self._extract_chapters_helper(
3695             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3696             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3697             duration=duration, strict=False)
3698
3699     @staticmethod
3700     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3701         all_known = all(map(
3702             lambda x: x is not None,
3703             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3704         return (
3705             'private' if is_private
3706             else 'premium_only' if needs_premium
3707             else 'subscriber_only' if needs_subscription
3708             else 'needs_auth' if needs_auth
3709             else 'unlisted' if is_unlisted
3710             else 'public' if all_known
3711             else None)
3712
3713     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3714         '''
3715         @returns            A list of values for the extractor argument given by "key"
3716                             or "default" if no such key is present
3717         @param default      The default value to return when the key is not present (default: [])
3718         @param casesense    When false, the values are converted to lower case
3719         '''
3720         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3721         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3722         if val is None:
3723             return [] if default is NO_DEFAULT else default
3724         return list(val) if casesense else [x.lower() for x in val]
3725
3726     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3727         if not playlist_id or not video_id:
3728             return not video_id
3729
3730         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3731         if no_playlist is not None:
3732             return not no_playlist
3733
3734         video_id = '' if video_id is True else f' {video_id}'
3735         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3736         if self.get_param('noplaylist'):
3737             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3738             return False
3739         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3740         return True
3741
3742     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3743         RetryManager.report_retry(
3744             err, _count or int(fatal), _retries,
3745             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3746             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3747
3748     def RetryManager(self, **kwargs):
3749         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3750
3751     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3752         display_id = traverse_obj(info_dict, 'display_id', 'id')
3753         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3754         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3755             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3756
3757     @classmethod
3758     def extract_from_webpage(cls, ydl, url, webpage):
3759         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3760               else ydl.get_info_extractor(cls.ie_key()))
3761         for info in ie._extract_from_webpage(url, webpage) or []:
3762             # url = None since we do not want to set (webpage/original)_url
3763             ydl.add_default_extra_info(info, ie, None)
3764             yield info
3765
3766     @classmethod
3767     def _extract_from_webpage(cls, url, webpage):
3768         for embed_url in orderedSet(
3769                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3770             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3771
3772     @classmethod
3773     def _extract_embed_urls(cls, url, webpage):
3774         """@returns all the embed urls on the webpage"""
3775         if '_EMBED_URL_RE' not in cls.__dict__:
3776             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3777             for idx, regex in enumerate(cls._EMBED_REGEX):
3778                 assert regex.count('(?P<url>') == 1, \
3779                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3780             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3781
3782         for regex in cls._EMBED_URL_RE:
3783             for mobj in regex.finditer(webpage):
3784                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3785                 if cls._VALID_URL is False or cls.suitable(embed_url):
3786                     yield embed_url
3787
3788     class StopExtraction(Exception):
3789         pass
3790
3791     @classmethod
3792     def _extract_url(cls, webpage):  # TODO: Remove
3793         """Only for compatibility with some older extractors"""
3794         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3795
3796     @classmethod
3797     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3798         if plugin_name:
3799             mro = inspect.getmro(cls)
3800             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3801             cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3802             cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3803             while getattr(super_class, '__wrapped__', None):
3804                 super_class = super_class.__wrapped__
3805             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3806             _PLUGIN_OVERRIDES[super_class].append(cls)
3807
3808         return super().__init_subclass__(**kwargs)
3809
3810
3811 class SearchInfoExtractor(InfoExtractor):
3812     """
3813     Base class for paged search queries extractors.
3814     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3815     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3816     """
3817
3818     _MAX_RESULTS = float('inf')
3819     _RETURN_TYPE = 'playlist'
3820
3821     @classproperty
3822     def _VALID_URL(cls):
3823         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3824
3825     def _real_extract(self, query):
3826         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3827         if prefix == '':
3828             return self._get_n_results(query, 1)
3829         elif prefix == 'all':
3830             return self._get_n_results(query, self._MAX_RESULTS)
3831         else:
3832             n = int(prefix)
3833             if n <= 0:
3834                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3835             elif n > self._MAX_RESULTS:
3836                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3837                 n = self._MAX_RESULTS
3838             return self._get_n_results(query, n)
3839
3840     def _get_n_results(self, query, n):
3841         """Get a specified number of results for a query.
3842         Either this function or _search_results must be overridden by subclasses """
3843         return self.playlist_result(
3844             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3845             query, query)
3846
3847     def _search_results(self, query):
3848         """Returns an iterator of search results"""
3849         raise NotImplementedError('This method must be implemented by subclasses')
3850
3851     @classproperty
3852     def SEARCH_KEY(cls):
3853         return cls._SEARCH_KEY
3854
3855
3856 class UnsupportedURLIE(InfoExtractor):
3857     _VALID_URL = '.*'
3858     _ENABLED = False
3859     IE_DESC = False
3860
3861     def _real_extract(self, url):
3862         raise UnsupportedError(url)
3863
3864
3865 _PLUGIN_OVERRIDES = collections.defaultdict(list)