yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader.f4m import get_base_url, remove_encrypted_media
  27 from ..utils import (
  28     IDENTITY,
  29     JSON_LD_RE,
  30     NO_DEFAULT,
  31     ExtractorError,
  32     FormatSorter,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     HEADRequest,
  36     LenientJSONDecoder,
  37     RegexNotFoundError,
  38     RetryManager,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     classproperty,
  44     clean_html,
  45     deprecation_warning,
  46     determine_ext,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     filter_dict,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     int_or_none,
  56     join_nonempty,
  57     js_to_json,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     sanitize_filename,
  68     sanitize_url,
  69     sanitized_Request,
  70     smuggle_url,
  71     str_or_none,
  72     str_to_int,
  73     strip_or_none,
  74     traverse_obj,
  75     truncate_string,
  76     try_call,
  77     try_get,
  78     unescapeHTML,
  79     unified_strdate,
  80     unified_timestamp,
  81     update_Request,
  82     update_url_query,
  83     url_basename,
  84     url_or_none,
  85     urlhandle_detect_ext,
  86     urljoin,
  87     variadic,
  88     xpath_element,
  89     xpath_text,
  90     xpath_with_ns,
  91 )
  92
  93
  94 class InfoExtractor:
  95     """Information Extractor class.
  96
  97     Information extractors are the classes that, given a URL, extract
  98     information about the video (or videos) the URL refers to. This
  99     information includes the real video URL, the video title, author and
 100     others. The information is stored in a dictionary which is then
 101     passed to the YoutubeDL. The YoutubeDL processes this
 102     information possibly downloading the video to the file system, among
 103     other possible outcomes.
 104
 105     The type field determines the type of the result.
 106     By far the most common value (and the default if _type is missing) is
 107     "video", which indicates a single video.
 108
 109     For a video, the dictionaries must include the following fields:
 110
 111     id:             Video identifier.
 112     title:          Video title, unescaped. Set to an empty string if video has
 113                     no title as opposed to "None" which signifies that the
 114                     extractor failed to obtain a title
 115
 116     Additionally, it must contain either a formats entry or a url one:
 117
 118     formats:        A list of dictionaries for each format available, ordered
 119                     from worst to best quality.
 120
 121                     Potential fields:
 122                     * url        The mandatory URL representing the media:
 123                                    for plain file media - HTTP URL of this file,
 124                                    for RTMP - RTMP URL,
 125                                    for HLS - URL of the M3U8 media playlist,
 126                                    for HDS - URL of the F4M manifest,
 127                                    for DASH
 128                                      - HTTP URL to plain file media (in case of
 129                                        unfragmented media)
 130                                      - URL of the MPD manifest or base URL
 131                                        representing the media if MPD manifest
 132                                        is parsed from a string (in case of
 133                                        fragmented media)
 134                                    for MSS - URL of the ISM manifest.
 135                     * request_data  Data to send in POST request to the URL
 136                     * manifest_url
 137                                  The URL of the manifest file in case of
 138                                  fragmented media:
 139                                    for HLS - URL of the M3U8 master playlist,
 140                                    for HDS - URL of the F4M manifest,
 141                                    for DASH - URL of the MPD manifest,
 142                                    for MSS - URL of the ISM manifest.
 143                     * manifest_stream_number  (For internal use only)
 144                                  The index of the stream in the manifest file
 145                     * ext        Will be calculated from URL if missing
 146                     * format     A human-readable description of the format
 147                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
 149                                  and format_note fields if missing.
 150                     * format_id  A short description of the format
 151                                  ("mp4_h264_opus" or "19").
 152                                 Technically optional, but strongly recommended.
 153                     * format_note Additional info about the format
 154                                  ("3D" or "DASH video")
 155                     * width      Width of the video, if known
 156                     * height     Height of the video, if known
 157                     * aspect_ratio  Aspect ratio of the video, if known
 158                                  Automatically calculated from width and height
 159                     * resolution Textual description of width and height
 160                                  Automatically calculated from width and height
 161                     * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 163                     * tbr        Average bitrate of audio and video in KBit/s
 164                     * abr        Average audio bitrate in KBit/s
 165                     * acodec     Name of the audio codec in use
 166                     * asr        Audio sampling rate in Hertz
 167                     * audio_channels  Number of audio channels
 168                     * vbr        Average video bitrate in KBit/s
 169                     * fps        Frame rate
 170                     * vcodec     Name of the video codec in use
 171                     * container  Name of the container format
 172                     * filesize   The number of bytes, if known in advance
 173                     * filesize_approx  An estimate for the number of bytes
 174                     * player_url SWF Player URL (used for rtmpdump).
 175                     * protocol   The protocol that will be used for the actual
 176                                  download, lower-case. One of "http", "https" or
 177                                  one of the protocols defined in downloader.PROTOCOL_MAP
 178                     * fragment_base_url
 179                                  Base URL for fragments. Each fragment's path
 180                                  value (if present) will be relative to
 181                                  this URL.
 182                     * fragments  A list of fragments of a fragmented media.
 183                                  Each fragment entry must contain either an url
 184                                  or a path. If an url is present it should be
 185                                  considered by a client. Otherwise both path and
 186                                  fragment_base_url must be present. Here is
 187                                  the list of all potential fields:
 188                                  * "url" - fragment's URL
 189                                  * "path" - fragment's path relative to
 190                                             fragment_base_url
 191                                  * "duration" (optional, int or float)
 192                                  * "filesize" (optional, int)
 193                     * is_from_start  Is a live format that can be downloaded
 194                                 from the start. Boolean
 195                     * preference Order number of this format. If this field is
 196                                  present and not None, the formats get sorted
 197                                  by this field, regardless of all other values.
 198                                  -1 for default (order by other properties),
 199                                  -2 or smaller for less than default.
 200                                  < -1000 to hide the format (if there is
 201                                     another one which is strictly better)
 202                     * language   Language code, e.g. "de" or "en-US".
 203                     * language_preference  Is this in the language mentioned in
 204                                  the URL?
 205                                  10 if it's what the URL is about,
 206                                  -1 for default (don't know),
 207                                  -10 otherwise, other values reserved for now.
 208                     * quality    Order number of the video quality of this
 209                                  format, irrespective of the file format.
 210                                  -1 for default (order by other properties),
 211                                  -2 or smaller for less than default.
 212                     * source_preference  Order number for this video source
 213                                   (quality takes higher priority)
 214                                  -1 for default (order by other properties),
 215                                  -2 or smaller for less than default.
 216                     * http_headers  A dictionary of additional HTTP headers
 217                                  to add to the request.
 218                     * stretched_ratio  If given and not 1, indicates that the
 219                                  video's pixels are not square.
 220                                  width : height ratio as float.
 221                     * no_resume  The server does not support resuming the
 222                                  (HTTP or RTMP) download. Boolean.
 223                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 224                     * extra_param_to_segment_url  A query string to append to each
 225                                  fragment's URL, or to update each existing query string
 226                                  with. Only applied by the native HLS/DASH downloaders.
 227                     * hls_aes    A dictionary of HLS AES-128 decryption information
 228                                  used by the native HLS downloader to override the
 229                                  values in the media playlist when an '#EXT-X-KEY' tag
 230                                  is present in the playlist:
 231                                  * uri  The URI from which the key will be downloaded
 232                                  * key  The key (as hex) used to decrypt fragments.
 233                                         If `key` is given, any key URI will be ignored
 234                                  * iv   The IV (as hex) used to decrypt fragments
 235                     * downloader_options  A dictionary of downloader options
 236                                  (For internal use only)
 237                                  * http_chunk_size Chunk size for HTTP downloads
 238                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 239                     RTMP formats can also have the additional fields: page_url,
 240                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 241                     rtmp_protocol, rtmp_real_time
 242
 243     url:            Final video URL.
 244     ext:            Video filename extension.
 245     format:         The video format, defaults to ext (used for --get-format)
 246     player_url:     SWF Player URL (used for rtmpdump).
 247
 248     The following fields are optional:
 249
 250     direct:         True if a direct video file was given (must only be set by GenericIE)
 251     alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
 253                     unique, but available before title. Typically, id is
 254                     something like "4234987", title "Dancing naked mole rats",
 255                     and display_id "dancing-naked-mole-rats"
 256     thumbnails:     A list of dictionaries, with the following entries:
 257                         * "id" (optional, string) - Thumbnail format ID
 258                         * "url"
 259                         * "preference" (optional, int) - quality of the image
 260                         * "width" (optional, int)
 261                         * "height" (optional, int)
 262                         * "resolution" (optional, string "{width}x{height}",
 263                                         deprecated)
 264                         * "filesize" (optional, int)
 265                         * "http_headers" (dict) - HTTP headers for the request
 266     thumbnail:      Full URL to a video thumbnail image.
 267     description:    Full video description.
 268     uploader:       Full name of the video uploader.
 269     license:        License name the video is licensed under.
 270     creator:        The creator of the video.
 271     timestamp:      UNIX timestamp of the moment the video was uploaded
 272     upload_date:    Video upload date in UTC (YYYYMMDD).
 273                     If not explicitly set, calculated from timestamp
 274     release_timestamp: UNIX timestamp of the moment the video was released.
 275                     If it is not clear whether to use timestamp or this, use the former
 276     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 277                     If not explicitly set, calculated from release_timestamp
 278     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 279     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 280                     If not explicitly set, calculated from modified_timestamp
 281     uploader_id:    Nickname or id of the video uploader.
 282     uploader_url:   Full URL to a personal webpage of the video uploader.
 283     channel:        Full name of the channel the video is uploaded on.
 284                     Note that channel fields may or may not repeat uploader
 285                     fields. This depends on a particular extractor.
 286     channel_id:     Id of the channel.
 287     channel_url:    Full URL to a channel webpage.
 288     channel_follower_count: Number of followers of the channel.
 289     channel_is_verified: Whether the channel is verified on the platform.
 290     location:       Physical location where the video was filmed.
 291     subtitles:      The available subtitles as a dictionary in the format
 292                     {tag: subformats}. "tag" is usually a language code, and
 293                     "subformats" is a list sorted from lower to higher
 294                     preference, each element is a dictionary with the "ext"
 295                     entry and one of:
 296                         * "data": The subtitles file contents
 297                         * "url": A URL pointing to the subtitles file
 298                     It can optionally also have:
 299                         * "name": Name or description of the subtitles
 300                         * "http_headers": A dictionary of additional HTTP headers
 301                                   to add to the request.
 302                     "ext" will be calculated from URL if missing
 303     automatic_captions: Like 'subtitles'; contains automatically generated
 304                     captions instead of normal subtitles
 305     duration:       Length of the video in seconds, as an integer or float.
 306     view_count:     How many users have watched the video on the platform.
 307     concurrent_view_count: How many users are currently watching the video on the platform.
 308     like_count:     Number of positive ratings of the video
 309     dislike_count:  Number of negative ratings of the video
 310     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 312     comment_count:  Number of comments on the video
 313     comments:       A list of comments, each with one or more of the following
 314                     properties (all but one of text or html optional):
 315                         * "author" - human-readable name of the comment author
 316                         * "author_id" - user ID of the comment author
 317                         * "author_thumbnail" - The thumbnail of the comment author
 318                         * "author_url" - The url to the comment author's page
 319                         * "author_is_verified" - Whether the author is verified
 320                                                  on the platform
 321                         * "author_is_uploader" - Whether the comment is made by
 322                                                  the video uploader
 323                         * "id" - Comment ID
 324                         * "html" - Comment as HTML
 325                         * "text" - Plain text of the comment
 326                         * "timestamp" - UNIX timestamp of comment
 327                         * "parent" - ID of the comment this one is replying to.
 328                                      Set to "root" to indicate that this is a
 329                                      comment to the original video.
 330                         * "like_count" - Number of positive ratings of the comment
 331                         * "dislike_count" - Number of negative ratings of the comment
 332                         * "is_favorited" - Whether the comment is marked as
 333                                            favorite by the video uploader
 334                         * "is_pinned" - Whether the comment is pinned to
 335                                         the top of the comments
 336     age_limit:      Age restriction for the video, as an integer (years)
 337     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 338                     should allow to get the same result again. (It will be set
 339                     by YoutubeDL if it's missing)
 340     categories:     A list of categories that the video falls in, for example
 341                     ["Sports", "Berlin"]
 342     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 343     cast:           A list of the video cast
 344     is_live:        True, False, or None (=unknown). Whether this video is a
 345                     live stream that goes on instead of a fixed-length video.
 346     was_live:       True, False, or None (=unknown). Whether this video was
 347                     originally a live stream.
 348     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 349                     or 'post_live' (was live, but VOD is not yet processed)
 350                     If absent, automatically set from is_live, was_live
 351     start_time:     Time in seconds where the reproduction should start, as
 352                     specified in the URL.
 353     end_time:       Time in seconds where the reproduction should end, as
 354                     specified in the URL.
 355     chapters:       A list of dictionaries, with the following entries:
 356                         * "start_time" - The start time of the chapter in seconds
 357                         * "end_time" - The end time of the chapter in seconds
 358                         * "title" (optional, string)
 359     heatmap:        A list of dictionaries, with the following entries:
 360                         * "start_time" - The start time of the data point in seconds
 361                         * "end_time" - The end time of the data point in seconds
 362                         * "value" - The normalized value of the data point (float between 0 and 1)
 363     playable_in_embed: Whether this video is allowed to play in embedded
 364                     players on other sites. Can be True (=always allowed),
 365                     False (=never allowed), None (=unknown), or a string
 366                     specifying the criteria for embedability; e.g. 'whitelist'
 367     availability:   Under what condition the video is available. One of
 368                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 369                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 370                     to set it
 371     _old_archive_ids: A list of old archive ids needed for backward compatibility
 372     _format_sort_fields: A list of fields to use for sorting formats
 373     __post_extractor: A function to be called just before the metadata is
 374                     written to either disk, logger or console. The function
 375                     must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
 377                     time-consuming to extract. Note that the fields thus
 378                     extracted will not be available to output template and
 379                     match_filter. So, only "comments" and "comment_count" are
 380                     currently allowed to be extracted via this method.
 381
 382     The following fields should only be used when the video belongs to some logical
 383     chapter or section:
 384
 385     chapter:        Name or title of the chapter the video belongs to.
 386     chapter_number: Number of the chapter the video belongs to, as an integer.
 387     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 388
 389     The following fields should only be used when the video is an episode of some
 390     series, programme or podcast:
 391
 392     series:         Title of the series or programme the video episode belongs to.
 393     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 394     season:         Title of the season the video episode belongs to.
 395     season_number:  Number of the season the video episode belongs to, as an integer.
 396     season_id:      Id of the season the video episode belongs to, as a unicode string.
 397     episode:        Title of the video episode. Unlike mandatory video title field,
 398                     this field should denote the exact title of the video episode
 399                     without any kind of decoration.
 400     episode_number: Number of the video episode within a season, as an integer.
 401     episode_id:     Id of the video episode, as a unicode string.
 402
 403     The following fields should only be used when the media is a track or a part of
 404     a music album:
 405
 406     track:          Title of the track.
 407     track_number:   Number of the track within an album or a disc, as an integer.
 408     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 409                     as a unicode string.
 410     artist:         Artist(s) of the track.
 411     genre:          Genre(s) of the track.
 412     album:          Title of the album the track belongs to.
 413     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
 415                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 416                     and compilations).
 417     disc_number:    Number of the disc or other physical medium the track belongs to,
 418                     as an integer.
 419     release_year:   Year (YYYY) when the album was released.
 420     composer:       Composer of the piece
 421
 422     The following fields should only be set for clips that should be cut from the original video:
 423
 424     section_start:  Start time of the section in seconds
 425     section_end:    End time of the section in seconds
 426
 427     The following fields should only be set for storyboards:
 428     rows:           Number of rows in each storyboard fragment, as an integer
 429     columns:        Number of columns in each storyboard fragment, as an integer
 430
 431     Unless mentioned otherwise, the fields should be Unicode strings.
 432
 433     Unless mentioned otherwise, None is equivalent to absence of information.
 434
 435
 436     _type "playlist" indicates multiple videos.
 437     There must be a key "entries", which is a list, an iterable, or a PagedList
 438     object, each element of which is a valid dictionary by this specification.
 439
 440     Additionally, playlists can have "id", "title", and any other relevant
 441     attributes with the same semantics as videos (see above).
 442
 443     It can also have the following optional fields:
 444
 445     playlist_count: The total number of videos in a playlist. If not given,
 446                     YoutubeDL tries to calculate it from "entries"
 447
 448
 449     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 451     It must have an entries key like a playlist and contain all the keys
 452     required for a video at the same time.
 453
 454
 455     _type "url" indicates that the video must be extracted from another
 456     location, possibly by a different extractor. Its only required key is:
 457     "url" - the next URL to extract.
 458     The key "ie_key" can be set to the class name (minus the trailing "IE",
 459     e.g. "Youtube") if the extractor class is known in advance.
 460     Additionally, the dictionary may have any properties of the resolved entity
 461     known in advance, for example "title" if the title of the referred video is
 462     known ahead of time.
 463
 464
 465     _type "url_transparent" entities have the same specification as "url", but
 466     indicate that the given additional information is more precise than the one
 467     associated with the resolved URL.
 468     This is useful when a site employs a video service that hosts the video and
 469     its technical metadata, but that video service does not embed a useful
 470     title, description etc.
 471
 472
 473     Subclasses of this should also be added to the list of extractors and
    should define a _VALID_URL regexp, and re-define the _real_extract() and
 475     (optionally) _real_initialize() methods.
 476
 477     Subclasses may also override suitable() if necessary, but ensure the function
 478     signature is preserved and that this function imports everything it needs
 479     (except other extractors), so that lazy_extractors works correctly.
 480
 481     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 482     the HTML of Generic webpages. It may also override _extract_embed_urls
 483     or _extract_from_webpage as necessary. While these are normally classmethods,
 484     _extract_from_webpage is allowed to be an instance method.
 485
 486     _extract_from_webpage may raise self.StopExtraction() to stop further
 487     processing of the webpage and obtain exclusive rights to it. This is useful
 488     when the extractor cannot reliably be matched using just the URL,
 489     e.g. invidious/peertube instances
 490
 491     Embed-only extractors can be defined by setting _VALID_URL = False.
 492
 493     To support username + password (or netrc) login, the extractor must define a
 494     _NETRC_MACHINE and re-define _perform_login(username, password) and
 495     (optionally) _initialize_pre_login() methods. The _perform_login method will
 496     be called between _initialize_pre_login and _real_initialize if credentials
 497     are passed by the user. In cases where it is necessary to have the login
 498     process as part of the extraction rather than initialization, _perform_login
 499     can be left undefined.
 500
 501     _GEO_BYPASS attribute may be set to False in order to disable
 502     geo restriction bypass mechanisms for a particular extractor.
 503     Though it won't disable explicit geo restriction bypass based on
 504     country code provided with geo_bypass_country.
 505
 506     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 507     countries for this extractor. One of these countries will be used by
 508     geo restriction bypass mechanism right away in order to bypass
 509     geo restriction, of course, if the mechanism is not disabled.
 510
 511     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 512     IP blocks in CIDR notation for this extractor. One of these IP blocks
 513     will be used by geo restriction bypass mechanism similarly
 514     to _GEO_COUNTRIES.
 515
 516     The _ENABLED attribute should be set to False for IEs that
 517     are disabled by default and must be explicitly enabled.
 518
 519     The _WORKING attribute should be set to False for broken IEs
 520     in order to warn the users and skip the tests.
 521     """
 522
    # Class-level defaults; subclasses override the upper-case ones as needed.

    # Checked by initialize(): the pre-login/login steps are only run while
    # this is falsy. Also reset to False for every new instance in __init__
    _ready = False
    # NOTE(review): presumably holds the YoutubeDL instance handed to
    # set_downloader() - confirm against that method
    _downloader = None
    # Reset to None for every new instance in __init__.
    # NOTE(review): presumably the faked source IP used by the geo
    # restriction bypass - confirm
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms for an
    # extractor (an explicit geo_bypass_country is still honoured)
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries; passed to
    # _initialize_geo_bypass() as 'countries' by initialize()
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation;
    # passed to _initialize_geo_bypass() as 'ip_blocks' by initialize()
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
    # Set to False for IEs that are disabled by default and must be
    # explicitly enabled
    _ENABLED = True
    # Must be defined to support username + password (or netrc) login;
    # supports_login() is simply the truthiness of this value
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the IE - confirm
    IE_DESC = None
    # NOTE(review): presumably only meaningful for search extractors - confirm
    SEARCH_KEY = None
    # Regexp matched against URLs by _match_valid_url(); False marks an
    # embed-only extractor that never matches a URL directly
    _VALID_URL = None
    # Regexes that will be searched for in the HTML of Generic webpages
    _EMBED_REGEX = []
 536
 537     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 538         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 539         return {
 540             None: '',
 541             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 542             'password': f'Use {password_hint}',
 543             'cookies': (
 544                 'Use --cookies-from-browser or --cookies for the authentication. '
 545                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 546         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 547
 548     def __init__(self, downloader=None):
 549         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 550         If a downloader is not passed during initialization,
 551         it must be set using "set_downloader()" before "extract()" is called"""
 552         self._ready = False
 553         self._x_forwarded_for_ip = None
 554         self._printed_messages = set()
 555         self.set_downloader(downloader)
 556
 557     @classmethod
 558     def _match_valid_url(cls, url):
 559         if cls._VALID_URL is False:
 560             return None
 561         # This does not use has/getattr intentionally - we want to know whether
 562         # we have cached the regexp for *this* class, whereas getattr would also
 563         # match the superclass
 564         if '_VALID_URL_RE' not in cls.__dict__:
 565             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 566         return cls._VALID_URL_RE.match(url)
 567
 568     @classmethod
 569     def suitable(cls, url):
 570         """Receives a URL and returns True if suitable for this IE."""
 571         # This function must import everything it needs (except other extractors),
 572         # so that lazy_extractors works correctly
 573         return cls._match_valid_url(url) is not None
 574
 575     @classmethod
 576     def _match_id(cls, url):
 577         return cls._match_valid_url(url).group('id')
 578
 579     @classmethod
 580     def get_temp_id(cls, url):
 581         try:
 582             return cls._match_id(url)
 583         except (IndexError, AttributeError):
 584             return None
 585
 586     @classmethod
 587     def working(cls):
 588         """Getter method for _WORKING."""
 589         return cls._WORKING
 590
 591     @classmethod
 592     def supports_login(cls):
 593         return bool(cls._NETRC_MACHINE)
 594
 595     def initialize(self):
 596         """Initializes an instance (authentication, etc)."""
 597         self._printed_messages = set()
 598         self._initialize_geo_bypass({
 599             'countries': self._GEO_COUNTRIES,
 600             'ip_blocks': self._GEO_IP_BLOCKS,
 601         })
 602         if not self._ready:
 603             self._initialize_pre_login()
 604             if self.supports_login():
 605                 username, password = self._get_login_info()
 606                 if username:
 607                     self._perform_login(username, password)
 608             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 609                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 610             self._real_initialize()
 611             self._ready = True
 612
 613     def _initialize_geo_bypass(self, geo_bypass_context):
 614         """
 615         Initialize geo restriction bypass mechanism.
 616
 617         This method is used to initialize geo bypass mechanism based on faking
 618         X-Forwarded-For HTTP header. A random country from provided country list
 619         is selected and a random IP belonging to this country is generated. This
 620         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 621         HTTP requests.
 622
 623         This method will be used for initial geo bypass mechanism initialization
 624         during the instance initialization with _GEO_COUNTRIES and
 625         _GEO_IP_BLOCKS.
 626
 627         You may also manually call it from extractor's code if geo bypass
 628         information is not available beforehand (e.g. obtained during
 629         extraction) or due to some other reason. In this case you should pass
 630         this information in geo bypass context passed as first argument. It may
 631         contain following fields:
 632
 633         countries:  List of geo unrestricted countries (similar
 634                     to _GEO_COUNTRIES)
 635         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 636                     (similar to _GEO_IP_BLOCKS)
 637
 638         """
 639         if not self._x_forwarded_for_ip:
 640
 641             # Geo bypass mechanism is explicitly disabled by user
 642             if not self.get_param('geo_bypass', True):
 643                 return
 644
 645             if not geo_bypass_context:
 646                 geo_bypass_context = {}
 647
 648             # Backward compatibility: previously _initialize_geo_bypass
 649             # expected a list of countries, some 3rd party code may still use
 650             # it this way
 651             if isinstance(geo_bypass_context, (list, tuple)):
 652                 geo_bypass_context = {
 653                     'countries': geo_bypass_context,
 654                 }
 655
 656             # The whole point of geo bypass mechanism is to fake IP
 657             # as X-Forwarded-For HTTP header based on some IP block or
 658             # country code.
 659
 660             # Path 1: bypassing based on IP block in CIDR notation
 661
 662             # Explicit IP block specified by user, use it right away
 663             # regardless of whether extractor is geo bypassable or not
 664             ip_block = self.get_param('geo_bypass_ip_block', None)
 665
 666             # Otherwise use random IP block from geo bypass context but only
 667             # if extractor is known as geo bypassable
 668             if not ip_block:
 669                 ip_blocks = geo_bypass_context.get('ip_blocks')
 670                 if self._GEO_BYPASS and ip_blocks:
 671                     ip_block = random.choice(ip_blocks)
 672
 673             if ip_block:
 674                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 675                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 676                 return
 677
 678             # Path 2: bypassing based on country code
 679
 680             # Explicit country code specified by user, use it right away
 681             # regardless of whether extractor is geo bypassable or not
 682             country = self.get_param('geo_bypass_country', None)
 683
 684             # Otherwise use random country code from geo bypass context but
 685             # only if extractor is known as geo bypassable
 686             if not country:
 687                 countries = geo_bypass_context.get('countries')
 688                 if self._GEO_BYPASS and countries:
 689                     country = random.choice(countries)
 690
 691             if country:
 692                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 693                 self._downloader.write_debug(
 694                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 695
 696     def extract(self, url):
 697         """Extracts URL information and returns it in list of dicts."""
 698         try:
 699             for _ in range(2):
 700                 try:
 701                     self.initialize()
 702                     self.to_screen('Extracting URL: %s' % (
 703                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 704                     ie_result = self._real_extract(url)
 705                     if ie_result is None:
 706                         return None
 707                     if self._x_forwarded_for_ip:
 708                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 709                     subtitles = ie_result.get('subtitles') or {}
 710                     if 'no-live-chat' in self.get_param('compat_opts'):
 711                         for lang in ('live_chat', 'comments', 'danmaku'):
 712                             subtitles.pop(lang, None)
 713                     return ie_result
 714                 except GeoRestrictedError as e:
 715                     if self.__maybe_fake_ip_and_retry(e.countries):
 716                         continue
 717                     raise
 718         except UnsupportedError:
 719             raise
 720         except ExtractorError as e:
 721             e.video_id = e.video_id or self.get_temp_id(url),
 722             e.ie = e.ie or self.IE_NAME,
 723             e.traceback = e.traceback or sys.exc_info()[2]
 724             raise
 725         except http.client.IncompleteRead as e:
 726             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 727         except (KeyError, StopIteration) as e:
 728             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 729
 730     def __maybe_fake_ip_and_retry(self, countries):
 731         if (not self.get_param('geo_bypass_country', None)
 732                 and self._GEO_BYPASS
 733                 and self.get_param('geo_bypass', True)
 734                 and not self._x_forwarded_for_ip
 735                 and countries):
 736             country_code = random.choice(countries)
 737             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 738             if self._x_forwarded_for_ip:
 739                 self.report_warning(
 740                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 741                     % (self._x_forwarded_for_ip, country_code.upper()))
 742                 return True
 743         return False
 744
 745     def set_downloader(self, downloader):
 746         """Sets a YoutubeDL instance as the downloader for this IE."""
 747         self._downloader = downloader
 748
 749     @property
 750     def cache(self):
 751         return self._downloader.cache
 752
 753     @property
 754     def cookiejar(self):
 755         return self._downloader.cookiejar
 756
 757     def _initialize_pre_login(self):
 758         """ Initialization before login. Redefine in subclasses."""
 759         pass
 760
 761     def _perform_login(self, username, password):
 762         """ Login with username and password. Redefine in subclasses."""
 763         pass
 764
 765     def _real_initialize(self):
 766         """Real initialization process. Redefine in subclasses."""
 767         pass
 768
 769     def _real_extract(self, url):
 770         """Real extraction process. Redefine in subclasses."""
 771         raise NotImplementedError('This method must be implemented by subclasses')
 772
 773     @classmethod
 774     def ie_key(cls):
 775         """A string for getting the InfoExtractor with get_info_extractor"""
 776         return cls.__name__[:-2]
 777
 778     @classproperty
 779     def IE_NAME(cls):
 780         return cls.__name__[:-2]
 781
 782     @staticmethod
 783     def __can_accept_status_code(err, expected_status):
 784         assert isinstance(err, urllib.error.HTTPError)
 785         if expected_status is None:
 786             return False
 787         elif callable(expected_status):
 788             return expected_status(err.code) is True
 789         else:
 790             return err.code in variadic(expected_status)
 791
 792     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 793         if isinstance(url_or_request, urllib.request.Request):
 794             return update_Request(url_or_request, data=data, headers=headers, query=query)
 795         if query:
 796             url_or_request = update_url_query(url_or_request, query)
 797         return sanitized_Request(url_or_request, data, headers or {})
 798
 799     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 800         """
 801         Return the response handle.
 802
 803         See _download_webpage docstring for arguments specification.
 804         """
 805         if not self._downloader._first_webpage_request:
 806             sleep_interval = self.get_param('sleep_interval_requests') or 0
 807             if sleep_interval > 0:
 808                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 809                 time.sleep(sleep_interval)
 810         else:
 811             self._downloader._first_webpage_request = False
 812
 813         if note is None:
 814             self.report_download_webpage(video_id)
 815         elif note is not False:
 816             if video_id is None:
 817                 self.to_screen(str(note))
 818             else:
 819                 self.to_screen(f'{video_id}: {note}')
 820
 821         # Some sites check X-Forwarded-For HTTP header in order to figure out
 822         # the origin of the client behind proxy. This allows bypassing geo
 823         # restriction by faking this header's value to IP that belongs to some
 824         # geo unrestricted country. We will do so once we encounter any
 825         # geo restriction error.
 826         if self._x_forwarded_for_ip:
 827             headers = (headers or {}).copy()
 828             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 829
 830         try:
 831             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 832         except network_exceptions as err:
 833             if isinstance(err, urllib.error.HTTPError):
 834                 if self.__can_accept_status_code(err, expected_status):
 835                     # Retain reference to error to prevent file object from
 836                     # being closed before it can be read. Works around the
 837                     # effects of <https://bugs.python.org/issue15002>
 838                     # introduced in Python 3.4.1.
 839                     err.fp._error = err
 840                     return err.fp
 841
 842             if errnote is False:
 843                 return False
 844             if errnote is None:
 845                 errnote = 'Unable to download webpage'
 846
 847             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 848             if fatal:
 849                 raise ExtractorError(errmsg, cause=err)
 850             else:
 851                 self.report_warning(errmsg)
 852                 return False
 853
 854     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 855                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 856         """
 857         Return a tuple (page content as string, URL handle).
 858
 859         Arguments:
 860         url_or_request -- plain text URL as a string or
 861             a urllib.request.Request object
 862         video_id -- Video/playlist/item identifier (string)
 863
 864         Keyword arguments:
 865         note -- note printed before downloading (string)
 866         errnote -- note printed in case of an error (string)
 867         fatal -- flag denoting whether error should be considered fatal,
 868             i.e. whether it should cause ExtractionError to be raised,
 869             otherwise a warning will be reported and extraction continued
 870         encoding -- encoding for a page content decoding, guessed automatically
 871             when not explicitly specified
 872         data -- POST data (bytes)
 873         headers -- HTTP headers (dict)
 874         query -- URL query (dict)
 875         expected_status -- allows to accept failed HTTP requests (non 2xx
 876             status code) by explicitly specifying a set of accepted status
 877             codes. Can be any of the following entities:
 878                 - an integer type specifying an exact failed status code to
 879                   accept
 880                 - a list or a tuple of integer types specifying a list of
 881                   failed status codes to accept
 882                 - a callable accepting an actual failed status code and
 883                   returning True if it should be accepted
 884             Note that this argument does not affect success status codes (2xx)
 885             which are always accepted.
 886         """
 887
 888         # Strip hashes from the URL (#1038)
 889         if isinstance(url_or_request, str):
 890             url_or_request = url_or_request.partition('#')[0]
 891
 892         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 893         if urlh is False:
 894             assert not fatal
 895             return False
 896         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 897         return (content, urlh)
 898
 899     @staticmethod
 900     def _guess_encoding_from_content(content_type, webpage_bytes):
 901         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 902         if m:
 903             encoding = m.group(1)
 904         else:
 905             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 906                           webpage_bytes[:1024])
 907             if m:
 908                 encoding = m.group(1).decode('ascii')
 909             elif webpage_bytes.startswith(b'\xff\xfe'):
 910                 encoding = 'utf-16'
 911             else:
 912                 encoding = 'utf-8'
 913
 914         return encoding
 915
 916     def __check_blocked(self, content):
 917         first_block = content[:512]
 918         if ('<title>Access to this site is blocked</title>' in content
 919                 and 'Websense' in first_block):
 920             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 921             blocked_iframe = self._html_search_regex(
 922                 r'<iframe src="([^"]+)"', content,
 923                 'Websense information URL', default=None)
 924             if blocked_iframe:
 925                 msg += ' Visit %s for more details' % blocked_iframe
 926             raise ExtractorError(msg, expected=True)
 927         if '<title>The URL you requested has been blocked</title>' in first_block:
 928             msg = (
 929                 'Access to this webpage has been blocked by Indian censorship. '
 930                 'Use a VPN or proxy server (with --proxy) to route around it.')
 931             block_msg = self._html_search_regex(
 932                 r'</h1><p>(.*?)</p>',
 933                 content, 'block message', default=None)
 934             if block_msg:
 935                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 936             raise ExtractorError(msg, expected=True)
 937         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 938                 and 'blocklist.rkn.gov.ru' in content):
 939             raise ExtractorError(
 940                 'Access to this webpage has been blocked by decision of the Russian government. '
 941                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 942                 expected=True)
 943
 944     def _request_dump_filename(self, url, video_id):
 945         basen = f'{video_id}_{url}'
 946         trim_length = self.get_param('trim_file_name') or 240
 947         if len(basen) > trim_length:
 948             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 949             basen = basen[:trim_length - len(h)] + h
 950         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 951         # Working around MAX_PATH limitation on Windows (see
 952         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 953         if compat_os_name == 'nt':
 954             absfilepath = os.path.abspath(filename)
 955             if len(absfilepath) > 259:
 956                 filename = fR'\\?\{absfilepath}'
 957         return filename
 958
 959     def __decode_webpage(self, webpage_bytes, encoding, headers):
 960         if not encoding:
 961             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 962         try:
 963             return webpage_bytes.decode(encoding, 'replace')
 964         except LookupError:
 965             return webpage_bytes.decode('utf-8', 'replace')
 966
 967     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 968         webpage_bytes = urlh.read()
 969         if prefix is not None:
 970             webpage_bytes = prefix + webpage_bytes
 971         if self.get_param('dump_intermediate_pages', False):
 972             self.to_screen('Dumping request to ' + urlh.geturl())
 973             dump = base64.b64encode(webpage_bytes).decode('ascii')
 974             self._downloader.to_screen(dump)
 975         if self.get_param('write_pages'):
 976             filename = self._request_dump_filename(urlh.geturl(), video_id)
 977             self.to_screen(f'Saving request to {filename}')
 978             with open(filename, 'wb') as outf:
 979                 outf.write(webpage_bytes)
 980
 981         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 982         self.__check_blocked(content)
 983
 984         return content
 985
 986     def __print_error(self, errnote, fatal, video_id, err):
 987         if fatal:
 988             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 989         elif errnote:
 990             self.report_warning(f'{video_id}: {errnote}: {err}')
 991
 992     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 993         if transform_source:
 994             xml_string = transform_source(xml_string)
 995         try:
 996             return compat_etree_fromstring(xml_string.encode('utf-8'))
 997         except xml.etree.ElementTree.ParseError as ve:
 998             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 999
1000     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1001         try:
1002             return json.loads(
1003                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1004         except ValueError as ve:
1005             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1006
1007     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1008         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1009
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build the pair of methods (_download_<name>_handle, _download_<name>).

        This is a plain function called while the class body is executed; it
        is not meant to be used as a method.

        @param name             Used to form the names of the generated methods
        @param parser           Name of the method that parses the downloaded
                                content, or None to return the content as is
        @param note             Default "note" of the generated methods
        @param errnote          Default "errnote" of the generated methods
        @param return_value     Description of the parsed value, used in the
                                generated docstrings
        @returns                (download_handle, download_content)
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            # Without a parser the raw content is the result
            if parser is None:
                return content
            # errnote is forwarded to the parser only when it is False;
            # any other value is dropped here
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            # A False result from the download is passed on unchanged
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With 'load_pages' set, try to read a previously saved dump from
            # disk instead of downloading
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # The dump could not be read: warn and fall through to the download below
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # transform_source is not passed on when there is no parser
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            # Give the generated function its public name and a docstring
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        # Must run before download_content is called: it looks the handle
        # method up through download_handle.__name__
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1073
    # Generated download methods; each call yields (<name>_handle, <name>).
    # For plain webpages only the content method is kept (index 1) and
    # stored under a private name; _download_webpage wraps it.
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1081
1082     def _download_webpage(
1083             self, url_or_request, video_id, note=None, errnote=None,
1084             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1085         """
1086         Return the data of the page as a string.
1087
1088         Keyword arguments:
1089         tries -- number of tries
1090         timeout -- sleep interval between tries
1091
1092         See _download_webpage_handle docstring for other arguments specification.
1093         """
1094
1095         R''' # NB: These are unused; should they be deprecated?
1096         if tries != 1:
1097             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1098         if timeout is NO_DEFAULT:
1099             timeout = 5
1100         else:
1101             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1102         '''
1103
1104         try_count = 0
1105         while True:
1106             try:
1107                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1108             except http.client.IncompleteRead as e:
1109                 try_count += 1
1110                 if try_count >= tries:
1111                     raise e
1112                 self._sleep(timeout, video_id)
1113
1114     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1115         idstr = format_field(video_id, None, '%s: ')
1116         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1117         if only_once:
1118             if f'WARNING: {msg}' in self._printed_messages:
1119                 return
1120             self._printed_messages.add(f'WARNING: {msg}')
1121         self._downloader.report_warning(msg, *args, **kwargs)
1122
1123     def to_screen(self, msg, *args, **kwargs):
1124         """Print msg to screen, prefixing it with '[ie_name]'"""
1125         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1126
1127     def write_debug(self, msg, *args, **kwargs):
1128         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1129
1130     def get_param(self, name, default=None, *args, **kwargs):
1131         if self._downloader:
1132             return self._downloader.params.get(name, default, *args, **kwargs)
1133         return default
1134
1135     def report_drm(self, video_id, partial=NO_DEFAULT):
1136         if partial is not NO_DEFAULT:
1137             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1138         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1139
1140     def report_extraction(self, id_or_name):
1141         """Report information extraction."""
1142         self.to_screen('%s: Extracting information' % id_or_name)
1143
1144     def report_download_webpage(self, video_id):
1145         """Report webpage download."""
1146         self.to_screen('%s: Downloading webpage' % video_id)
1147
1148     def report_age_confirmation(self):
1149         """Report attempt to confirm age."""
1150         self.to_screen('Confirming age')
1151
1152     def report_login(self):
1153         """Report attempt to log in."""
1154         self.to_screen('Logging in')
1155
1156     def raise_login_required(
1157             self, msg='This video is only available for registered users',
1158             metadata_available=False, method=NO_DEFAULT):
1159         if metadata_available and (
1160                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1161             self.report_warning(msg)
1162             return
1163         msg += format_field(self._login_hint(method), None, '. %s')
1164         raise ExtractorError(msg, expected=True)
1165
1166     def raise_geo_restricted(
1167             self, msg='This video is not available from your location due to geo restriction',
1168             countries=None, metadata_available=False):
1169         if metadata_available and (
1170                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1171             self.report_warning(msg)
1172         else:
1173             raise GeoRestrictedError(msg, countries=countries)
1174
1175     def raise_no_formats(self, msg, expected=False, video_id=None):
1176         if expected and (
1177                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1178             self.report_warning(msg, video_id)
1179         elif isinstance(msg, ExtractorError):
1180             raise msg
1181         else:
1182             raise ExtractorError(msg, expected=expected, video_id=video_id)
1183
1184     # Methods for following #608
1185     @staticmethod
1186     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1187         """Returns a URL that points to a page that should be processed"""
1188         if ie is not None:
1189             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1190         if video_id is not None:
1191             kwargs['id'] = video_id
1192         if video_title is not None:
1193             kwargs['title'] = video_title
1194         return {
1195             **kwargs,
1196             '_type': 'url_transparent' if url_transparent else 'url',
1197             'url': url,
1198         }
1199
1200     @classmethod
1201     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1202                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1203         return cls.playlist_result(
1204             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1205             playlist_id, playlist_title, **kwargs)
1206
1207     @staticmethod
1208     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1209         """Returns a playlist"""
1210         if playlist_id:
1211             kwargs['id'] = playlist_id
1212         if playlist_title:
1213             kwargs['title'] = playlist_title
1214         if playlist_description is not None:
1215             kwargs['description'] = playlist_description
1216         return {
1217             **kwargs,
1218             '_type': 'multi_video' if multi_video else 'playlist',
1219             'entries': entries,
1220         }
1221
1222     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1223         """
1224         Perform a regex search on the given string, using a single or a list of
1225         patterns returning the first matching group.
1226         In case of failure return a default value or raise a WARNING or a
1227         RegexNotFoundError, depending on fatal, specifying the field name.
1228         """
1229         if string is None:
1230             mobj = None
1231         elif isinstance(pattern, (str, re.Pattern)):
1232             mobj = re.search(pattern, string, flags)
1233         else:
1234             for p in pattern:
1235                 mobj = re.search(p, string, flags)
1236                 if mobj:
1237                     break
1238
1239         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1240
1241         if mobj:
1242             if group is None:
1243                 # return the first matching group
1244                 return next(g for g in mobj.groups() if g is not None)
1245             elif isinstance(group, (list, tuple)):
1246                 return tuple(mobj.group(g) for g in group)
1247             else:
1248                 return mobj.group(group)
1249         elif default is not NO_DEFAULT:
1250             return default
1251         elif fatal:
1252             raise RegexNotFoundError('Unable to extract %s' % _name)
1253         else:
1254             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1255             return None
1256
1257     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1258                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1259         """Searches string for the JSON object specified by start_pattern"""
1260         # NB: end_pattern is only used to reduce the size of the initial match
1261         if default is NO_DEFAULT:
1262             default, has_default = {}, False
1263         else:
1264             fatal, has_default = False, True
1265
1266         json_string = self._search_regex(
1267             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1268             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1269         if not json_string:
1270             return default
1271
1272         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1273         try:
1274             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1275         except ExtractorError as e:
1276             if fatal:
1277                 raise ExtractorError(
1278                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1279             elif not has_default:
1280                 self.report_warning(
1281                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1282         return default
1283
1284     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1285         """
1286         Like _search_regex, but strips HTML tags and unescapes entities.
1287         """
1288         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1289         if isinstance(res, tuple):
1290             return tuple(map(clean_html, res))
1291         return clean_html(res)
1292
1293     def _get_netrc_login_info(self, netrc_machine=None):
1294         username = None
1295         password = None
1296         netrc_machine = netrc_machine or self._NETRC_MACHINE
1297
1298         if self.get_param('usenetrc', False):
1299             try:
1300                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1301                 if os.path.isdir(netrc_file):
1302                     netrc_file = os.path.join(netrc_file, '.netrc')
1303                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1304                 if info is not None:
1305                     username = info[0]
1306                     password = info[2]
1307                 else:
1308                     raise netrc.NetrcParseError(
1309                         'No authenticators for %s' % netrc_machine)
1310             except (OSError, netrc.NetrcParseError) as err:
1311                 self.report_warning(
1312                     'parsing .netrc: %s' % error_to_compat_str(err))
1313
1314         return username, password
1315
1316     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1317         """
1318         Get the login info as (username, password)
1319         First look for the manually specified credentials using username_option
1320         and password_option as keys in params dictionary. If no such credentials
1321         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1322         value.
1323         If there's no info available, return (None, None)
1324         """
1325
1326         # Attempt to use provided username and password or .netrc data
1327         username = self.get_param(username_option)
1328         if username is not None:
1329             password = self.get_param(password_option)
1330         else:
1331             username, password = self._get_netrc_login_info(netrc_machine)
1332
1333         return username, password
1334
1335     def _get_tfa_info(self, note='two-factor verification code'):
1336         """
1337         Get the two-factor authentication info
1338         TODO - asking the user will be required for sms/phone verify
1339         currently just uses the command line option
1340         If there's no info available, return None
1341         """
1342
1343         tfa = self.get_param('twofactor')
1344         if tfa is not None:
1345             return tfa
1346
1347         return getpass.getpass('Type %s and press [Return]: ' % note)
1348
1349     # Helper functions for extracting OpenGraph info
1350     @staticmethod
1351     def _og_regexes(prop):
1352         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1353         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1354                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1355         template = r'<meta[^>]+?%s[^>]+?%s'
1356         return [
1357             template % (property_re, content_re),
1358             template % (content_re, property_re),
1359         ]
1360
1361     @staticmethod
1362     def _meta_regex(prop):
1363         return r'''(?isx)<meta
1364                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1365                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1366
1367     def _og_search_property(self, prop, html, name=None, **kargs):
1368         prop = variadic(prop)
1369         if name is None:
1370             name = 'OpenGraph %s' % prop[0]
1371         og_regexes = []
1372         for p in prop:
1373             og_regexes.extend(self._og_regexes(p))
1374         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1375         if escaped is None:
1376             return None
1377         return unescapeHTML(escaped)
1378
1379     def _og_search_thumbnail(self, html, **kargs):
1380         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1381
1382     def _og_search_description(self, html, **kargs):
1383         return self._og_search_property('description', html, fatal=False, **kargs)
1384
1385     def _og_search_title(self, html, *, fatal=False, **kargs):
1386         return self._og_search_property('title', html, fatal=fatal, **kargs)
1387
1388     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1389         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1390         if secure:
1391             regexes = self._og_regexes('video:secure_url') + regexes
1392         return self._html_search_regex(regexes, html, name, **kargs)
1393
1394     def _og_search_url(self, html, **kargs):
1395         return self._og_search_property('url', html, **kargs)
1396
1397     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1398         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1399
1400     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1401         name = variadic(name)
1402         if display_name is None:
1403             display_name = name[0]
1404         return self._html_search_regex(
1405             [self._meta_regex(n) for n in name],
1406             html, display_name, fatal=fatal, group='content', **kwargs)
1407
1408     def _dc_search_uploader(self, html):
1409         return self._html_search_meta('dc.creator', html, 'uploader')
1410
1411     @staticmethod
1412     def _rta_search(html):
1413         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1414         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1415                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1416                      html):
1417             return 18
1418
1419         # And then there are the jokers who advertise that they use RTA, but actually don't.
1420         AGE_LIMIT_MARKERS = [
1421             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1422             r'>[^<]*you acknowledge you are at least (\d+) years old',
1423             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1424         ]
1425
1426         age_limit = 0
1427         for marker in AGE_LIMIT_MARKERS:
1428             mobj = re.search(marker, html)
1429             if mobj:
1430                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1431         return age_limit
1432
1433     def _media_rating_search(self, html):
1434         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1435         rating = self._html_search_meta('rating', html)
1436
1437         if not rating:
1438             return None
1439
1440         RATING_TABLE = {
1441             'safe for kids': 0,
1442             'general': 8,
1443             '14 years': 14,
1444             'mature': 17,
1445             'restricted': 19,
1446         }
1447         return RATING_TABLE.get(rating.lower())
1448
1449     def _family_friendly_search(self, html):
1450         # See http://schema.org/VideoObject
1451         family_friendly = self._html_search_meta(
1452             'isFamilyFriendly', html, default=None)
1453
1454         if not family_friendly:
1455             return None
1456
1457         RATING_TABLE = {
1458             '1': 0,
1459             'true': 0,
1460             '0': 18,
1461             'false': 18,
1462         }
1463         return RATING_TABLE.get(family_friendly.lower())
1464
1465     def _twitter_search_player(self, html):
1466         return self._html_search_meta('twitter:player', html,
1467                                       'twitter card player')
1468
1469     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1470         """Yield all json ld objects in the html"""
1471         if default is not NO_DEFAULT:
1472             fatal = False
1473         for mobj in re.finditer(JSON_LD_RE, html):
1474             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1475             for json_ld in variadic(json_ld_item):
1476                 if isinstance(json_ld, dict):
1477                     yield json_ld
1478
1479     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1480         """Search for a video in any json ld in the html"""
1481         if default is not NO_DEFAULT:
1482             fatal = False
1483         info = self._json_ld(
1484             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1485             video_id, fatal=fatal, expected_type=expected_type)
1486         if info:
1487             return info
1488         if default is not NO_DEFAULT:
1489             return default
1490         elif fatal:
1491             raise RegexNotFoundError('Unable to extract JSON-LD')
1492         else:
1493             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1494             return {}
1495
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Extract info dict fields from JSON-LD data.

        json_ld may be a JSON string (parsed here with _parse_json) or already
        parsed data: a single object or a sequence of objects.
        When expected_type is given, only objects whose '@type' includes it
        are considered and traversal stops after the first one that is handled.
        Returns the collected fields passed through filter_dict, or an empty
        dict when there is no JSON-LD data.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator that all the nested helpers below write into
        info = {}

        # Last path component of a schema.org interaction type -> prefix of the '<kind>_count' field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # '@type' may be a single value or a collection of values;
            # variadic() lets both be tested with `in`
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # interactionType is either given directly or as an object with '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill the '<kind>_count' fields of info from 'interactionStatistic',
            # which may be a single counter object or a list of them
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up, so both bare
                # names and URL-style types are accepted
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # The first value found for a given count wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build chapters from the 'Clip' entries of 'hasPart'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Fill in missing times from the neighbouring chapters. zip() stops at
            # the shortest input (chapters[1:]), so the final chapter is not
            # visited here; only its end_time is filled in below
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # Relies on info['duration'] having been set by extract_video_object
                # before this helper is called
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copy the media fields of a VideoObject/AudioObject into info,
            # then its interaction counters and chapters
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                # Audio-only media: flag the absence of video and reuse the bitrate as abr
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level objects are only considered when they carry '@context'
                if at_top_level and '@context' not in e:
                    continue
                # A pure '@graph' container: descend into its members instead
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title when none was found yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Articles may embed their video as the first item of 'video' or 'subjectOf'
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type keep scanning the remaining objects;
                    # with one, the first handled object ends the traversal
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any other object type may still carry a nested 'video' object
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1665
1666     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1667         return self._parse_json(
1668             self._search_regex(
1669                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1670                 webpage, 'next.js data', fatal=fatal, **kw),
1671             video_id, transform_source=transform_source, fatal=fatal)
1672
1673     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1674         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1675         rectx = re.escape(context_name)
1676         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1677         js, arg_keys, arg_vals = self._search_regex(
1678             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1679             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1680             default=NO_DEFAULT if fatal else (None, None, None))
1681         if js is None:
1682             return {}
1683
1684         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1685             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1686
1687         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1688         return traverse_obj(ret, traverse) or {}
1689
1690     @staticmethod
1691     def _hidden_inputs(html):
1692         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1693         hidden_inputs = {}
1694         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1695             attrs = extract_attributes(input)
1696             if not input:
1697                 continue
1698             if attrs.get('type') not in ('hidden', 'submit'):
1699                 continue
1700             name = attrs.get('name') or attrs.get('id')
1701             value = attrs.get('value')
1702             if name and value is not None:
1703                 hidden_inputs[name] = value
1704         return hidden_inputs
1705
1706     def _form_hidden_inputs(self, form_id, html):
1707         form = self._search_regex(
1708             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1709             html, '%s form' % form_id, group='form')
1710         return self._hidden_inputs(form)
1711
    @classproperty(cache=True)
    def FormatSort(cls):
        """Deprecated accessor returning a FormatSorter subclass; use yt_dlp.utils.FormatSorter instead"""
        class FormatSort(FormatSorter):
            # NOTE(review): `ie` is the first positional parameter, i.e. the new
            # FormatSort instance itself, and its `_downloader` attribute is what
            # gets forwarded as FormatSorter's first argument - confirm this is intended
            def __init__(ie, *args, **kwargs):
                super().__init__(ie._downloader, *args, **kwargs)

        # Emitted whenever this property body runs
        deprecation_warning(
            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
            'Use yt_dlp.utils.FormatSorter instead')
        return FormatSort
1722
1723     def _sort_formats(self, formats, field_preference=[]):
1724         if not field_preference:
1725             self._downloader.deprecation_warning(
1726                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1727             return
1728         self._downloader.deprecation_warning(
1729             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1730             'Return _format_sort_fields in the info_dict instead')
1731         if formats:
1732             formats[0]['__sort_fields'] = field_preference
1733
1734     def _check_formats(self, formats, video_id):
1735         if formats:
1736             formats[:] = filter(
1737                 lambda f: self._is_valid_url(
1738                     f['url'], video_id,
1739                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1740                 formats)
1741
1742     @staticmethod
1743     def _remove_duplicate_formats(formats):
1744         format_urls = set()
1745         unique_formats = []
1746         for f in formats:
1747             if f['url'] not in format_urls:
1748                 format_urls.add(f['url'])
1749                 unique_formats.append(f)
1750         formats[:] = unique_formats
1751
1752     def _is_valid_url(self, url, video_id, item='video', headers={}):
1753         url = self._proto_relative_url(url, scheme='http:')
1754         # For now assume non HTTP(S) URLs always valid
1755         if not (url.startswith('http://') or url.startswith('https://')):
1756             return True
1757         try:
1758             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1759             return True
1760         except ExtractorError as e:
1761             self.to_screen(
1762                 '%s: %s URL is invalid, skipping: %s'
1763                 % (video_id, item, error_to_compat_str(e.cause)))
1764             return False
1765
1766     def http_scheme(self):
1767         """ Either "http:" or "https:", depending on the user's preferences """
1768         return (
1769             'http:'
1770             if self.get_param('prefer_insecure', False)
1771             else 'https:')
1772
1773     def _proto_relative_url(self, url, scheme=None):
1774         scheme = scheme or self.http_scheme()
1775         assert scheme.endswith(':')
1776         return sanitize_url(url, scheme=scheme[:-1])
1777
1778     def _sleep(self, timeout, video_id, msg_template=None):
1779         if msg_template is None:
1780             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1781         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1782         self.to_screen(msg)
1783         time.sleep(timeout)
1784
1785     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1786                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1787                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1788         if self.get_param('ignore_no_formats_error'):
1789             fatal = False
1790
1791         res = self._download_xml_handle(
1792             manifest_url, video_id, 'Downloading f4m manifest',
1793             'Unable to download f4m manifest',
1794             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1795             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1796             transform_source=transform_source,
1797             fatal=fatal, data=data, headers=headers, query=query)
1798         if res is False:
1799             return []
1800
1801         manifest, urlh = res
1802         manifest_url = urlh.geturl()
1803
1804         return self._parse_f4m_formats(
1805             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1806             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1807
1808     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1809                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1810                            fatal=True, m3u8_id=None):
1811         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1812             return []
1813
1814         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1815         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1816         if akamai_pv is not None and ';' in akamai_pv.text:
1817             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1818             if playerVerificationChallenge.strip() != '':
1819                 return []
1820
1821         formats = []
1822         manifest_version = '1.0'
1823         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1824         if not media_nodes:
1825             manifest_version = '2.0'
1826             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1827         # Remove unsupported DRM protected media from final formats
1828         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1829         media_nodes = remove_encrypted_media(media_nodes)
1830         if not media_nodes:
1831             return formats
1832
1833         manifest_base_url = get_base_url(manifest)
1834
1835         bootstrap_info = xpath_element(
1836             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1837             'bootstrap info', default=None)
1838
1839         vcodec = None
1840         mime_type = xpath_text(
1841             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1842             'base URL', default=None)
1843         if mime_type and mime_type.startswith('audio/'):
1844             vcodec = 'none'
1845
1846         for i, media_el in enumerate(media_nodes):
1847             tbr = int_or_none(media_el.attrib.get('bitrate'))
1848             width = int_or_none(media_el.attrib.get('width'))
1849             height = int_or_none(media_el.attrib.get('height'))
1850             format_id = join_nonempty(f4m_id, tbr or i)
1851             # If <bootstrapInfo> is present, the specified f4m is a
1852             # stream-level manifest, and only set-level manifests may refer to
1853             # external resources.  See section 11.4 and section 4 of F4M spec
1854             if bootstrap_info is None:
1855                 media_url = None
1856                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1857                 if manifest_version == '2.0':
1858                     media_url = media_el.attrib.get('href')
1859                 if media_url is None:
1860                     media_url = media_el.attrib.get('url')
1861                 if not media_url:
1862                     continue
1863                 manifest_url = (
1864                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1865                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1866                 # If media_url is itself a f4m manifest do the recursive extraction
1867                 # since bitrates in parent manifest (this one) and media_url manifest
1868                 # may differ leading to inability to resolve the format by requested
1869                 # bitrate in f4m downloader
1870                 ext = determine_ext(manifest_url)
1871                 if ext == 'f4m':
1872                     f4m_formats = self._extract_f4m_formats(
1873                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1874                         transform_source=transform_source, fatal=fatal)
1875                     # Sometimes stream-level manifest contains single media entry that
1876                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1877                     # At the same time parent's media entry in set-level manifest may
1878                     # contain it. We will copy it from parent in such cases.
1879                     if len(f4m_formats) == 1:
1880                         f = f4m_formats[0]
1881                         f.update({
1882                             'tbr': f.get('tbr') or tbr,
1883                             'width': f.get('width') or width,
1884                             'height': f.get('height') or height,
1885                             'format_id': f.get('format_id') if not tbr else format_id,
1886                             'vcodec': vcodec,
1887                         })
1888                     formats.extend(f4m_formats)
1889                     continue
1890                 elif ext == 'm3u8':
1891                     formats.extend(self._extract_m3u8_formats(
1892                         manifest_url, video_id, 'mp4', preference=preference,
1893                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1894                     continue
1895             formats.append({
1896                 'format_id': format_id,
1897                 'url': manifest_url,
1898                 'manifest_url': manifest_url,
1899                 'ext': 'flv' if bootstrap_info is not None else None,
1900                 'protocol': 'f4m',
1901                 'tbr': tbr,
1902                 'width': width,
1903                 'height': height,
1904                 'vcodec': vcodec,
1905                 'preference': preference,
1906                 'quality': quality,
1907             })
1908         return formats
1909
1910     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1911         return {
1912             'format_id': join_nonempty(m3u8_id, 'meta'),
1913             'url': m3u8_url,
1914             'ext': ext,
1915             'protocol': 'm3u8',
1916             'preference': preference - 100 if preference else -100,
1917             'quality': quality,
1918             'resolution': 'multiple',
1919             'format_note': 'Quality selection URL',
1920         }
1921
1922     def _report_ignoring_subs(self, name):
1923         self.report_warning(bug_reports_message(
1924             f'Ignoring subtitle tracks found in the {name} manifest; '
1925             'if any subtitle tracks are missing,'
1926         ), only_once=True)
1927
1928     def _extract_m3u8_formats(self, *args, **kwargs):
1929         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1930         if subs:
1931             self._report_ignoring_subs('HLS')
1932         return fmts
1933
1934     def _extract_m3u8_formats_and_subtitles(
1935             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1936             preference=None, quality=None, m3u8_id=None, note=None,
1937             errnote=None, fatal=True, live=False, data=None, headers={},
1938             query={}):
1939
1940         if self.get_param('ignore_no_formats_error'):
1941             fatal = False
1942
1943         if not m3u8_url:
1944             if errnote is not False:
1945                 errnote = errnote or 'Failed to obtain m3u8 URL'
1946                 if fatal:
1947                     raise ExtractorError(errnote, video_id=video_id)
1948                 self.report_warning(f'{errnote}{bug_reports_message()}')
1949             return [], {}
1950
1951         res = self._download_webpage_handle(
1952             m3u8_url, video_id,
1953             note='Downloading m3u8 information' if note is None else note,
1954             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1955             fatal=fatal, data=data, headers=headers, query=query)
1956
1957         if res is False:
1958             return [], {}
1959
1960         m3u8_doc, urlh = res
1961         m3u8_url = urlh.geturl()
1962
1963         return self._parse_m3u8_formats_and_subtitles(
1964             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1965             preference=preference, quality=quality, m3u8_id=m3u8_id,
1966             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1967             headers=headers, query=query, video_id=video_id)
1968
1969     def _parse_m3u8_formats_and_subtitles(
1970             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1971             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1972             errnote=None, fatal=True, data=None, headers={}, query={},
1973             video_id=None):
1974         formats, subtitles = [], {}
1975
1976         has_drm = re.search('|'.join([
1977             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
1978             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
1979         ]), m3u8_doc)
1980
1981         def format_url(url):
1982             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1983
1984         if self.get_param('hls_split_discontinuity', False):
1985             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1986                 if not m3u8_doc:
1987                     if not manifest_url:
1988                         return []
1989                     m3u8_doc = self._download_webpage(
1990                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1991                         note=False, errnote='Failed to download m3u8 playlist information')
1992                     if m3u8_doc is False:
1993                         return []
1994                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
1995
1996         else:
1997             def _extract_m3u8_playlist_indices(*args, **kwargs):
1998                 return [None]
1999
2000         # References:
2001         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2002         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2003         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2004
2005         # We should try extracting formats only from master playlists [1, 4.3.4],
2006         # i.e. playlists that describe available qualities. On the other hand
2007         # media playlists [1, 4.3.3] should be returned as is since they contain
2008         # just the media without qualities renditions.
2009         # Fortunately, master playlist can be easily distinguished from media
2010         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2011         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2012         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2013         # media playlist and MUST NOT appear in master playlist thus we can
2014         # clearly detect media playlist with this criterion.
2015
2016         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2017             formats = [{
2018                 'format_id': join_nonempty(m3u8_id, idx),
2019                 'format_index': idx,
2020                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2021                 'ext': ext,
2022                 'protocol': entry_protocol,
2023                 'preference': preference,
2024                 'quality': quality,
2025                 'has_drm': has_drm,
2026             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2027
2028             return formats, subtitles
2029
2030         groups = {}
2031         last_stream_inf = {}
2032
2033         def extract_media(x_media_line):
2034             media = parse_m3u8_attributes(x_media_line)
2035             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2036             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2037             if not (media_type and group_id and name):
2038                 return
2039             groups.setdefault(group_id, []).append(media)
2040             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2041             if media_type == 'SUBTITLES':
2042                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2043                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2044                 # However, lack of URI has been spotted in the wild.
2045                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2046                 if not media.get('URI'):
2047                     return
2048                 url = format_url(media['URI'])
2049                 sub_info = {
2050                     'url': url,
2051                     'ext': determine_ext(url),
2052                 }
2053                 if sub_info['ext'] == 'm3u8':
2054                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2055                     # files may contain is WebVTT:
2056                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2057                     sub_info['ext'] = 'vtt'
2058                     sub_info['protocol'] = 'm3u8_native'
2059                 lang = media.get('LANGUAGE') or 'und'
2060                 subtitles.setdefault(lang, []).append(sub_info)
2061             if media_type not in ('VIDEO', 'AUDIO'):
2062                 return
2063             media_url = media.get('URI')
2064             if media_url:
2065                 manifest_url = format_url(media_url)
2066                 formats.extend({
2067                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2068                     'format_note': name,
2069                     'format_index': idx,
2070                     'url': manifest_url,
2071                     'manifest_url': m3u8_url,
2072                     'language': media.get('LANGUAGE'),
2073                     'ext': ext,
2074                     'protocol': entry_protocol,
2075                     'preference': preference,
2076                     'quality': quality,
2077                     'has_drm': has_drm,
2078                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2079                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2080
2081         def build_stream_name():
2082             # Despite specification does not mention NAME attribute for
2083             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2084             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2085             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2086             stream_name = last_stream_inf.get('NAME')
2087             if stream_name:
2088                 return stream_name
2089             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2090             # from corresponding rendition group
2091             stream_group_id = last_stream_inf.get('VIDEO')
2092             if not stream_group_id:
2093                 return
2094             stream_group = groups.get(stream_group_id)
2095             if not stream_group:
2096                 return stream_group_id
2097             rendition = stream_group[0]
2098             return rendition.get('NAME') or stream_group_id
2099
2100         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2101         # chance to detect video only formats when EXT-X-STREAM-INF tags
2102         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2103         for line in m3u8_doc.splitlines():
2104             if line.startswith('#EXT-X-MEDIA:'):
2105                 extract_media(line)
2106
2107         for line in m3u8_doc.splitlines():
2108             if line.startswith('#EXT-X-STREAM-INF:'):
2109                 last_stream_inf = parse_m3u8_attributes(line)
2110             elif line.startswith('#') or not line.strip():
2111                 continue
2112             else:
2113                 tbr = float_or_none(
2114                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2115                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2116                 manifest_url = format_url(line.strip())
2117
2118                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2119                     format_id = [m3u8_id, None, idx]
2120                     # Bandwidth of live streams may differ over time thus making
2121                     # format_id unpredictable. So it's better to keep provided
2122                     # format_id intact.
2123                     if not live:
2124                         stream_name = build_stream_name()
2125                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2126                     f = {
2127                         'format_id': join_nonempty(*format_id),
2128                         'format_index': idx,
2129                         'url': manifest_url,
2130                         'manifest_url': m3u8_url,
2131                         'tbr': tbr,
2132                         'ext': ext,
2133                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2134                         'protocol': entry_protocol,
2135                         'preference': preference,
2136                         'quality': quality,
2137                         'has_drm': has_drm,
2138                     }
2139                     resolution = last_stream_inf.get('RESOLUTION')
2140                     if resolution:
2141                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2142                         if mobj:
2143                             f['width'] = int(mobj.group('width'))
2144                             f['height'] = int(mobj.group('height'))
2145                     # Unified Streaming Platform
2146                     mobj = re.search(
2147                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2148                     if mobj:
2149                         abr, vbr = mobj.groups()
2150                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2151                         f.update({
2152                             'vbr': vbr,
2153                             'abr': abr,
2154                         })
2155                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2156                     f.update(codecs)
2157                     audio_group_id = last_stream_inf.get('AUDIO')
2158                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2159                     # references a rendition group MUST have a CODECS attribute.
2160                     # However, this is not always respected. E.g. [2]
2161                     # contains EXT-X-STREAM-INF tag which references AUDIO
2162                     # rendition group but does not have CODECS and despite
2163                     # referencing an audio group it represents a complete
2164                     # (with audio and video) format. So, for such cases we will
2165                     # ignore references to rendition groups and treat them
2166                     # as complete formats.
2167                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2168                         audio_group = groups.get(audio_group_id)
2169                         if audio_group and audio_group[0].get('URI'):
2170                             # TODO: update acodec for audio only formats with
2171                             # the same GROUP-ID
2172                             f['acodec'] = 'none'
2173                     if not f.get('ext'):
2174                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2175                     formats.append(f)
2176
2177                     # for DailyMotion
2178                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2179                     if progressive_uri:
2180                         http_f = f.copy()
2181                         del http_f['manifest_url']
2182                         http_f.update({
2183                             'format_id': f['format_id'].replace('hls-', 'http-'),
2184                             'protocol': 'http',
2185                             'url': progressive_uri,
2186                         })
2187                         formats.append(http_f)
2188
2189                 last_stream_inf = {}
2190         return formats, subtitles
2191
2192     def _extract_m3u8_vod_duration(
2193             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2194
2195         m3u8_vod = self._download_webpage(
2196             m3u8_vod_url, video_id,
2197             note='Downloading m3u8 VOD manifest' if note is None else note,
2198             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2199             fatal=False, data=data, headers=headers, query=query)
2200
2201         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2202
2203     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2204         if '#EXT-X-ENDLIST' not in m3u8_vod:
2205             return None
2206
2207         return int(sum(
2208             float(line[len('#EXTINF:'):].split(',')[0])
2209             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2210
2211     def _extract_mpd_vod_duration(
2212             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2213
2214         mpd_doc = self._download_xml(
2215             mpd_url, video_id,
2216             note='Downloading MPD VOD manifest' if note is None else note,
2217             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2218             fatal=False, data=data, headers=headers, query=query) or {}
2219         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2220
2221     @staticmethod
2222     def _xpath_ns(path, namespace=None):
2223         if not namespace:
2224             return path
2225         out = []
2226         for c in path.split('/'):
2227             if not c or c == '.':
2228                 out.append(c)
2229             else:
2230                 out.append('{%s}%s' % (namespace, c))
2231         return '/'.join(out)
2232
2233     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2234         if self.get_param('ignore_no_formats_error'):
2235             fatal = False
2236
2237         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2238         if res is False:
2239             assert not fatal
2240             return [], {}
2241
2242         smil, urlh = res
2243         smil_url = urlh.geturl()
2244
2245         namespace = self._parse_smil_namespace(smil)
2246
2247         fmts = self._parse_smil_formats(
2248             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2249         subs = self._parse_smil_subtitles(
2250             smil, namespace=namespace)
2251
2252         return fmts, subs
2253
2254     def _extract_smil_formats(self, *args, **kwargs):
2255         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2256         if subs:
2257             self._report_ignoring_subs('SMIL')
2258         return fmts
2259
2260     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2261         res = self._download_smil(smil_url, video_id, fatal=fatal)
2262         if res is False:
2263             return {}
2264
2265         smil, urlh = res
2266         smil_url = urlh.geturl()
2267
2268         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2269
2270     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2271         return self._download_xml_handle(
2272             smil_url, video_id, 'Downloading SMIL file',
2273             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2274
2275     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2276         namespace = self._parse_smil_namespace(smil)
2277
2278         formats = self._parse_smil_formats(
2279             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2280         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2281
2282         video_id = os.path.splitext(url_basename(smil_url))[0]
2283         title = None
2284         description = None
2285         upload_date = None
2286         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2287             name = meta.attrib.get('name')
2288             content = meta.attrib.get('content')
2289             if not name or not content:
2290                 continue
2291             if not title and name == 'title':
2292                 title = content
2293             elif not description and name in ('description', 'abstract'):
2294                 description = content
2295             elif not upload_date and name == 'date':
2296                 upload_date = unified_strdate(content)
2297
2298         thumbnails = [{
2299             'id': image.get('type'),
2300             'url': image.get('src'),
2301             'width': int_or_none(image.get('width')),
2302             'height': int_or_none(image.get('height')),
2303         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2304
2305         return {
2306             'id': video_id,
2307             'title': title or video_id,
2308             'description': description,
2309             'upload_date': upload_date,
2310             'thumbnails': thumbnails,
2311             'formats': formats,
2312             'subtitles': subtitles,
2313         }
2314
2315     def _parse_smil_namespace(self, smil):
2316         return self._search_regex(
2317             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2318
2319     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2320         base = smil_url
2321         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2322             b = meta.get('base') or meta.get('httpBase')
2323             if b:
2324                 base = b
2325                 break
2326
2327         formats = []
2328         rtmp_count = 0
2329         http_count = 0
2330         m3u8_count = 0
2331         imgs_count = 0
2332
2333         srcs = set()
2334         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2335         for medium in media:
2336             src = medium.get('src')
2337             if not src or src in srcs:
2338                 continue
2339             srcs.add(src)
2340
2341             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2342             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2343             width = int_or_none(medium.get('width'))
2344             height = int_or_none(medium.get('height'))
2345             proto = medium.get('proto')
2346             ext = medium.get('ext')
2347             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2348                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2349             streamer = medium.get('streamer') or base
2350
2351             if proto == 'rtmp' or streamer.startswith('rtmp'):
2352                 rtmp_count += 1
2353                 formats.append({
2354                     'url': streamer,
2355                     'play_path': src,
2356                     'ext': 'flv',
2357                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2358                     'tbr': bitrate,
2359                     'filesize': filesize,
2360                     'width': width,
2361                     'height': height,
2362                 })
2363                 if transform_rtmp_url:
2364                     streamer, src = transform_rtmp_url(streamer, src)
2365                     formats[-1].update({
2366                         'url': streamer,
2367                         'play_path': src,
2368                     })
2369                 continue
2370
2371             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2372             src_url = src_url.strip()
2373
2374             if proto == 'm3u8' or src_ext == 'm3u8':
2375                 m3u8_formats = self._extract_m3u8_formats(
2376                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2377                 if len(m3u8_formats) == 1:
2378                     m3u8_count += 1
2379                     m3u8_formats[0].update({
2380                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2381                         'tbr': bitrate,
2382                         'width': width,
2383                         'height': height,
2384                     })
2385                 formats.extend(m3u8_formats)
2386             elif src_ext == 'f4m':
2387                 f4m_url = src_url
2388                 if not f4m_params:
2389                     f4m_params = {
2390                         'hdcore': '3.2.0',
2391                         'plugin': 'flowplayer-3.2.0.1',
2392                     }
2393                 f4m_url += '&' if '?' in f4m_url else '?'
2394                 f4m_url += urllib.parse.urlencode(f4m_params)
2395                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2396             elif src_ext == 'mpd':
2397                 formats.extend(self._extract_mpd_formats(
2398                     src_url, video_id, mpd_id='dash', fatal=False))
2399             elif re.search(r'\.ism/[Mm]anifest', src_url):
2400                 formats.extend(self._extract_ism_formats(
2401                     src_url, video_id, ism_id='mss', fatal=False))
2402             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2403                 http_count += 1
2404                 formats.append({
2405                     'url': src_url,
2406                     'ext': ext or src_ext or 'flv',
2407                     'format_id': 'http-%d' % (bitrate or http_count),
2408                     'tbr': bitrate,
2409                     'filesize': filesize,
2410                     'width': width,
2411                     'height': height,
2412                 })
2413
2414         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2415             src = medium.get('src')
2416             if not src or src in srcs:
2417                 continue
2418             srcs.add(src)
2419
2420             imgs_count += 1
2421             formats.append({
2422                 'format_id': 'imagestream-%d' % (imgs_count),
2423                 'url': src,
2424                 'ext': mimetype2ext(medium.get('type')),
2425                 'acodec': 'none',
2426                 'vcodec': 'none',
2427                 'width': int_or_none(medium.get('width')),
2428                 'height': int_or_none(medium.get('height')),
2429                 'format_note': 'SMIL storyboards',
2430             })
2431
2432         return formats
2433
2434     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2435         urls = []
2436         subtitles = {}
2437         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2438             src = textstream.get('src')
2439             if not src or src in urls:
2440                 continue
2441             urls.append(src)
2442             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2443             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2444             subtitles.setdefault(lang, []).append({
2445                 'url': src,
2446                 'ext': ext,
2447             })
2448         return subtitles
2449
2450     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2451         res = self._download_xml_handle(
2452             xspf_url, playlist_id, 'Downloading xpsf playlist',
2453             'Unable to download xspf manifest', fatal=fatal)
2454         if res is False:
2455             return []
2456
2457         xspf, urlh = res
2458         xspf_url = urlh.geturl()
2459
2460         return self._parse_xspf(
2461             xspf, playlist_id, xspf_url=xspf_url,
2462             xspf_base_url=base_url(xspf_url))
2463
2464     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2465         NS_MAP = {
2466             'xspf': 'http://xspf.org/ns/0/',
2467             's1': 'http://static.streamone.nl/player/ns/0',
2468         }
2469
2470         entries = []
2471         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2472             title = xpath_text(
2473                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2474             description = xpath_text(
2475                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2476             thumbnail = xpath_text(
2477                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2478             duration = float_or_none(
2479                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2480
2481             formats = []
2482             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2483                 format_url = urljoin(xspf_base_url, location.text)
2484                 if not format_url:
2485                     continue
2486                 formats.append({
2487                     'url': format_url,
2488                     'manifest_url': xspf_url,
2489                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2490                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2491                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2492                 })
2493
2494             entries.append({
2495                 'id': playlist_id,
2496                 'title': title,
2497                 'description': description,
2498                 'thumbnail': thumbnail,
2499                 'duration': duration,
2500                 'formats': formats,
2501             })
2502         return entries
2503
2504     def _extract_mpd_formats(self, *args, **kwargs):
2505         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2506         if subs:
2507             self._report_ignoring_subs('DASH')
2508         return fmts
2509
2510     def _extract_mpd_formats_and_subtitles(
2511             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2512             fatal=True, data=None, headers={}, query={}):
2513
2514         if self.get_param('ignore_no_formats_error'):
2515             fatal = False
2516
2517         res = self._download_xml_handle(
2518             mpd_url, video_id,
2519             note='Downloading MPD manifest' if note is None else note,
2520             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2521             fatal=fatal, data=data, headers=headers, query=query)
2522         if res is False:
2523             return [], {}
2524         mpd_doc, urlh = res
2525         if mpd_doc is None:
2526             return [], {}
2527
2528         # We could have been redirected to a new url when we retrieved our mpd file.
2529         mpd_url = urlh.geturl()
2530         mpd_base_url = base_url(mpd_url)
2531
2532         return self._parse_mpd_formats_and_subtitles(
2533             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2534
2535     def _parse_mpd_formats(self, *args, **kwargs):
2536         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2537         if subs:
2538             self._report_ignoring_subs('DASH')
2539         return fmts
2540
2541     def _parse_mpd_formats_and_subtitles(
2542             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2543         """
2544         Parse formats from MPD manifest.
2545         References:
2546          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2547             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2548          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2549         """
2550         if not self.get_param('dynamic_mpd', True):
2551             if mpd_doc.get('type') == 'dynamic':
2552                 return [], {}
2553
2554         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2555
2556         def _add_ns(path):
2557             return self._xpath_ns(path, namespace)
2558
2559         def is_drm_protected(element):
2560             return element.find(_add_ns('ContentProtection')) is not None
2561
2562         def extract_multisegment_info(element, ms_parent_info):
2563             ms_info = ms_parent_info.copy()
2564
2565             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2566             # common attributes and elements.  We will only extract relevant
2567             # for us.
2568             def extract_common(source):
2569                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2570                 if segment_timeline is not None:
2571                     s_e = segment_timeline.findall(_add_ns('S'))
2572                     if s_e:
2573                         ms_info['total_number'] = 0
2574                         ms_info['s'] = []
2575                         for s in s_e:
2576                             r = int(s.get('r', 0))
2577                             ms_info['total_number'] += 1 + r
2578                             ms_info['s'].append({
2579                                 't': int(s.get('t', 0)),
2580                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2581                                 'd': int(s.attrib['d']),
2582                                 'r': r,
2583                             })
2584                 start_number = source.get('startNumber')
2585                 if start_number:
2586                     ms_info['start_number'] = int(start_number)
2587                 timescale = source.get('timescale')
2588                 if timescale:
2589                     ms_info['timescale'] = int(timescale)
2590                 segment_duration = source.get('duration')
2591                 if segment_duration:
2592                     ms_info['segment_duration'] = float(segment_duration)
2593
2594             def extract_Initialization(source):
2595                 initialization = source.find(_add_ns('Initialization'))
2596                 if initialization is not None:
2597                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2598
2599             segment_list = element.find(_add_ns('SegmentList'))
2600             if segment_list is not None:
2601                 extract_common(segment_list)
2602                 extract_Initialization(segment_list)
2603                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2604                 if segment_urls_e:
2605                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2606             else:
2607                 segment_template = element.find(_add_ns('SegmentTemplate'))
2608                 if segment_template is not None:
2609                     extract_common(segment_template)
2610                     media = segment_template.get('media')
2611                     if media:
2612                         ms_info['media'] = media
2613                     initialization = segment_template.get('initialization')
2614                     if initialization:
2615                         ms_info['initialization'] = initialization
2616                     else:
2617                         extract_Initialization(segment_template)
2618             return ms_info
2619
2620         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2621         formats, subtitles = [], {}
2622         stream_numbers = collections.defaultdict(int)
2623         for period in mpd_doc.findall(_add_ns('Period')):
2624             period_duration = parse_duration(period.get('duration')) or mpd_duration
2625             period_ms_info = extract_multisegment_info(period, {
2626                 'start_number': 1,
2627                 'timescale': 1,
2628             })
2629             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2630                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2631                 for representation in adaptation_set.findall(_add_ns('Representation')):
2632                     representation_attrib = adaptation_set.attrib.copy()
2633                     representation_attrib.update(representation.attrib)
2634                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2635                     mime_type = representation_attrib['mimeType']
2636                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2637
2638                     codec_str = representation_attrib.get('codecs', '')
2639                     # Some kind of binary subtitle found in some youtube livestreams
2640                     if mime_type == 'application/x-rawcc':
2641                         codecs = {'scodec': codec_str}
2642                     else:
2643                         codecs = parse_codecs(codec_str)
2644                     if content_type not in ('video', 'audio', 'text'):
2645                         if mime_type == 'image/jpeg':
2646                             content_type = mime_type
2647                         elif codecs.get('vcodec', 'none') != 'none':
2648                             content_type = 'video'
2649                         elif codecs.get('acodec', 'none') != 'none':
2650                             content_type = 'audio'
2651                         elif codecs.get('scodec', 'none') != 'none':
2652                             content_type = 'text'
2653                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2654                             content_type = 'text'
2655                         else:
2656                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2657                             continue
2658
2659                     base_url = ''
2660                     for element in (representation, adaptation_set, period, mpd_doc):
2661                         base_url_e = element.find(_add_ns('BaseURL'))
2662                         if try_call(lambda: base_url_e.text) is not None:
2663                             base_url = base_url_e.text + base_url
2664                             if re.match(r'^https?://', base_url):
2665                                 break
2666                     if mpd_base_url and base_url.startswith('/'):
2667                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2668                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2669                         if not mpd_base_url.endswith('/'):
2670                             mpd_base_url += '/'
2671                         base_url = mpd_base_url + base_url
2672                     representation_id = representation_attrib.get('id')
2673                     lang = representation_attrib.get('lang')
2674                     url_el = representation.find(_add_ns('BaseURL'))
2675                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2676                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2677                     if representation_id is not None:
2678                         format_id = representation_id
2679                     else:
2680                         format_id = content_type
2681                     if mpd_id:
2682                         format_id = mpd_id + '-' + format_id
2683                     if content_type in ('video', 'audio'):
2684                         f = {
2685                             'format_id': format_id,
2686                             'manifest_url': mpd_url,
2687                             'ext': mimetype2ext(mime_type),
2688                             'width': int_or_none(representation_attrib.get('width')),
2689                             'height': int_or_none(representation_attrib.get('height')),
2690                             'tbr': float_or_none(bandwidth, 1000),
2691                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2692                             'fps': int_or_none(representation_attrib.get('frameRate')),
2693                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2694                             'format_note': 'DASH %s' % content_type,
2695                             'filesize': filesize,
2696                             'container': mimetype2ext(mime_type) + '_dash',
2697                             **codecs
2698                         }
2699                     elif content_type == 'text':
2700                         f = {
2701                             'ext': mimetype2ext(mime_type),
2702                             'manifest_url': mpd_url,
2703                             'filesize': filesize,
2704                         }
2705                     elif content_type == 'image/jpeg':
2706                         # See test case in VikiIE
2707                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2708                         f = {
2709                             'format_id': format_id,
2710                             'ext': 'mhtml',
2711                             'manifest_url': mpd_url,
2712                             'format_note': 'DASH storyboards (jpeg)',
2713                             'acodec': 'none',
2714                             'vcodec': 'none',
2715                         }
2716                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2717                         f['has_drm'] = True
2718                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2719
2720                     def prepare_template(template_name, identifiers):
2721                         tmpl = representation_ms_info[template_name]
2722                         if representation_id is not None:
2723                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2724                         # First of, % characters outside $...$ templates
2725                         # must be escaped by doubling for proper processing
2726                         # by % operator string formatting used further (see
2727                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2728                         t = ''
2729                         in_template = False
2730                         for c in tmpl:
2731                             t += c
2732                             if c == '$':
2733                                 in_template = not in_template
2734                             elif c == '%' and not in_template:
2735                                 t += c
2736                         # Next, $...$ templates are translated to their
2737                         # %(...) counterparts to be used with % operator
2738                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2739                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2740                         t.replace('$$', '$')
2741                         return t
2742
2743                     # @initialization is a regular template like @media one
2744                     # so it should be handled just the same way (see
2745                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2746                     if 'initialization' in representation_ms_info:
2747                         initialization_template = prepare_template(
2748                             'initialization',
2749                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2750                             # $Time$ shall not be included for @initialization thus
2751                             # only $Bandwidth$ remains
2752                             ('Bandwidth', ))
2753                         representation_ms_info['initialization_url'] = initialization_template % {
2754                             'Bandwidth': bandwidth,
2755                         }
2756
2757                     def location_key(location):
2758                         return 'url' if re.match(r'^https?://', location) else 'path'
2759
2760                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2761
2762                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2763                         media_location_key = location_key(media_template)
2764
2765                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2766                         # can't be used at the same time
2767                         if '%(Number' in media_template and 's' not in representation_ms_info:
2768                             segment_duration = None
2769                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2770                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2771                                 representation_ms_info['total_number'] = int(math.ceil(
2772                                     float_or_none(period_duration, segment_duration, default=0)))
2773                             representation_ms_info['fragments'] = [{
2774                                 media_location_key: media_template % {
2775                                     'Number': segment_number,
2776                                     'Bandwidth': bandwidth,
2777                                 },
2778                                 'duration': segment_duration,
2779                             } for segment_number in range(
2780                                 representation_ms_info['start_number'],
2781                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2782                         else:
2783                             # $Number*$ or $Time$ in media template with S list available
2784                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2785                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2786                             representation_ms_info['fragments'] = []
2787                             segment_time = 0
2788                             segment_d = None
2789                             segment_number = representation_ms_info['start_number']
2790
2791                             def add_segment_url():
2792                                 segment_url = media_template % {
2793                                     'Time': segment_time,
2794                                     'Bandwidth': bandwidth,
2795                                     'Number': segment_number,
2796                                 }
2797                                 representation_ms_info['fragments'].append({
2798                                     media_location_key: segment_url,
2799                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2800                                 })
2801
2802                             for num, s in enumerate(representation_ms_info['s']):
2803                                 segment_time = s.get('t') or segment_time
2804                                 segment_d = s['d']
2805                                 add_segment_url()
2806                                 segment_number += 1
2807                                 for r in range(s.get('r', 0)):
2808                                     segment_time += segment_d
2809                                     add_segment_url()
2810                                     segment_number += 1
2811                                 segment_time += segment_d
2812                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2813                         # No media template,
2814                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2815                         # or any YouTube dashsegments video
2816                         fragments = []
2817                         segment_index = 0
2818                         timescale = representation_ms_info['timescale']
2819                         for s in representation_ms_info['s']:
2820                             duration = float_or_none(s['d'], timescale)
2821                             for r in range(s.get('r', 0) + 1):
2822                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2823                                 fragments.append({
2824                                     location_key(segment_uri): segment_uri,
2825                                     'duration': duration,
2826                                 })
2827                                 segment_index += 1
2828                         representation_ms_info['fragments'] = fragments
2829                     elif 'segment_urls' in representation_ms_info:
2830                         # Segment URLs with no SegmentTimeline
2831                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2832                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2833                         fragments = []
2834                         segment_duration = float_or_none(
2835                             representation_ms_info['segment_duration'],
2836                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2837                         for segment_url in representation_ms_info['segment_urls']:
2838                             fragment = {
2839                                 location_key(segment_url): segment_url,
2840                             }
2841                             if segment_duration:
2842                                 fragment['duration'] = segment_duration
2843                             fragments.append(fragment)
2844                         representation_ms_info['fragments'] = fragments
2845                     # If there is a fragments key available then we correctly recognized fragmented media.
2846                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2847                     # assumption is not necessarily correct since we may simply have no support for
2848                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2849                     if 'fragments' in representation_ms_info:
2850                         f.update({
2851                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2852                             'url': mpd_url or base_url,
2853                             'fragment_base_url': base_url,
2854                             'fragments': [],
2855                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2856                         })
2857                         if 'initialization_url' in representation_ms_info:
2858                             initialization_url = representation_ms_info['initialization_url']
2859                             if not f.get('url'):
2860                                 f['url'] = initialization_url
2861                             f['fragments'].append({location_key(initialization_url): initialization_url})
2862                         f['fragments'].extend(representation_ms_info['fragments'])
2863                         if not period_duration:
2864                             period_duration = try_get(
2865                                 representation_ms_info,
2866                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2867                     else:
2868                         # Assuming direct URL to unfragmented media.
2869                         f['url'] = base_url
2870                     if content_type in ('video', 'audio', 'image/jpeg'):
2871                         f['manifest_stream_number'] = stream_numbers[f['url']]
2872                         stream_numbers[f['url']] += 1
2873                         formats.append(f)
2874                     elif content_type == 'text':
2875                         subtitles.setdefault(lang or 'und', []).append(f)
2876
2877         return formats, subtitles
2878
2879     def _extract_ism_formats(self, *args, **kwargs):
2880         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2881         if subs:
2882             self._report_ignoring_subs('ISM')
2883         return fmts
2884
2885     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2886         if self.get_param('ignore_no_formats_error'):
2887             fatal = False
2888
2889         res = self._download_xml_handle(
2890             ism_url, video_id,
2891             note='Downloading ISM manifest' if note is None else note,
2892             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2893             fatal=fatal, data=data, headers=headers, query=query)
2894         if res is False:
2895             return [], {}
2896         ism_doc, urlh = res
2897         if ism_doc is None:
2898             return [], {}
2899
2900         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2901
2902     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2903         """
2904         Parse formats from ISM manifest.
2905         References:
2906          1. [MS-SSTR]: Smooth Streaming Protocol,
2907             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2908         """
2909         if ism_doc.get('IsLive') == 'TRUE':
2910             return [], {}
2911
2912         duration = int(ism_doc.attrib['Duration'])
2913         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2914
2915         formats = []
2916         subtitles = {}
2917         for stream in ism_doc.findall('StreamIndex'):
2918             stream_type = stream.get('Type')
2919             if stream_type not in ('video', 'audio', 'text'):
2920                 continue
2921             url_pattern = stream.attrib['Url']
2922             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2923             stream_name = stream.get('Name')
2924             stream_language = stream.get('Language', 'und')
2925             for track in stream.findall('QualityLevel'):
2926                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2927                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2928                 # TODO: add support for WVC1 and WMAP
2929                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2930                     self.report_warning('%s is not a supported codec' % fourcc)
2931                     continue
2932                 tbr = int(track.attrib['Bitrate']) // 1000
2933                 # [1] does not mention Width and Height attributes. However,
2934                 # they're often present while MaxWidth and MaxHeight are
2935                 # missing, so should be used as fallbacks
2936                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2937                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2938                 sampling_rate = int_or_none(track.get('SamplingRate'))
2939
2940                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2941                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2942
2943                 fragments = []
2944                 fragment_ctx = {
2945                     'time': 0,
2946                 }
2947                 stream_fragments = stream.findall('c')
2948                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2949                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2950                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2951                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2952                     if not fragment_ctx['duration']:
2953                         try:
2954                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2955                         except IndexError:
2956                             next_fragment_time = duration
2957                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2958                     for _ in range(fragment_repeat):
2959                         fragments.append({
2960                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2961                             'duration': fragment_ctx['duration'] / stream_timescale,
2962                         })
2963                         fragment_ctx['time'] += fragment_ctx['duration']
2964
2965                 if stream_type == 'text':
2966                     subtitles.setdefault(stream_language, []).append({
2967                         'ext': 'ismt',
2968                         'protocol': 'ism',
2969                         'url': ism_url,
2970                         'manifest_url': ism_url,
2971                         'fragments': fragments,
2972                         '_download_params': {
2973                             'stream_type': stream_type,
2974                             'duration': duration,
2975                             'timescale': stream_timescale,
2976                             'fourcc': fourcc,
2977                             'language': stream_language,
2978                             'codec_private_data': track.get('CodecPrivateData'),
2979                         }
2980                     })
2981                 elif stream_type in ('video', 'audio'):
2982                     formats.append({
2983                         'format_id': join_nonempty(ism_id, stream_name, tbr),
2984                         'url': ism_url,
2985                         'manifest_url': ism_url,
2986                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2987                         'width': width,
2988                         'height': height,
2989                         'tbr': tbr,
2990                         'asr': sampling_rate,
2991                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2992                         'acodec': 'none' if stream_type == 'video' else fourcc,
2993                         'protocol': 'ism',
2994                         'fragments': fragments,
2995                         'has_drm': ism_doc.find('Protection') is not None,
2996                         'language': stream_language,
2997                         'audio_channels': int_or_none(track.get('Channels')),
2998                         '_download_params': {
2999                             'stream_type': stream_type,
3000                             'duration': duration,
3001                             'timescale': stream_timescale,
3002                             'width': width or 0,
3003                             'height': height or 0,
3004                             'fourcc': fourcc,
3005                             'language': stream_language,
3006                             'codec_private_data': track.get('CodecPrivateData'),
3007                             'sampling_rate': sampling_rate,
3008                             'channels': int_or_none(track.get('Channels', 2)),
3009                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3010                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3011                         },
3012                     })
3013         return formats, subtitles
3014
3015     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3016         def absolute_url(item_url):
3017             return urljoin(base_url, item_url)
3018
3019         def parse_content_type(content_type):
3020             if not content_type:
3021                 return {}
3022             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3023             if ctr:
3024                 mimetype, codecs = ctr.groups()
3025                 f = parse_codecs(codecs)
3026                 f['ext'] = mimetype2ext(mimetype)
3027                 return f
3028             return {}
3029
3030         def _media_formats(src, cur_media_type, type_info=None):
3031             type_info = type_info or {}
3032             full_url = absolute_url(src)
3033             ext = type_info.get('ext') or determine_ext(full_url)
3034             if ext == 'm3u8':
3035                 is_plain_url = False
3036                 formats = self._extract_m3u8_formats(
3037                     full_url, video_id, ext='mp4',
3038                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3039                     preference=preference, quality=quality, fatal=False)
3040             elif ext == 'mpd':
3041                 is_plain_url = False
3042                 formats = self._extract_mpd_formats(
3043                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3044             else:
3045                 is_plain_url = True
3046                 formats = [{
3047                     'url': full_url,
3048                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3049                     'ext': ext,
3050                 }]
3051             return is_plain_url, formats
3052
3053         entries = []
3054         # amp-video and amp-audio are very similar to their HTML5 counterparts
3055         # so we will include them right here (see
3056         # https://www.ampproject.org/docs/reference/components/amp-video)
3057         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3058         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3059         media_tags = [(media_tag, media_tag_name, media_type, '')
3060                       for media_tag, media_tag_name, media_type
3061                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3062         media_tags.extend(re.findall(
3063             # We only allow video|audio followed by a whitespace or '>'.
3064             # Allowing more characters may end up in significant slow down (see
3065             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3066             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3067             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3068         for media_tag, _, media_type, media_content in media_tags:
3069             media_info = {
3070                 'formats': [],
3071                 'subtitles': {},
3072             }
3073             media_attributes = extract_attributes(media_tag)
3074             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3075             if src:
3076                 f = parse_content_type(media_attributes.get('type'))
3077                 _, formats = _media_formats(src, media_type, f)
3078                 media_info['formats'].extend(formats)
3079             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3080             if media_content:
3081                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3082                     s_attr = extract_attributes(source_tag)
3083                     # data-video-src and data-src are non standard but seen
3084                     # several times in the wild
3085                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3086                     if not src:
3087                         continue
3088                     f = parse_content_type(s_attr.get('type'))
3089                     is_plain_url, formats = _media_formats(src, media_type, f)
3090                     if is_plain_url:
3091                         # width, height, res, label and title attributes are
3092                         # all not standard but seen several times in the wild
3093                         labels = [
3094                             s_attr.get(lbl)
3095                             for lbl in ('label', 'title')
3096                             if str_or_none(s_attr.get(lbl))
3097                         ]
3098                         width = int_or_none(s_attr.get('width'))
3099                         height = (int_or_none(s_attr.get('height'))
3100                                   or int_or_none(s_attr.get('res')))
3101                         if not width or not height:
3102                             for lbl in labels:
3103                                 resolution = parse_resolution(lbl)
3104                                 if not resolution:
3105                                     continue
3106                                 width = width or resolution.get('width')
3107                                 height = height or resolution.get('height')
3108                         for lbl in labels:
3109                             tbr = parse_bitrate(lbl)
3110                             if tbr:
3111                                 break
3112                         else:
3113                             tbr = None
3114                         f.update({
3115                             'width': width,
3116                             'height': height,
3117                             'tbr': tbr,
3118                             'format_id': s_attr.get('label') or s_attr.get('title'),
3119                         })
3120                         f.update(formats[0])
3121                         media_info['formats'].append(f)
3122                     else:
3123                         media_info['formats'].extend(formats)
3124                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3125                     track_attributes = extract_attributes(track_tag)
3126                     kind = track_attributes.get('kind')
3127                     if not kind or kind in ('subtitles', 'captions'):
3128                         src = strip_or_none(track_attributes.get('src'))
3129                         if not src:
3130                             continue
3131                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3132                         media_info['subtitles'].setdefault(lang, []).append({
3133                             'url': absolute_url(src),
3134                         })
3135             for f in media_info['formats']:
3136                 f.setdefault('http_headers', {})['Referer'] = base_url
3137             if media_info['formats'] or media_info['subtitles']:
3138                 entries.append(media_info)
3139         return entries
3140
3141     def _extract_akamai_formats(self, *args, **kwargs):
3142         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3143         if subs:
3144             self._report_ignoring_subs('akamai')
3145         return fmts
3146
3147     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3148         signed = 'hdnea=' in manifest_url
3149         if not signed:
3150             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3151             manifest_url = re.sub(
3152                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3153                 '', manifest_url).strip('?')
3154
3155         formats = []
3156         subtitles = {}
3157
3158         hdcore_sign = 'hdcore=3.7.0'
3159         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3160         hds_host = hosts.get('hds')
3161         if hds_host:
3162             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3163         if 'hdcore=' not in f4m_url:
3164             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3165         f4m_formats = self._extract_f4m_formats(
3166             f4m_url, video_id, f4m_id='hds', fatal=False)
3167         for entry in f4m_formats:
3168             entry.update({'extra_param_to_segment_url': hdcore_sign})
3169         formats.extend(f4m_formats)
3170
3171         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3172         hls_host = hosts.get('hls')
3173         if hls_host:
3174             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3175         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3176             m3u8_url, video_id, 'mp4', 'm3u8_native',
3177             m3u8_id='hls', fatal=False)
3178         formats.extend(m3u8_formats)
3179         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3180
3181         http_host = hosts.get('http')
3182         if http_host and m3u8_formats and not signed:
3183             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3184             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3185             qualities_length = len(qualities)
3186             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3187                 i = 0
3188                 for f in m3u8_formats:
3189                     if f['vcodec'] != 'none':
3190                         for protocol in ('http', 'https'):
3191                             http_f = f.copy()
3192                             del http_f['manifest_url']
3193                             http_url = re.sub(
3194                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3195                             http_f.update({
3196                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3197                                 'url': http_url,
3198                                 'protocol': protocol,
3199                             })
3200                             formats.append(http_f)
3201                         i += 1
3202
3203         return formats, subtitles
3204
3205     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3206         query = urllib.parse.urlparse(url).query
3207         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3208         mobj = re.search(
3209             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3210         url_base = mobj.group('url')
3211         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3212         formats = []
3213
3214         def manifest_url(manifest):
3215             m_url = f'{http_base_url}/{manifest}'
3216             if query:
3217                 m_url += '?%s' % query
3218             return m_url
3219
3220         if 'm3u8' not in skip_protocols:
3221             formats.extend(self._extract_m3u8_formats(
3222                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3223                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3224         if 'f4m' not in skip_protocols:
3225             formats.extend(self._extract_f4m_formats(
3226                 manifest_url('manifest.f4m'),
3227                 video_id, f4m_id='hds', fatal=False))
3228         if 'dash' not in skip_protocols:
3229             formats.extend(self._extract_mpd_formats(
3230                 manifest_url('manifest.mpd'),
3231                 video_id, mpd_id='dash', fatal=False))
3232         if re.search(r'(?:/smil:|\.smil)', url_base):
3233             if 'smil' not in skip_protocols:
3234                 rtmp_formats = self._extract_smil_formats(
3235                     manifest_url('jwplayer.smil'),
3236                     video_id, fatal=False)
3237                 for rtmp_format in rtmp_formats:
3238                     rtsp_format = rtmp_format.copy()
3239                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3240                     del rtsp_format['play_path']
3241                     del rtsp_format['ext']
3242                     rtsp_format.update({
3243                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3244                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3245                         'protocol': 'rtsp',
3246                     })
3247                     formats.extend([rtmp_format, rtsp_format])
3248         else:
3249             for protocol in ('rtmp', 'rtsp'):
3250                 if protocol not in skip_protocols:
3251                     formats.append({
3252                         'url': f'{protocol}:{url_base}',
3253                         'format_id': protocol,
3254                         'protocol': protocol,
3255                     })
3256         return formats
3257
3258     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3259         mobj = re.search(
3260             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3261             webpage)
3262         if mobj:
3263             try:
3264                 jwplayer_data = self._parse_json(mobj.group('options'),
3265                                                  video_id=video_id,
3266                                                  transform_source=transform_source)
3267             except ExtractorError:
3268                 pass
3269             else:
3270                 if isinstance(jwplayer_data, dict):
3271                     return jwplayer_data
3272
3273     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3274         jwplayer_data = self._find_jwplayer_data(
3275             webpage, video_id, transform_source=js_to_json)
3276         return self._parse_jwplayer_data(
3277             jwplayer_data, video_id, *args, **kwargs)
3278
3279     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3280                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3281         entries = []
3282         if not isinstance(jwplayer_data, dict):
3283             return entries
3284
3285         playlist_items = jwplayer_data.get('playlist')
3286         # JWPlayer backward compatibility: single playlist item/flattened playlists
3287         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3288         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3289         if not isinstance(playlist_items, list):
3290             playlist_items = (playlist_items or jwplayer_data, )
3291
3292         for video_data in playlist_items:
3293             if not isinstance(video_data, dict):
3294                 continue
3295             # JWPlayer backward compatibility: flattened sources
3296             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3297             if 'sources' not in video_data:
3298                 video_data['sources'] = [video_data]
3299
3300             this_video_id = video_id or video_data['mediaid']
3301
3302             formats = self._parse_jwplayer_formats(
3303                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3304                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3305
3306             subtitles = {}
3307             tracks = video_data.get('tracks')
3308             if tracks and isinstance(tracks, list):
3309                 for track in tracks:
3310                     if not isinstance(track, dict):
3311                         continue
3312                     track_kind = track.get('kind')
3313                     if not track_kind or not isinstance(track_kind, str):
3314                         continue
3315                     if track_kind.lower() not in ('captions', 'subtitles'):
3316                         continue
3317                     track_url = urljoin(base_url, track.get('file'))
3318                     if not track_url:
3319                         continue
3320                     subtitles.setdefault(track.get('label') or 'en', []).append({
3321                         'url': self._proto_relative_url(track_url)
3322                     })
3323
3324             entry = {
3325                 'id': this_video_id,
3326                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3327                 'description': clean_html(video_data.get('description')),
3328                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3329                 'timestamp': int_or_none(video_data.get('pubdate')),
3330                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3331                 'subtitles': subtitles,
3332                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3333                 'genre': clean_html(video_data.get('genre')),
3334                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3335                 'season_number': int_or_none(video_data.get('season')),
3336                 'episode_number': int_or_none(video_data.get('episode')),
3337                 'release_year': int_or_none(video_data.get('releasedate')),
3338                 'age_limit': int_or_none(video_data.get('age_restriction')),
3339             }
3340             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3341             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3342                 entry.update({
3343                     '_type': 'url_transparent',
3344                     'url': formats[0]['url'],
3345                 })
3346             else:
3347                 entry['formats'] = formats
3348             entries.append(entry)
3349         if len(entries) == 1:
3350             return entries[0]
3351         else:
3352             return self.playlist_result(entries)
3353
3354     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3355                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3356         urls = set()
3357         formats = []
3358         for source in jwplayer_sources_data:
3359             if not isinstance(source, dict):
3360                 continue
3361             source_url = urljoin(
3362                 base_url, self._proto_relative_url(source.get('file')))
3363             if not source_url or source_url in urls:
3364                 continue
3365             urls.add(source_url)
3366             source_type = source.get('type') or ''
3367             ext = mimetype2ext(source_type) or determine_ext(source_url)
3368             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3369                 formats.extend(self._extract_m3u8_formats(
3370                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3371                     m3u8_id=m3u8_id, fatal=False))
3372             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3373                 formats.extend(self._extract_mpd_formats(
3374                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3375             elif ext == 'smil':
3376                 formats.extend(self._extract_smil_formats(
3377                     source_url, video_id, fatal=False))
3378             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3379             elif source_type.startswith('audio') or ext in (
3380                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3381                 formats.append({
3382                     'url': source_url,
3383                     'vcodec': 'none',
3384                     'ext': ext,
3385                 })
3386             else:
3387                 format_id = str_or_none(source.get('label'))
3388                 height = int_or_none(source.get('height'))
3389                 if height is None and format_id:
3390                     # Often no height is provided but there is a label in
3391                     # format like "1080p", "720p SD", or 1080.
3392                     height = parse_resolution(format_id).get('height')
3393                 a_format = {
3394                     'url': source_url,
3395                     'width': int_or_none(source.get('width')),
3396                     'height': height,
3397                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3398                     'filesize': int_or_none(source.get('filesize')),
3399                     'ext': ext,
3400                     'format_id': format_id
3401                 }
3402                 if source_url.startswith('rtmp'):
3403                     a_format['ext'] = 'flv'
3404                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3405                     # of jwplayer.flash.swf
3406                     rtmp_url_parts = re.split(
3407                         r'((?:mp4|mp3|flv):)', source_url, 1)
3408                     if len(rtmp_url_parts) == 3:
3409                         rtmp_url, prefix, play_path = rtmp_url_parts
3410                         a_format.update({
3411                             'url': rtmp_url,
3412                             'play_path': prefix + play_path,
3413                         })
3414                     if rtmp_params:
3415                         a_format.update(rtmp_params)
3416                 formats.append(a_format)
3417         return formats
3418
3419     def _live_title(self, name):
3420         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3421         return name
3422
3423     def _int(self, v, name, fatal=False, **kwargs):
3424         res = int_or_none(v, **kwargs)
3425         if res is None:
3426             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3427             if fatal:
3428                 raise ExtractorError(msg)
3429             else:
3430                 self.report_warning(msg)
3431         return res
3432
3433     def _float(self, v, name, fatal=False, **kwargs):
3434         res = float_or_none(v, **kwargs)
3435         if res is None:
3436             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3437             if fatal:
3438                 raise ExtractorError(msg)
3439             else:
3440                 self.report_warning(msg)
3441         return res
3442
3443     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3444                     path='/', secure=False, discard=False, rest={}, **kwargs):
3445         cookie = http.cookiejar.Cookie(
3446             0, name, value, port, port is not None, domain, True,
3447             domain.startswith('.'), path, True, secure, expire_time,
3448             discard, None, None, rest)
3449         self.cookiejar.set_cookie(cookie)
3450
3451     def _get_cookies(self, url):
3452         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3453         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3454
3455     def _apply_first_set_cookie_header(self, url_handle, cookie):
3456         """
3457         Apply first Set-Cookie header instead of the last. Experimental.
3458
3459         Some sites (e.g. [1-3]) may serve two cookies under the same name
3460         in Set-Cookie header and expect the first (old) one to be set rather
3461         than second (new). However, as of RFC6265 the newer one cookie
3462         should be set into cookie store what actually happens.
3463         We will workaround this issue by resetting the cookie to
3464         the first one manually.
3465         1. https://new.vk.com/
3466         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3467         3. https://learning.oreilly.com/
3468         """
3469         for header, cookies in url_handle.headers.items():
3470             if header.lower() != 'set-cookie':
3471                 continue
3472             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3473             cookie_value = re.search(
3474                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3475             if cookie_value:
3476                 value, domain = cookie_value.groups()
3477                 self._set_cookie(domain, cookie, value)
3478                 break
3479
3480     @classmethod
3481     def get_testcases(cls, include_onlymatching=False):
3482         # Do not look in super classes
3483         t = vars(cls).get('_TEST')
3484         if t:
3485             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3486             tests = [t]
3487         else:
3488             tests = vars(cls).get('_TESTS', [])
3489         for t in tests:
3490             if not include_onlymatching and t.get('only_matching', False):
3491                 continue
3492             t['name'] = cls.ie_key()
3493             yield t
3494         if getattr(cls, '__wrapped__', None):
3495             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3496
3497     @classmethod
3498     def get_webpage_testcases(cls):
3499         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3500         for t in tests:
3501             t['name'] = cls.ie_key()
3502             yield t
3503         if getattr(cls, '__wrapped__', None):
3504             yield from cls.__wrapped__.get_webpage_testcases()
3505
3506     @classproperty(cache=True)
3507     def age_limit(cls):
3508         """Get age limit from the testcases"""
3509         return max(traverse_obj(
3510             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3511             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3512
3513     @classproperty(cache=True)
3514     def _RETURN_TYPE(cls):
3515         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3516         tests = tuple(cls.get_testcases(include_onlymatching=False))
3517         if not tests:
3518             return None
3519         elif not any(k.startswith('playlist') for test in tests for k in test):
3520             return 'video'
3521         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3522             return 'playlist'
3523         return 'any'
3524
3525     @classmethod
3526     def is_single_video(cls, url):
3527         """Returns whether the URL is of a single video, None if unknown"""
3528         if cls.suitable(url):
3529             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3530
3531     @classmethod
3532     def is_suitable(cls, age_limit):
3533         """Test whether the extractor is generally suitable for the given age limit"""
3534         return not age_restricted(cls.age_limit, age_limit)
3535
3536     @classmethod
3537     def description(cls, *, markdown=True, search_examples=None):
3538         """Description of the extractor"""
3539         desc = ''
3540         if cls._NETRC_MACHINE:
3541             if markdown:
3542                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3543             else:
3544                 desc += f' [{cls._NETRC_MACHINE}]'
3545         if cls.IE_DESC is False:
3546             desc += ' [HIDDEN]'
3547         elif cls.IE_DESC:
3548             desc += f' {cls.IE_DESC}'
3549         if cls.SEARCH_KEY:
3550             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3551             if search_examples:
3552                 _COUNTS = ('', '5', '10', 'all')
3553                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3554         if not cls.working():
3555             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3556
3557         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3558         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3559         return f'{name}:{desc}' if desc else name
3560
3561     def extract_subtitles(self, *args, **kwargs):
3562         if (self.get_param('writesubtitles', False)
3563                 or self.get_param('listsubtitles')):
3564             return self._get_subtitles(*args, **kwargs)
3565         return {}
3566
3567     def _get_subtitles(self, *args, **kwargs):
3568         raise NotImplementedError('This method must be implemented by subclasses')
3569
    class CommentsDisabled(Exception):
        """
        Raise in _get_comments if comments are disabled for the video

        extract_comments catches this exception and reports both the comments
        and the comment count as None.
        """
3572
3573     def extract_comments(self, *args, **kwargs):
3574         if not self.get_param('getcomments'):
3575             return None
3576         generator = self._get_comments(*args, **kwargs)
3577
3578         def extractor():
3579             comments = []
3580             interrupted = True
3581             try:
3582                 while True:
3583                     comments.append(next(generator))
3584             except StopIteration:
3585                 interrupted = False
3586             except KeyboardInterrupt:
3587                 self.to_screen('Interrupted by user')
3588             except self.CommentsDisabled:
3589                 return {'comments': None, 'comment_count': None}
3590             except Exception as e:
3591                 if self.get_param('ignoreerrors') is not True:
3592                     raise
3593                 self._downloader.report_error(e)
3594             comment_count = len(comments)
3595             self.to_screen(f'Extracted {comment_count} comments')
3596             return {
3597                 'comments': comments,
3598                 'comment_count': None if interrupted else comment_count
3599             }
3600         return extractor
3601
3602     def _get_comments(self, *args, **kwargs):
3603         raise NotImplementedError('This method must be implemented by subclasses')
3604
3605     @staticmethod
3606     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3607         """ Merge subtitle items for one language. Items with duplicated URLs/data
3608         will be dropped. """
3609         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3610         ret = list(subtitle_list1)
3611         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3612         return ret
3613
3614     @classmethod
3615     def _merge_subtitles(cls, *dicts, target=None):
3616         """ Merge subtitle dictionaries, language by language. """
3617         if target is None:
3618             target = {}
3619         for d in dicts:
3620             for lang, subs in d.items():
3621                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3622         return target
3623
3624     def extract_automatic_captions(self, *args, **kwargs):
3625         if (self.get_param('writeautomaticsub', False)
3626                 or self.get_param('listsubtitles')):
3627             return self._get_automatic_captions(*args, **kwargs)
3628         return {}
3629
3630     def _get_automatic_captions(self, *args, **kwargs):
3631         raise NotImplementedError('This method must be implemented by subclasses')
3632
3633     @functools.cached_property
3634     def _cookies_passed(self):
3635         """Whether cookies have been passed to YoutubeDL"""
3636         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3637
3638     def mark_watched(self, *args, **kwargs):
3639         if not self.get_param('mark_watched', False):
3640             return
3641         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3642             self._mark_watched(*args, **kwargs)
3643
3644     def _mark_watched(self, *args, **kwargs):
3645         raise NotImplementedError('This method must be implemented by subclasses')
3646
3647     def geo_verification_headers(self):
3648         headers = {}
3649         geo_verification_proxy = self.get_param('geo_verification_proxy')
3650         if geo_verification_proxy:
3651             headers['Ytdl-request-proxy'] = geo_verification_proxy
3652         return headers
3653
3654     @staticmethod
3655     def _generic_id(url):
3656         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3657
3658     def _generic_title(self, url='', webpage='', *, default=None):
3659         return (self._og_search_title(webpage, default=None)
3660                 or self._html_extract_title(webpage, default=None)
3661                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3662                 or default)
3663
3664     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3665         if not duration:
3666             return
3667         chapter_list = [{
3668             'start_time': start_function(chapter),
3669             'title': title_function(chapter),
3670         } for chapter in chapter_list or []]
3671         if strict:
3672             warn = self.report_warning
3673         else:
3674             warn = self.write_debug
3675             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3676
3677         chapters = [{'start_time': 0}]
3678         for idx, chapter in enumerate(chapter_list):
3679             if chapter['start_time'] is None:
3680                 warn(f'Incomplete chapter {idx}')
3681             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3682                 chapters.append(chapter)
3683             elif chapter not in chapters:
3684                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3685                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3686                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3687         return chapters[1:]
3688
3689     def _extract_chapters_from_description(self, description, duration):
3690         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3691         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3692         return self._extract_chapters_helper(
3693             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3694             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3695             duration=duration, strict=False) or self._extract_chapters_helper(
3696             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3697             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3698             duration=duration, strict=False)
3699
3700     @staticmethod
3701     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3702         all_known = all(map(
3703             lambda x: x is not None,
3704             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3705         return (
3706             'private' if is_private
3707             else 'premium_only' if needs_premium
3708             else 'subscriber_only' if needs_subscription
3709             else 'needs_auth' if needs_auth
3710             else 'unlisted' if is_unlisted
3711             else 'public' if all_known
3712             else None)
3713
3714     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3715         '''
3716         @returns            A list of values for the extractor argument given by "key"
3717                             or "default" if no such key is present
3718         @param default      The default value to return when the key is not present (default: [])
3719         @param casesense    When false, the values are converted to lower case
3720         '''
3721         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3722         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3723         if val is None:
3724             return [] if default is NO_DEFAULT else default
3725         return list(val) if casesense else [x.lower() for x in val]
3726
3727     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3728         if not playlist_id or not video_id:
3729             return not video_id
3730
3731         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3732         if no_playlist is not None:
3733             return not no_playlist
3734
3735         video_id = '' if video_id is True else f' {video_id}'
3736         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3737         if self.get_param('noplaylist'):
3738             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3739             return False
3740         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3741         return True
3742
3743     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3744         RetryManager.report_retry(
3745             err, _count or int(fatal), _retries,
3746             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3747             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3748
3749     def RetryManager(self, **kwargs):
3750         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3751
3752     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3753         display_id = traverse_obj(info_dict, 'display_id', 'id')
3754         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3755         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3756             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3757
3758     @classmethod
3759     def extract_from_webpage(cls, ydl, url, webpage):
3760         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3761               else ydl.get_info_extractor(cls.ie_key()))
3762         for info in ie._extract_from_webpage(url, webpage) or []:
3763             # url = None since we do not want to set (webpage/original)_url
3764             ydl.add_default_extra_info(info, ie, None)
3765             yield info
3766
3767     @classmethod
3768     def _extract_from_webpage(cls, url, webpage):
3769         for embed_url in orderedSet(
3770                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3771             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3772
3773     @classmethod
3774     def _extract_embed_urls(cls, url, webpage):
3775         """@returns all the embed urls on the webpage"""
3776         if '_EMBED_URL_RE' not in cls.__dict__:
3777             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3778             for idx, regex in enumerate(cls._EMBED_REGEX):
3779                 assert regex.count('(?P<url>') == 1, \
3780                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3781             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3782
3783         for regex in cls._EMBED_URL_RE:
3784             for mobj in regex.finditer(webpage):
3785                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3786                 if cls._VALID_URL is False or cls.suitable(embed_url):
3787                     yield embed_url
3788
3789     class StopExtraction(Exception):
3790         pass
3791
3792     @classmethod
3793     def _extract_url(cls, webpage):  # TODO: Remove
3794         """Only for compatibility with some older extractors"""
3795         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3796
    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """Register `cls` as a plugin override when it is declared with `plugin_name`.

        A class defined with a truthy `plugin_name` class keyword wraps the
        class that follows it in its MRO: it takes over that class's ie_key,
        gets a combined IE_NAME, replaces the innermost wrapped class in that
        class's module, and is recorded in _PLUGIN_OVERRIDES.
        Without `plugin_name` only the regular __init_subclass__ chain runs.
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class right after `cls` in the MRO is the one being wrapped
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the chain of wrapped classes down to the one that wraps nothing
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind that class's name in its defining module to the new class
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)
3810
3811
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY<prefix>:{query} where the
    prefix is empty (one result), "all", or a positive integer
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Parse the search URL and fetch the number of results its prefix asks for"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError(f'invalid download number {requested} for query "{query}"')
        if requested > self._MAX_RESULTS:
            # Requests above the maximum are clamped, with a warning
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # An infinite count means "no limit", which islice expresses as None
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3855
3856
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose _VALID_URL matches any URL and whose extraction always raises UnsupportedError."""

    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        error = UnsupportedError(url)
        raise error
3864
3865
# Registry of plugin overrides: maps the innermost wrapped extractor class to the
# list of plugin classes that override it (filled in by InfoExtractor.__init_subclass__)
_PLUGIN_OVERRIDES = collections.defaultdict(list)