yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import hashlib
   4 import itertools
   5 import json
   6 import math
   7 import netrc
   8 import os
   9 import random
  10 import sys
  11 import time
  12 import xml.etree.ElementTree
  13
  14 from ..compat import functools, re  # isort: split
  15 from ..compat import (
  16     compat_cookiejar_Cookie,
  17     compat_cookies_SimpleCookie,
  18     compat_etree_fromstring,
  19     compat_expanduser,
  20     compat_getpass,
  21     compat_http_client,
  22     compat_os_name,
  23     compat_str,
  24     compat_urllib_error,
  25     compat_urllib_parse_unquote,
  26     compat_urllib_parse_urlencode,
  27     compat_urllib_request,
  28     compat_urlparse,
  29 )
  30 from ..downloader import FileDownloader
  31 from ..downloader.f4m import get_base_url, remove_encrypted_media
  32 from ..utils import (
  33     JSON_LD_RE,
  34     NO_DEFAULT,
  35     ExtractorError,
  36     GeoRestrictedError,
  37     GeoUtils,
  38     RegexNotFoundError,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     classproperty,
  44     clean_html,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     filter_dict,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     int_or_none,
  56     join_nonempty,
  57     js_to_json,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     sanitize_filename,
  68     sanitized_Request,
  69     str_or_none,
  70     str_to_int,
  71     strip_or_none,
  72     traverse_obj,
  73     try_get,
  74     unescapeHTML,
  75     unified_strdate,
  76     unified_timestamp,
  77     update_Request,
  78     update_url_query,
  79     url_basename,
  80     url_or_none,
  81     urljoin,
  82     variadic,
  83     xpath_element,
  84     xpath_text,
  85     xpath_with_ns,
  86 )
  87
  88
  89 class InfoExtractor:
  90     """Information Extractor class.
  91
  92     Information extractors are the classes that, given a URL, extract
  93     information about the video (or videos) the URL refers to. This
  94     information includes the real video URL, the video title, author and
  95     others. The information is stored in a dictionary which is then
  96     passed to the YoutubeDL. The YoutubeDL processes this
  97     information possibly downloading the video to the file system, among
  98     other possible outcomes.
  99
 100     The type field determines the type of the result.
 101     By far the most common value (and the default if _type is missing) is
 102     "video", which indicates a single video.
 103
 104     For a video, the dictionaries must include the following fields:
 105
 106     id:             Video identifier.
 107     title:          Video title, unescaped. Set to an empty string if video has
 108                     no title as opposed to "None" which signifies that the
 109                     extractor failed to obtain a title
 110
 111     Additionally, it must contain either a formats entry or a url one:
 112
 113     formats:        A list of dictionaries for each format available, ordered
 114                     from worst to best quality.
 115
 116                     Potential fields:
 117                     * url        The mandatory URL representing the media:
 118                                    for plain file media - HTTP URL of this file,
 119                                    for RTMP - RTMP URL,
 120                                    for HLS - URL of the M3U8 media playlist,
 121                                    for HDS - URL of the F4M manifest,
 122                                    for DASH
 123                                      - HTTP URL to plain file media (in case of
 124                                        unfragmented media)
 125                                      - URL of the MPD manifest or base URL
 126                                        representing the media if MPD manifest
 127                                        is parsed from a string (in case of
 128                                        fragmented media)
 129                                    for MSS - URL of the ISM manifest.
 130                     * manifest_url
 131                                  The URL of the manifest file in case of
 132                                  fragmented media:
 133                                    for HLS - URL of the M3U8 master playlist,
 134                                    for HDS - URL of the F4M manifest,
 135                                    for DASH - URL of the MPD manifest,
 136                                    for MSS - URL of the ISM manifest.
 137                     * manifest_stream_number  (For internal use only)
 138                                  The index of the stream in the manifest file
 139                     * ext        Will be calculated from URL if missing
 140                     * format     A human-readable description of the format
 141                                  ("mp4 container with h264/opus").
  142                                  Calculated from the format_id, width, height,
 143                                  and format_note fields if missing.
 144                     * format_id  A short description of the format
 145                                  ("mp4_h264_opus" or "19").
 146                                 Technically optional, but strongly recommended.
 147                     * format_note Additional info about the format
 148                                  ("3D" or "DASH video")
 149                     * width      Width of the video, if known
 150                     * height     Height of the video, if known
 151                     * resolution Textual description of width and height
 152                     * dynamic_range The dynamic range of the video. One of:
  153                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 154                     * tbr        Average bitrate of audio and video in KBit/s
 155                     * abr        Average audio bitrate in KBit/s
 156                     * acodec     Name of the audio codec in use
 157                     * asr        Audio sampling rate in Hertz
 158                     * vbr        Average video bitrate in KBit/s
 159                     * fps        Frame rate
 160                     * vcodec     Name of the video codec in use
 161                     * container  Name of the container format
 162                     * filesize   The number of bytes, if known in advance
 163                     * filesize_approx  An estimate for the number of bytes
 164                     * player_url SWF Player URL (used for rtmpdump).
 165                     * protocol   The protocol that will be used for the actual
 166                                  download, lower-case. One of "http", "https" or
 167                                  one of the protocols defined in downloader.PROTOCOL_MAP
 168                     * fragment_base_url
 169                                  Base URL for fragments. Each fragment's path
 170                                  value (if present) will be relative to
 171                                  this URL.
 172                     * fragments  A list of fragments of a fragmented media.
 173                                  Each fragment entry must contain either an url
 174                                  or a path. If an url is present it should be
 175                                  considered by a client. Otherwise both path and
 176                                  fragment_base_url must be present. Here is
 177                                  the list of all potential fields:
 178                                  * "url" - fragment's URL
 179                                  * "path" - fragment's path relative to
 180                                             fragment_base_url
 181                                  * "duration" (optional, int or float)
 182                                  * "filesize" (optional, int)
 183                     * is_from_start  Is a live format that can be downloaded
 184                                 from the start. Boolean
 185                     * preference Order number of this format. If this field is
 186                                  present and not None, the formats get sorted
 187                                  by this field, regardless of all other values.
 188                                  -1 for default (order by other properties),
 189                                  -2 or smaller for less than default.
 190                                  < -1000 to hide the format (if there is
 191                                     another one which is strictly better)
 192                     * language   Language code, e.g. "de" or "en-US".
 193                     * language_preference  Is this in the language mentioned in
 194                                  the URL?
 195                                  10 if it's what the URL is about,
 196                                  -1 for default (don't know),
 197                                  -10 otherwise, other values reserved for now.
 198                     * quality    Order number of the video quality of this
 199                                  format, irrespective of the file format.
 200                                  -1 for default (order by other properties),
 201                                  -2 or smaller for less than default.
 202                     * source_preference  Order number for this video source
 203                                   (quality takes higher priority)
 204                                  -1 for default (order by other properties),
 205                                  -2 or smaller for less than default.
 206                     * http_headers  A dictionary of additional HTTP headers
 207                                  to add to the request.
 208                     * stretched_ratio  If given and not 1, indicates that the
 209                                  video's pixels are not square.
 210                                  width : height ratio as float.
 211                     * no_resume  The server does not support resuming the
 212                                  (HTTP or RTMP) download. Boolean.
 213                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 214                     * downloader_options  A dictionary of downloader options
 215                                  (For internal use only)
 216                                  * http_chunk_size Chunk size for HTTP downloads
 217                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 218                     RTMP formats can also have the additional fields: page_url,
 219                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 220                     rtmp_protocol, rtmp_real_time
 221
 222     url:            Final video URL.
 223     ext:            Video filename extension.
 224     format:         The video format, defaults to ext (used for --get-format)
 225     player_url:     SWF Player URL (used for rtmpdump).
 226
 227     The following fields are optional:
 228
 229     direct:         True if a direct video file was given (must only be set by GenericIE)
 230     alt_title:      A secondary title of the video.
  231     display_id:     An alternative identifier for the video, not necessarily
 232                     unique, but available before title. Typically, id is
 233                     something like "4234987", title "Dancing naked mole rats",
 234                     and display_id "dancing-naked-mole-rats"
 235     thumbnails:     A list of dictionaries, with the following entries:
 236                         * "id" (optional, string) - Thumbnail format ID
 237                         * "url"
 238                         * "preference" (optional, int) - quality of the image
 239                         * "width" (optional, int)
 240                         * "height" (optional, int)
 241                         * "resolution" (optional, string "{width}x{height}",
 242                                         deprecated)
 243                         * "filesize" (optional, int)
 244                         * "http_headers" (dict) - HTTP headers for the request
 245     thumbnail:      Full URL to a video thumbnail image.
 246     description:    Full video description.
 247     uploader:       Full name of the video uploader.
 248     license:        License name the video is licensed under.
 249     creator:        The creator of the video.
 250     timestamp:      UNIX timestamp of the moment the video was uploaded
 251     upload_date:    Video upload date in UTC (YYYYMMDD).
 252                     If not explicitly set, calculated from timestamp
 253     release_timestamp: UNIX timestamp of the moment the video was released.
 254                     If it is not clear whether to use timestamp or this, use the former
 255     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 256                     If not explicitly set, calculated from release_timestamp
 257     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 258     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 259                     If not explicitly set, calculated from modified_timestamp
 260     uploader_id:    Nickname or id of the video uploader.
 261     uploader_url:   Full URL to a personal webpage of the video uploader.
 262     channel:        Full name of the channel the video is uploaded on.
 263                     Note that channel fields may or may not repeat uploader
 264                     fields. This depends on a particular extractor.
 265     channel_id:     Id of the channel.
 266     channel_url:    Full URL to a channel webpage.
 267     channel_follower_count: Number of followers of the channel.
 268     location:       Physical location where the video was filmed.
 269     subtitles:      The available subtitles as a dictionary in the format
 270                     {tag: subformats}. "tag" is usually a language code, and
 271                     "subformats" is a list sorted from lower to higher
 272                     preference, each element is a dictionary with the "ext"
 273                     entry and one of:
 274                         * "data": The subtitles file contents
 275                         * "url": A URL pointing to the subtitles file
 276                     It can optionally also have:
 277                         * "name": Name or description of the subtitles
 278                         * "http_headers": A dictionary of additional HTTP headers
 279                                   to add to the request.
 280                     "ext" will be calculated from URL if missing
 281     automatic_captions: Like 'subtitles'; contains automatically generated
 282                     captions instead of normal subtitles
 283     duration:       Length of the video in seconds, as an integer or float.
 284     view_count:     How many users have watched the video on the platform.
 285     like_count:     Number of positive ratings of the video
 286     dislike_count:  Number of negative ratings of the video
 287     repost_count:   Number of reposts of the video
  288     average_rating: Average rating given by users, the scale used depends on the webpage
 289     comment_count:  Number of comments on the video
 290     comments:       A list of comments, each with one or more of the following
 291                     properties (all but one of text or html optional):
 292                         * "author" - human-readable name of the comment author
 293                         * "author_id" - user ID of the comment author
 294                         * "author_thumbnail" - The thumbnail of the comment author
 295                         * "id" - Comment ID
 296                         * "html" - Comment as HTML
 297                         * "text" - Plain text of the comment
 298                         * "timestamp" - UNIX timestamp of comment
 299                         * "parent" - ID of the comment this one is replying to.
 300                                      Set to "root" to indicate that this is a
 301                                      comment to the original video.
 302                         * "like_count" - Number of positive ratings of the comment
 303                         * "dislike_count" - Number of negative ratings of the comment
 304                         * "is_favorited" - Whether the comment is marked as
 305                                            favorite by the video uploader
 306                         * "author_is_uploader" - Whether the comment is made by
 307                                                  the video uploader
 308     age_limit:      Age restriction for the video, as an integer (years)
 309     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 310                     should allow to get the same result again. (It will be set
 311                     by YoutubeDL if it's missing)
 312     categories:     A list of categories that the video falls in, for example
 313                     ["Sports", "Berlin"]
 314     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 315     cast:           A list of the video cast
 316     is_live:        True, False, or None (=unknown). Whether this video is a
 317                     live stream that goes on instead of a fixed-length video.
 318     was_live:       True, False, or None (=unknown). Whether this video was
 319                     originally a live stream.
 320     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 321                     If absent, automatically set from is_live, was_live
 322     start_time:     Time in seconds where the reproduction should start, as
 323                     specified in the URL.
 324     end_time:       Time in seconds where the reproduction should end, as
 325                     specified in the URL.
 326     chapters:       A list of dictionaries, with the following entries:
 327                         * "start_time" - The start time of the chapter in seconds
 328                         * "end_time" - The end time of the chapter in seconds
 329                         * "title" (optional, string)
 330     playable_in_embed: Whether this video is allowed to play in embedded
 331                     players on other sites. Can be True (=always allowed),
 332                     False (=never allowed), None (=unknown), or a string
  333                     specifying the criteria for embeddability (e.g. 'whitelist')
 334     availability:   Under what condition the video is available. One of
 335                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 336                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 337                     to set it
 338     __post_extractor: A function to be called just before the metadata is
 339                     written to either disk, logger or console. The function
 340                     must return a dict which will be added to the info_dict.
  341                     This is useful for additional information that is
 342                     time-consuming to extract. Note that the fields thus
 343                     extracted will not be available to output template and
 344                     match_filter. So, only "comments" and "comment_count" are
 345                     currently allowed to be extracted via this method.
 346
 347     The following fields should only be used when the video belongs to some logical
 348     chapter or section:
 349
 350     chapter:        Name or title of the chapter the video belongs to.
 351     chapter_number: Number of the chapter the video belongs to, as an integer.
 352     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 353
 354     The following fields should only be used when the video is an episode of some
 355     series, programme or podcast:
 356
 357     series:         Title of the series or programme the video episode belongs to.
 358     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 359     season:         Title of the season the video episode belongs to.
 360     season_number:  Number of the season the video episode belongs to, as an integer.
 361     season_id:      Id of the season the video episode belongs to, as a unicode string.
 362     episode:        Title of the video episode. Unlike mandatory video title field,
 363                     this field should denote the exact title of the video episode
 364                     without any kind of decoration.
 365     episode_number: Number of the video episode within a season, as an integer.
 366     episode_id:     Id of the video episode, as a unicode string.
 367
 368     The following fields should only be used when the media is a track or a part of
 369     a music album:
 370
 371     track:          Title of the track.
 372     track_number:   Number of the track within an album or a disc, as an integer.
 373     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 374                     as a unicode string.
 375     artist:         Artist(s) of the track.
 376     genre:          Genre(s) of the track.
 377     album:          Title of the album the track belongs to.
 378     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 379     album_artist:   List of all artists appeared on the album (e.g.
 380                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 381                     and compilations).
 382     disc_number:    Number of the disc or other physical medium the track belongs to,
 383                     as an integer.
 384     release_year:   Year (YYYY) when the album was released.
 385     composer:       Composer of the piece
 386
 387     Unless mentioned otherwise, the fields should be Unicode strings.
 388
 389     Unless mentioned otherwise, None is equivalent to absence of information.
 390
 391
 392     _type "playlist" indicates multiple videos.
 393     There must be a key "entries", which is a list, an iterable, or a PagedList
 394     object, each element of which is a valid dictionary by this specification.
 395
  396     Additionally, playlists can have "id", "title", and any other relevant
 397     attributes with the same semantics as videos (see above).
 398
 399     It can also have the following optional fields:
 400
 401     playlist_count: The total number of videos in a playlist. If not given,
 402                     YoutubeDL tries to calculate it from "entries"
 403
 404
 405     _type "multi_video" indicates that there are multiple videos that
  406     form a single show, for example multiple acts of an opera or TV episode.
 407     It must have an entries key like a playlist and contain all the keys
 408     required for a video at the same time.
 409
 410
 411     _type "url" indicates that the video must be extracted from another
 412     location, possibly by a different extractor. Its only required key is:
 413     "url" - the next URL to extract.
 414     The key "ie_key" can be set to the class name (minus the trailing "IE",
 415     e.g. "Youtube") if the extractor class is known in advance.
 416     Additionally, the dictionary may have any properties of the resolved entity
 417     known in advance, for example "title" if the title of the referred video is
 418     known ahead of time.
 419
 420
 421     _type "url_transparent" entities have the same specification as "url", but
 422     indicate that the given additional information is more precise than the one
 423     associated with the resolved URL.
 424     This is useful when a site employs a video service that hosts the video and
 425     its technical metadata, but that video service does not embed a useful
 426     title, description etc.
 427
 428
  429     Subclasses of this should define a _VALID_URL regexp and re-define the
 430     _real_extract() and (optionally) _real_initialize() methods.
 431     Probably, they should also be added to the list of extractors.
 432
 433     Subclasses may also override suitable() if necessary, but ensure the function
 434     signature is preserved and that this function imports everything it needs
 435     (except other extractors), so that lazy_extractors works correctly.
 436
 437     To support username + password (or netrc) login, the extractor must define a
 438     _NETRC_MACHINE and re-define _perform_login(username, password) and
 439     (optionally) _initialize_pre_login() methods. The _perform_login method will
 440     be called between _initialize_pre_login and _real_initialize if credentials
 441     are passed by the user. In cases where it is necessary to have the login
 442     process as part of the extraction rather than initialization, _perform_login
 443     can be left undefined.
 444
 445     _GEO_BYPASS attribute may be set to False in order to disable
 446     geo restriction bypass mechanisms for a particular extractor.
 447     Though it won't disable explicit geo restriction bypass based on
 448     country code provided with geo_bypass_country.
 449
 450     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 451     countries for this extractor. One of these countries will be used by
 452     geo restriction bypass mechanism right away in order to bypass
 453     geo restriction, of course, if the mechanism is not disabled.
 454
 455     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 456     IP blocks in CIDR notation for this extractor. One of these IP blocks
 457     will be used by geo restriction bypass mechanism similarly
 458     to _GEO_COUNTRIES.
 459
 460     The _WORKING attribute should be set to False for broken IEs
 461     in order to warn the users and skip the tests.
 462     """
 463
    # Set to True at the end of initialize(); while it is True, initialize()
    # skips the pre-login, login and _real_initialize() steps
    _ready = False
    # NOTE(review): presumably holds the YoutubeDL instance stored by
    # set_downloader(), which is not visible in this part of the file - confirm
    _downloader = None
    # Fake IP picked by _initialize_geo_bypass() for use as X-Forwarded-For
    _x_forwarded_for_ip = None
    # Geo restriction bypass settings, described in the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken extractors (see the class docstring)
    _WORKING = True
    # Machine name shown for --netrc; a truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    # initialize() does not warn about unsupported password login when this is False
    IE_DESC = None
    # NOTE(review): not referenced by the code visible in this part of the file
    SEARCH_KEY = None
 474
 475     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 476         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 477         return {
 478             None: '',
 479             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 480             'password': f'Use {password_hint}',
 481             'cookies': (
 482                 'Use --cookies-from-browser or --cookies for the authentication. '
 483                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 484         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 485
 486     def __init__(self, downloader=None):
 487         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 488         If a downloader is not passed during initialization,
 489         it must be set using "set_downloader()" before "extract()" is called"""
 490         self._ready = False
 491         self._x_forwarded_for_ip = None
 492         self._printed_messages = set()
 493         self.set_downloader(downloader)
 494
 495     @classmethod
 496     def _match_valid_url(cls, url):
 497         # This does not use has/getattr intentionally - we want to know whether
 498         # we have cached the regexp for *this* class, whereas getattr would also
 499         # match the superclass
 500         if '_VALID_URL_RE' not in cls.__dict__:
 501             if '_VALID_URL' not in cls.__dict__:
 502                 cls._VALID_URL = cls._make_valid_url()
 503             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 504         return cls._VALID_URL_RE.match(url)
 505
 506     @classmethod
 507     def suitable(cls, url):
 508         """Receives a URL and returns True if suitable for this IE."""
 509         # This function must import everything it needs (except other extractors),
 510         # so that lazy_extractors works correctly
 511         return cls._match_valid_url(url) is not None
 512
 513     @classmethod
 514     def _match_id(cls, url):
 515         return cls._match_valid_url(url).group('id')
 516
 517     @classmethod
 518     def get_temp_id(cls, url):
 519         try:
 520             return cls._match_id(url)
 521         except (IndexError, AttributeError):
 522             return None
 523
 524     @classmethod
 525     def working(cls):
 526         """Getter method for _WORKING."""
 527         return cls._WORKING
 528
 529     @classmethod
 530     def supports_login(cls):
 531         return bool(cls._NETRC_MACHINE)
 532
 533     def initialize(self):
 534         """Initializes an instance (authentication, etc)."""
 535         self._printed_messages = set()
 536         self._initialize_geo_bypass({
 537             'countries': self._GEO_COUNTRIES,
 538             'ip_blocks': self._GEO_IP_BLOCKS,
 539         })
 540         if not self._ready:
 541             self._initialize_pre_login()
 542             if self.supports_login():
 543                 username, password = self._get_login_info()
 544                 if username:
 545                     self._perform_login(username, password)
 546             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 547                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 548             self._real_initialize()
 549             self._ready = True
 550
 551     def _initialize_geo_bypass(self, geo_bypass_context):
 552         """
 553         Initialize geo restriction bypass mechanism.
 554
 555         This method is used to initialize geo bypass mechanism based on faking
 556         X-Forwarded-For HTTP header. A random country from provided country list
 557         is selected and a random IP belonging to this country is generated. This
 558         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 559         HTTP requests.
 560
 561         This method will be used for initial geo bypass mechanism initialization
 562         during the instance initialization with _GEO_COUNTRIES and
 563         _GEO_IP_BLOCKS.
 564
 565         You may also manually call it from extractor's code if geo bypass
 566         information is not available beforehand (e.g. obtained during
 567         extraction) or due to some other reason. In this case you should pass
 568         this information in geo bypass context passed as first argument. It may
 569         contain following fields:
 570
 571         countries:  List of geo unrestricted countries (similar
 572                     to _GEO_COUNTRIES)
 573         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 574                     (similar to _GEO_IP_BLOCKS)
 575
 576         """
 577         if not self._x_forwarded_for_ip:
 578
 579             # Geo bypass mechanism is explicitly disabled by user
 580             if not self.get_param('geo_bypass', True):
 581                 return
 582
 583             if not geo_bypass_context:
 584                 geo_bypass_context = {}
 585
 586             # Backward compatibility: previously _initialize_geo_bypass
 587             # expected a list of countries, some 3rd party code may still use
 588             # it this way
 589             if isinstance(geo_bypass_context, (list, tuple)):
 590                 geo_bypass_context = {
 591                     'countries': geo_bypass_context,
 592                 }
 593
 594             # The whole point of geo bypass mechanism is to fake IP
 595             # as X-Forwarded-For HTTP header based on some IP block or
 596             # country code.
 597
 598             # Path 1: bypassing based on IP block in CIDR notation
 599
 600             # Explicit IP block specified by user, use it right away
 601             # regardless of whether extractor is geo bypassable or not
 602             ip_block = self.get_param('geo_bypass_ip_block', None)
 603
 604             # Otherwise use random IP block from geo bypass context but only
 605             # if extractor is known as geo bypassable
 606             if not ip_block:
 607                 ip_blocks = geo_bypass_context.get('ip_blocks')
 608                 if self._GEO_BYPASS and ip_blocks:
 609                     ip_block = random.choice(ip_blocks)
 610
 611             if ip_block:
 612                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 613                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 614                 return
 615
 616             # Path 2: bypassing based on country code
 617
 618             # Explicit country code specified by user, use it right away
 619             # regardless of whether extractor is geo bypassable or not
 620             country = self.get_param('geo_bypass_country', None)
 621
 622             # Otherwise use random country code from geo bypass context but
 623             # only if extractor is known as geo bypassable
 624             if not country:
 625                 countries = geo_bypass_context.get('countries')
 626                 if self._GEO_BYPASS and countries:
 627                     country = random.choice(countries)
 628
 629             if country:
 630                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 631                 self._downloader.write_debug(
 632                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 633
    def extract(self, url):
        """
        Run the extractor on url and return the result of _real_extract.

        Returns None when _real_extract returns None. ExtractorError raised
        during extraction is re-raised as the same type, enriched with the
        video id, extractor name and traceback; IncompleteRead, KeyError and
        StopIteration are wrapped into ExtractorError.
        """
        try:
            # At most two attempts: the second one only happens when the first
            # hit a geo restriction and a fake IP could be selected for a retry
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    # Record the fake IP that was in use in the result
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    # Drop live chat "subtitles" when the no-live-chat
                    # compatibility option is set
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            # Must be listed before ExtractorError so that it is passed
            # through untouched instead of being rebuilt below
            raise
        except ExtractorError as e:
            # Rebuild the error with the same type and original message, filling
            # in video id, extractor name and traceback when they are missing
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            # Only errors that carry a countries attribute get it forwarded
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 672
 673     def __maybe_fake_ip_and_retry(self, countries):
 674         if (not self.get_param('geo_bypass_country', None)
 675                 and self._GEO_BYPASS
 676                 and self.get_param('geo_bypass', True)
 677                 and not self._x_forwarded_for_ip
 678                 and countries):
 679             country_code = random.choice(countries)
 680             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 681             if self._x_forwarded_for_ip:
 682                 self.report_warning(
 683                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 684                     % (self._x_forwarded_for_ip, country_code.upper()))
 685                 return True
 686         return False
 687
 688     def set_downloader(self, downloader):
 689         """Sets a YoutubeDL instance as the downloader for this IE."""
 690         self._downloader = downloader
 691
 692     def _initialize_pre_login(self):
 693         """ Intialization before login. Redefine in subclasses."""
 694         pass
 695
 696     def _perform_login(self, username, password):
 697         """ Login with username and password. Redefine in subclasses."""
 698         pass
 699
 700     def _real_initialize(self):
 701         """Real initialization process. Redefine in subclasses."""
 702         pass
 703
 704     def _real_extract(self, url):
 705         """Real extraction process. Redefine in subclasses."""
 706         raise NotImplementedError('This method must be implemented by subclasses')
 707
 708     @classmethod
 709     def ie_key(cls):
 710         """A string for getting the InfoExtractor with get_info_extractor"""
 711         return cls.__name__[:-2]
 712
 713     @classproperty
 714     def IE_NAME(cls):
 715         return cls.__name__[:-2]
 716
 717     @staticmethod
 718     def __can_accept_status_code(err, expected_status):
 719         assert isinstance(err, compat_urllib_error.HTTPError)
 720         if expected_status is None:
 721             return False
 722         elif callable(expected_status):
 723             return expected_status(err.code) is True
 724         else:
 725             return err.code in variadic(expected_status)
 726
 727     def _create_request(self, url_or_request, data=None, headers={}, query={}):
 728         if isinstance(url_or_request, compat_urllib_request.Request):
 729             return update_Request(url_or_request, data=data, headers=headers, query=query)
 730         if query:
 731             url_or_request = update_url_query(url_or_request, query)
 732         return sanitized_Request(url_or_request, data, headers)
 733
 734     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 735         """
 736         Return the response handle.
 737
 738         See _download_webpage docstring for arguments specification.
 739         """
 740         if not self._downloader._first_webpage_request:
 741             sleep_interval = self.get_param('sleep_interval_requests') or 0
 742             if sleep_interval > 0:
 743                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 744                 time.sleep(sleep_interval)
 745         else:
 746             self._downloader._first_webpage_request = False
 747
 748         if note is None:
 749             self.report_download_webpage(video_id)
 750         elif note is not False:
 751             if video_id is None:
 752                 self.to_screen(str(note))
 753             else:
 754                 self.to_screen(f'{video_id}: {note}')
 755
 756         # Some sites check X-Forwarded-For HTTP header in order to figure out
 757         # the origin of the client behind proxy. This allows bypassing geo
 758         # restriction by faking this header's value to IP that belongs to some
 759         # geo unrestricted country. We will do so once we encounter any
 760         # geo restriction error.
 761         if self._x_forwarded_for_ip:
 762             if 'X-Forwarded-For' not in headers:
 763                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 764
 765         try:
 766             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 767         except network_exceptions as err:
 768             if isinstance(err, compat_urllib_error.HTTPError):
 769                 if self.__can_accept_status_code(err, expected_status):
 770                     # Retain reference to error to prevent file object from
 771                     # being closed before it can be read. Works around the
 772                     # effects of <https://bugs.python.org/issue15002>
 773                     # introduced in Python 3.4.1.
 774                     err.fp._error = err
 775                     return err.fp
 776
 777             if errnote is False:
 778                 return False
 779             if errnote is None:
 780                 errnote = 'Unable to download webpage'
 781
 782             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 783             if fatal:
 784                 raise ExtractorError(errmsg, cause=err)
 785             else:
 786                 self.report_warning(errmsg)
 787                 return False
 788
 789     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 790                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 791         """
 792         Return a tuple (page content as string, URL handle).
 793
 794         Arguments:
 795         url_or_request -- plain text URL as a string or
 796             a compat_urllib_request.Requestobject
 797         video_id -- Video/playlist/item identifier (string)
 798
 799         Keyword arguments:
 800         note -- note printed before downloading (string)
 801         errnote -- note printed in case of an error (string)
 802         fatal -- flag denoting whether error should be considered fatal,
 803             i.e. whether it should cause ExtractionError to be raised,
 804             otherwise a warning will be reported and extraction continued
 805         encoding -- encoding for a page content decoding, guessed automatically
 806             when not explicitly specified
 807         data -- POST data (bytes)
 808         headers -- HTTP headers (dict)
 809         query -- URL query (dict)
 810         expected_status -- allows to accept failed HTTP requests (non 2xx
 811             status code) by explicitly specifying a set of accepted status
 812             codes. Can be any of the following entities:
 813                 - an integer type specifying an exact failed status code to
 814                   accept
 815                 - a list or a tuple of integer types specifying a list of
 816                   failed status codes to accept
 817                 - a callable accepting an actual failed status code and
 818                   returning True if it should be accepted
 819             Note that this argument does not affect success status codes (2xx)
 820             which are always accepted.
 821         """
 822
 823         # Strip hashes from the URL (#1038)
 824         if isinstance(url_or_request, (compat_str, str)):
 825             url_or_request = url_or_request.partition('#')[0]
 826
 827         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 828         if urlh is False:
 829             assert not fatal
 830             return False
 831         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 832         return (content, urlh)
 833
 834     @staticmethod
 835     def _guess_encoding_from_content(content_type, webpage_bytes):
 836         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 837         if m:
 838             encoding = m.group(1)
 839         else:
 840             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 841                           webpage_bytes[:1024])
 842             if m:
 843                 encoding = m.group(1).decode('ascii')
 844             elif webpage_bytes.startswith(b'\xff\xfe'):
 845                 encoding = 'utf-16'
 846             else:
 847                 encoding = 'utf-8'
 848
 849         return encoding
 850
 851     def __check_blocked(self, content):
 852         first_block = content[:512]
 853         if ('<title>Access to this site is blocked</title>' in content
 854                 and 'Websense' in first_block):
 855             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 856             blocked_iframe = self._html_search_regex(
 857                 r'<iframe src="([^"]+)"', content,
 858                 'Websense information URL', default=None)
 859             if blocked_iframe:
 860                 msg += ' Visit %s for more details' % blocked_iframe
 861             raise ExtractorError(msg, expected=True)
 862         if '<title>The URL you requested has been blocked</title>' in first_block:
 863             msg = (
 864                 'Access to this webpage has been blocked by Indian censorship. '
 865                 'Use a VPN or proxy server (with --proxy) to route around it.')
 866             block_msg = self._html_search_regex(
 867                 r'</h1><p>(.*?)</p>',
 868                 content, 'block message', default=None)
 869             if block_msg:
 870                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 871             raise ExtractorError(msg, expected=True)
 872         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 873                 and 'blocklist.rkn.gov.ru' in content):
 874             raise ExtractorError(
 875                 'Access to this webpage has been blocked by decision of the Russian government. '
 876                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 877                 expected=True)
 878
 879     def _request_dump_filename(self, url, video_id):
 880         basen = f'{video_id}_{url}'
 881         trim_length = self.get_param('trim_file_name') or 240
 882         if len(basen) > trim_length:
 883             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 884             basen = basen[:trim_length - len(h)] + h
 885         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 886         # Working around MAX_PATH limitation on Windows (see
 887         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 888         if compat_os_name == 'nt':
 889             absfilepath = os.path.abspath(filename)
 890             if len(absfilepath) > 259:
 891                 filename = fR'\\?\{absfilepath}'
 892         return filename
 893
 894     def __decode_webpage(self, webpage_bytes, encoding, headers):
 895         if not encoding:
 896             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 897         try:
 898             return webpage_bytes.decode(encoding, 'replace')
 899         except LookupError:
 900             return webpage_bytes.decode('utf-8', 'replace')
 901
 902     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 903         webpage_bytes = urlh.read()
 904         if prefix is not None:
 905             webpage_bytes = prefix + webpage_bytes
 906         if self.get_param('dump_intermediate_pages', False):
 907             self.to_screen('Dumping request to ' + urlh.geturl())
 908             dump = base64.b64encode(webpage_bytes).decode('ascii')
 909             self._downloader.to_screen(dump)
 910         if self.get_param('write_pages'):
 911             filename = self._request_dump_filename(video_id, urlh.geturl())
 912             self.to_screen(f'Saving request to {filename}')
 913             with open(filename, 'wb') as outf:
 914                 outf.write(webpage_bytes)
 915
 916         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 917         self.__check_blocked(content)
 918
 919         return content
 920
 921     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 922         if transform_source:
 923             xml_string = transform_source(xml_string)
 924         try:
 925             return compat_etree_fromstring(xml_string.encode('utf-8'))
 926         except xml.etree.ElementTree.ParseError as ve:
 927             errmsg = '%s: Failed to parse XML ' % video_id
 928             if fatal:
 929                 raise ExtractorError(errmsg, cause=ve)
 930             else:
 931                 self.report_warning(errmsg + str(ve))
 932
 933     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
 934         if transform_source:
 935             json_string = transform_source(json_string)
 936         try:
 937             try:
 938                 return json.loads(json_string, strict=False)
 939             except json.JSONDecodeError as e:
 940                 if not lenient:
 941                     raise
 942                 try:
 943                     return json.loads(json_string[:e.pos], strict=False)
 944                 except ValueError:
 945                     raise e
 946         except ValueError as ve:
 947             errmsg = f'{video_id}: Failed to parse JSON'
 948             if fatal:
 949                 raise ExtractorError(errmsg, cause=ve)
 950             else:
 951                 self.report_warning(errmsg + str(ve))
 952
 953     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 954         return self._parse_json(
 955             data[data.find('{'):data.rfind('}') + 1],
 956             video_id, transform_source, fatal)
 957
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build the pair of methods (_download_<name>_handle, _download_<name>).

        This is a plain function executed while the class body is evaluated
        (it takes no self); its results are assigned to class attributes.

        @param name             Suffix used in the generated method names
        @param parser           Name of the method used to parse the downloaded
                                content, or None to return the content as is
        @param note             Default note of the generated methods
        @param errnote          Default errnote of the generated methods
        @param return_value     Description of the return value, used in the
                                generated docstrings
        @returns                Tuple (download_handle, download_content)
        """

        def parse(ie, content, *args, **kwargs):
            """Run the configured parser on content (no-op when parser is None)"""
            if parser is None:
                return content
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Becomes _download_<name>_handle: returns (parsed content, URL handle),
        # or False when the underlying download returned False
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal), urlh

        # Becomes _download_<name>: returns only the parsed content (or False)
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, first try to serve the request from
            # a previously written dump file instead of the network
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # Dump could not be read: warn and fall through to a real download
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source, fatal)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method being called is
            # _download_webpage_handle itself, which takes no transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            """Give func its final name, qualified name and docstring"""
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        # Renaming must happen before the functions are used: download_content
        # looks the handle method up through download_handle.__name__
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1019
    # Generated download methods. Each call yields a
    # (_download_<name>_handle, _download_<name>) pair using the named parser
    # method and the given default note/errnote.
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For plain webpages only the content-returning method is kept (index 1),
    # under a private name
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1027
1028     def _download_webpage(
1029             self, url_or_request, video_id, note=None, errnote=None,
1030             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1031         """
1032         Return the data of the page as a string.
1033
1034         Keyword arguments:
1035         tries -- number of tries
1036         timeout -- sleep interval between tries
1037
1038         See _download_webpage_handle docstring for other arguments specification.
1039         """
1040
1041         R''' # NB: These are unused; should they be deprecated?
1042         if tries != 1:
1043             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1044         if timeout is NO_DEFAULT:
1045             timeout = 5
1046         else:
1047             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1048         '''
1049
1050         try_count = 0
1051         while True:
1052             try:
1053                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1054             except compat_http_client.IncompleteRead as e:
1055                 try_count += 1
1056                 if try_count >= tries:
1057                     raise e
1058                 self._sleep(timeout, video_id)
1059
1060     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1061         idstr = format_field(video_id, template='%s: ')
1062         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1063         if only_once:
1064             if f'WARNING: {msg}' in self._printed_messages:
1065                 return
1066             self._printed_messages.add(f'WARNING: {msg}')
1067         self._downloader.report_warning(msg, *args, **kwargs)
1068
1069     def to_screen(self, msg, *args, **kwargs):
1070         """Print msg to screen, prefixing it with '[ie_name]'"""
1071         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1072
1073     def write_debug(self, msg, *args, **kwargs):
1074         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1075
1076     def get_param(self, name, default=None, *args, **kwargs):
1077         if self._downloader:
1078             return self._downloader.params.get(name, default, *args, **kwargs)
1079         return default
1080
1081     def report_drm(self, video_id, partial=False):
1082         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1083
1084     def report_extraction(self, id_or_name):
1085         """Report information extraction."""
1086         self.to_screen('%s: Extracting information' % id_or_name)
1087
1088     def report_download_webpage(self, video_id):
1089         """Report webpage download."""
1090         self.to_screen('%s: Downloading webpage' % video_id)
1091
1092     def report_age_confirmation(self):
1093         """Report attempt to confirm age."""
1094         self.to_screen('Confirming age')
1095
1096     def report_login(self):
1097         """Report attempt to log in."""
1098         self.to_screen('Logging in')
1099
1100     def raise_login_required(
1101             self, msg='This video is only available for registered users',
1102             metadata_available=False, method=NO_DEFAULT):
1103         if metadata_available and (
1104                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1105             self.report_warning(msg)
1106             return
1107         msg += format_field(self._login_hint(method), template='. %s')
1108         raise ExtractorError(msg, expected=True)
1109
1110     def raise_geo_restricted(
1111             self, msg='This video is not available from your location due to geo restriction',
1112             countries=None, metadata_available=False):
1113         if metadata_available and (
1114                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1115             self.report_warning(msg)
1116         else:
1117             raise GeoRestrictedError(msg, countries=countries)
1118
1119     def raise_no_formats(self, msg, expected=False, video_id=None):
1120         if expected and (
1121                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1122             self.report_warning(msg, video_id)
1123         elif isinstance(msg, ExtractorError):
1124             raise msg
1125         else:
1126             raise ExtractorError(msg, expected=expected, video_id=video_id)
1127
1128     # Methods for following #608
1129     @staticmethod
1130     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1131         """Returns a URL that points to a page that should be processed"""
1132         if ie is not None:
1133             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1134         if video_id is not None:
1135             kwargs['id'] = video_id
1136         if video_title is not None:
1137             kwargs['title'] = video_title
1138         return {
1139             **kwargs,
1140             '_type': 'url_transparent' if url_transparent else 'url',
1141             'url': url,
1142         }
1143
1144     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1145         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1146                 for m in orderedSet(map(getter, matches) if getter else matches))
1147         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1148
1149     @staticmethod
1150     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1151         """Returns a playlist"""
1152         if playlist_id:
1153             kwargs['id'] = playlist_id
1154         if playlist_title:
1155             kwargs['title'] = playlist_title
1156         if playlist_description is not None:
1157             kwargs['description'] = playlist_description
1158         return {
1159             **kwargs,
1160             '_type': 'multi_video' if multi_video else 'playlist',
1161             'entries': entries,
1162         }
1163
1164     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1165         """
1166         Perform a regex search on the given string, using a single or a list of
1167         patterns returning the first matching group.
1168         In case of failure return a default value or raise a WARNING or a
1169         RegexNotFoundError, depending on fatal, specifying the field name.
1170         """
1171         if string is None:
1172             mobj = None
1173         elif isinstance(pattern, (str, re.Pattern)):
1174             mobj = re.search(pattern, string, flags)
1175         else:
1176             for p in pattern:
1177                 mobj = re.search(p, string, flags)
1178                 if mobj:
1179                     break
1180
1181         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1182
1183         if mobj:
1184             if group is None:
1185                 # return the first matching group
1186                 return next(g for g in mobj.groups() if g is not None)
1187             elif isinstance(group, (list, tuple)):
1188                 return tuple(mobj.group(g) for g in group)
1189             else:
1190                 return mobj.group(group)
1191         elif default is not NO_DEFAULT:
1192             return default
1193         elif fatal:
1194             raise RegexNotFoundError('Unable to extract %s' % _name)
1195         else:
1196             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1197             return None
1198
1199     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1200         """
1201         Like _search_regex, but strips HTML tags and unescapes entities.
1202         """
1203         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1204         if res:
1205             return clean_html(res).strip()
1206         else:
1207             return res
1208
1209     def _get_netrc_login_info(self, netrc_machine=None):
1210         username = None
1211         password = None
1212         netrc_machine = netrc_machine or self._NETRC_MACHINE
1213
1214         if self.get_param('usenetrc', False):
1215             try:
1216                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1217                 if os.path.isdir(netrc_file):
1218                     netrc_file = os.path.join(netrc_file, '.netrc')
1219                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1220                 if info is not None:
1221                     username = info[0]
1222                     password = info[2]
1223                 else:
1224                     raise netrc.NetrcParseError(
1225                         'No authenticators for %s' % netrc_machine)
1226             except (OSError, netrc.NetrcParseError) as err:
1227                 self.report_warning(
1228                     'parsing .netrc: %s' % error_to_compat_str(err))
1229
1230         return username, password
1231
1232     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1233         """
1234         Get the login info as (username, password)
1235         First look for the manually specified credentials using username_option
1236         and password_option as keys in params dictionary. If no such credentials
1237         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1238         value.
1239         If there's no info available, return (None, None)
1240         """
1241
1242         # Attempt to use provided username and password or .netrc data
1243         username = self.get_param(username_option)
1244         if username is not None:
1245             password = self.get_param(password_option)
1246         else:
1247             username, password = self._get_netrc_login_info(netrc_machine)
1248
1249         return username, password
1250
1251     def _get_tfa_info(self, note='two-factor verification code'):
1252         """
1253         Get the two-factor authentication info
1254         TODO - asking the user will be required for sms/phone verify
1255         currently just uses the command line option
1256         If there's no info available, return None
1257         """
1258
1259         tfa = self.get_param('twofactor')
1260         if tfa is not None:
1261             return tfa
1262
1263         return compat_getpass('Type %s and press [Return]: ' % note)
1264
1265     # Helper functions for extracting OpenGraph info
1266     @staticmethod
1267     def _og_regexes(prop):
1268         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1269         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1270                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1271         template = r'<meta[^>]+?%s[^>]+?%s'
1272         return [
1273             template % (property_re, content_re),
1274             template % (content_re, property_re),
1275         ]
1276
1277     @staticmethod
1278     def _meta_regex(prop):
1279         return r'''(?isx)<meta
1280                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1281                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1282
1283     def _og_search_property(self, prop, html, name=None, **kargs):
1284         prop = variadic(prop)
1285         if name is None:
1286             name = 'OpenGraph %s' % prop[0]
1287         og_regexes = []
1288         for p in prop:
1289             og_regexes.extend(self._og_regexes(p))
1290         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1291         if escaped is None:
1292             return None
1293         return unescapeHTML(escaped)
1294
1295     def _og_search_thumbnail(self, html, **kargs):
1296         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1297
1298     def _og_search_description(self, html, **kargs):
1299         return self._og_search_property('description', html, fatal=False, **kargs)
1300
1301     def _og_search_title(self, html, *, fatal=False, **kargs):
1302         return self._og_search_property('title', html, fatal=fatal, **kargs)
1303
1304     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1305         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1306         if secure:
1307             regexes = self._og_regexes('video:secure_url') + regexes
1308         return self._html_search_regex(regexes, html, name, **kargs)
1309
1310     def _og_search_url(self, html, **kargs):
1311         return self._og_search_property('url', html, **kargs)
1312
1313     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1314         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1315
1316     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1317         name = variadic(name)
1318         if display_name is None:
1319             display_name = name[0]
1320         return self._html_search_regex(
1321             [self._meta_regex(n) for n in name],
1322             html, display_name, fatal=fatal, group='content', **kwargs)
1323
1324     def _dc_search_uploader(self, html):
1325         return self._html_search_meta('dc.creator', html, 'uploader')
1326
1327     def _rta_search(self, html):
1328         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1329         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1330                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1331                      html):
1332             return 18
1333         return 0
1334
1335     def _media_rating_search(self, html):
1336         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1337         rating = self._html_search_meta('rating', html)
1338
1339         if not rating:
1340             return None
1341
1342         RATING_TABLE = {
1343             'safe for kids': 0,
1344             'general': 8,
1345             '14 years': 14,
1346             'mature': 17,
1347             'restricted': 19,
1348         }
1349         return RATING_TABLE.get(rating.lower())
1350
1351     def _family_friendly_search(self, html):
1352         # See http://schema.org/VideoObject
1353         family_friendly = self._html_search_meta(
1354             'isFamilyFriendly', html, default=None)
1355
1356         if not family_friendly:
1357             return None
1358
1359         RATING_TABLE = {
1360             '1': 0,
1361             'true': 0,
1362             '0': 18,
1363             'false': 18,
1364         }
1365         return RATING_TABLE.get(family_friendly.lower())
1366
1367     def _twitter_search_player(self, html):
1368         return self._html_search_meta('twitter:player', html,
1369                                       'twitter card player')
1370
1371     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1372         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1373         default = kwargs.get('default', NO_DEFAULT)
1374         # JSON-LD may be malformed and thus `fatal` should be respected.
1375         # At the same time `default` may be passed that assumes `fatal=False`
1376         # for _search_regex. Let's simulate the same behavior here as well.
1377         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1378         json_ld = []
1379         for mobj in json_ld_list:
1380             json_ld_item = self._parse_json(
1381                 mobj.group('json_ld'), video_id, fatal=fatal)
1382             if not json_ld_item:
1383                 continue
1384             if isinstance(json_ld_item, dict):
1385                 json_ld.append(json_ld_item)
1386             elif isinstance(json_ld_item, (list, tuple)):
1387                 json_ld.extend(json_ld_item)
1388         if json_ld:
1389             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1390         if json_ld:
1391             return json_ld
1392         if default is not NO_DEFAULT:
1393             return default
1394         elif fatal:
1395             raise RegexNotFoundError('Unable to extract JSON-LD')
1396         else:
1397             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1398             return {}
1399
1400     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1401         if isinstance(json_ld, compat_str):
1402             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1403         if not json_ld:
1404             return {}
1405         info = {}
1406         if not isinstance(json_ld, (list, tuple, dict)):
1407             return info
1408         if isinstance(json_ld, dict):
1409             json_ld = [json_ld]
1410
1411         INTERACTION_TYPE_MAP = {
1412             'CommentAction': 'comment',
1413             'AgreeAction': 'like',
1414             'DisagreeAction': 'dislike',
1415             'LikeAction': 'like',
1416             'DislikeAction': 'dislike',
1417             'ListenAction': 'view',
1418             'WatchAction': 'view',
1419             'ViewAction': 'view',
1420         }
1421
1422         def extract_interaction_type(e):
1423             interaction_type = e.get('interactionType')
1424             if isinstance(interaction_type, dict):
1425                 interaction_type = interaction_type.get('@type')
1426             return str_or_none(interaction_type)
1427
1428         def extract_interaction_statistic(e):
1429             interaction_statistic = e.get('interactionStatistic')
1430             if isinstance(interaction_statistic, dict):
1431                 interaction_statistic = [interaction_statistic]
1432             if not isinstance(interaction_statistic, list):
1433                 return
1434             for is_e in interaction_statistic:
1435                 if not isinstance(is_e, dict):
1436                     continue
1437                 if is_e.get('@type') != 'InteractionCounter':
1438                     continue
1439                 interaction_type = extract_interaction_type(is_e)
1440                 if not interaction_type:
1441                     continue
1442                 # For interaction count some sites provide string instead of
1443                 # an integer (as per spec) with non digit characters (e.g. ",")
1444                 # so extracting count with more relaxed str_to_int
1445                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1446                 if interaction_count is None:
1447                     continue
1448                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1449                 if not count_kind:
1450                     continue
1451                 count_key = '%s_count' % count_kind
1452                 if info.get(count_key) is not None:
1453                     continue
1454                 info[count_key] = interaction_count
1455
1456         def extract_chapter_information(e):
1457             chapters = [{
1458                 'title': part.get('name'),
1459                 'start_time': part.get('startOffset'),
1460                 'end_time': part.get('endOffset'),
1461             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1462             for idx, (last_c, current_c, next_c) in enumerate(zip(
1463                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1464                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1465                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1466                 if None in current_c.values():
1467                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1468                     return
1469             if chapters:
1470                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1471                 info['chapters'] = chapters
1472
1473         def extract_video_object(e):
1474             assert e['@type'] == 'VideoObject'
1475             author = e.get('author')
1476             info.update({
1477                 'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
1478                 'title': unescapeHTML(e.get('name')),
1479                 'description': unescapeHTML(e.get('description')),
1480                 'thumbnails': [{'url': url}
1481                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1482                                if url_or_none(url)],
1483                 'duration': parse_duration(e.get('duration')),
1484                 'timestamp': unified_timestamp(e.get('uploadDate')),
1485                 # author can be an instance of 'Organization' or 'Person' types.
1486                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1487                 # however some websites are using 'Text' type instead.
1488                 # 1. https://schema.org/VideoObject
1489                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1490                 'filesize': float_or_none(e.get('contentSize')),
1491                 'tbr': int_or_none(e.get('bitrate')),
1492                 'width': int_or_none(e.get('width')),
1493                 'height': int_or_none(e.get('height')),
1494                 'view_count': int_or_none(e.get('interactionCount')),
1495             })
1496             extract_interaction_statistic(e)
1497             extract_chapter_information(e)
1498
1499         def traverse_json_ld(json_ld, at_top_level=True):
1500             for e in json_ld:
1501                 if at_top_level and '@context' not in e:
1502                     continue
1503                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1504                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1505                     break
1506                 item_type = e.get('@type')
1507                 if expected_type is not None and expected_type != item_type:
1508                     continue
1509                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1510                 if rating is not None:
1511                     info['average_rating'] = rating
1512                 if item_type in ('TVEpisode', 'Episode'):
1513                     episode_name = unescapeHTML(e.get('name'))
1514                     info.update({
1515                         'episode': episode_name,
1516                         'episode_number': int_or_none(e.get('episodeNumber')),
1517                         'description': unescapeHTML(e.get('description')),
1518                     })
1519                     if not info.get('title') and episode_name:
1520                         info['title'] = episode_name
1521                     part_of_season = e.get('partOfSeason')
1522                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1523                         info.update({
1524                             'season': unescapeHTML(part_of_season.get('name')),
1525                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1526                         })
1527                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1528                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1529                         info['series'] = unescapeHTML(part_of_series.get('name'))
1530                 elif item_type == 'Movie':
1531                     info.update({
1532                         'title': unescapeHTML(e.get('name')),
1533                         'description': unescapeHTML(e.get('description')),
1534                         'duration': parse_duration(e.get('duration')),
1535                         'timestamp': unified_timestamp(e.get('dateCreated')),
1536                     })
1537                 elif item_type in ('Article', 'NewsArticle'):
1538                     info.update({
1539                         'timestamp': parse_iso8601(e.get('datePublished')),
1540                         'title': unescapeHTML(e.get('headline')),
1541                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1542                     })
1543                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1544                         extract_video_object(e['video'][0])
1545                     elif traverse_obj(e, ('subjectOf', 0, '@type')) == 'VideoObject':
1546                         extract_video_object(e['subjectOf'][0])
1547                 elif item_type == 'VideoObject':
1548                     extract_video_object(e)
1549                     if expected_type is None:
1550                         continue
1551                     else:
1552                         break
1553                 video = e.get('video')
1554                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1555                     extract_video_object(video)
1556                 if expected_type is None:
1557                     continue
1558                 else:
1559                     break
1560         traverse_json_ld(json_ld)
1561
1562         return filter_dict(info)
1563
1564     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1565         return self._parse_json(
1566             self._search_regex(
1567                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1568                 webpage, 'next.js data', fatal=fatal, **kw),
1569             video_id, transform_source=transform_source, fatal=fatal)
1570
1571     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1572         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1573         # not all website do this, but it can be changed
1574         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1575         rectx = re.escape(context_name)
1576         js, arg_keys, arg_vals = self._search_regex(
1577             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1578              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1579             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1580
1581         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1582
1583         for key, val in args.items():
1584             if val in ('undefined', 'void 0'):
1585                 args[key] = 'null'
1586
1587         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1588
1589     @staticmethod
1590     def _hidden_inputs(html):
1591         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1592         hidden_inputs = {}
1593         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1594             attrs = extract_attributes(input)
1595             if not input:
1596                 continue
1597             if attrs.get('type') not in ('hidden', 'submit'):
1598                 continue
1599             name = attrs.get('name') or attrs.get('id')
1600             value = attrs.get('value')
1601             if name and value is not None:
1602                 hidden_inputs[name] = value
1603         return hidden_inputs
1604
1605     def _form_hidden_inputs(self, form_id, html):
1606         form = self._search_regex(
1607             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1608             html, '%s form' % form_id, group='form')
1609         return self._hidden_inputs(form)
1610
    class FormatSort:
        # Matches one sort item: an optional '+' (reverse), a field name and an
        # optional ':' or '~' separator followed by a limit ('~' marks "closest")
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Default sort order; evaluate_params appends it after the fields given
        # by the user and by the extractor
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): presumably the youtube-dl compatible order; it is not
        # referenced by the methods in this part of the file - confirm usage
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration, read through _get_field_setting (which also
        # supplies and caches defaults for missing keys such as 'type', 'field',
        # 'convert', 'visible', 'order' and 'not_in_list').
        # This dict is a class attribute and is modified in place by
        # _get_field_setting, _resolve_field_value and evaluate_params.
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # Fields of type 'combined' expand to several underlying fields
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1694
1695         def __init__(self, ie, field_preference):
1696             self._order = []
1697             self.ydl = ie._downloader
1698             self.evaluate_params(self.ydl.params, field_preference)
1699             if ie.get_param('verbose'):
1700                 self.print_verbose_info(self.ydl.write_debug)
1701
1702         def _get_field_setting(self, field, key):
1703             if field not in self.settings:
1704                 if key in ('forced', 'priority'):
1705                     return False
1706                 self.ydl.deprecation_warning(
1707                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1708                     'and may be removed in a future version')
1709                 self.settings[field] = {}
1710             propObj = self.settings[field]
1711             if key not in propObj:
1712                 type = propObj.get('type')
1713                 if key == 'field':
1714                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1715                 elif key == 'convert':
1716                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1717                 else:
1718                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1719                 propObj[key] = default
1720             return propObj[key]
1721
1722         def _resolve_field_value(self, field, value, convertNone=False):
1723             if value is None:
1724                 if not convertNone:
1725                     return None
1726             else:
1727                 value = value.lower()
1728             conversion = self._get_field_setting(field, 'convert')
1729             if conversion == 'ignore':
1730                 return None
1731             if conversion == 'string':
1732                 return value
1733             elif conversion == 'float_none':
1734                 return float_or_none(value)
1735             elif conversion == 'bytes':
1736                 return FileDownloader.parse_bytes(value)
1737             elif conversion == 'order':
1738                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1739                 use_regex = self._get_field_setting(field, 'regex')
1740                 list_length = len(order_list)
1741                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1742                 if use_regex and value is not None:
1743                     for i, regex in enumerate(order_list):
1744                         if regex and re.match(regex, value):
1745                             return list_length - i
1746                     return list_length - empty_pos  # not in list
1747                 else:  # not regex or  value = None
1748                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1749             else:
1750                 if value.isnumeric():
1751                     return float(value)
1752                 else:
1753                     self.settings[field]['convert'] = 'string'
1754                     return value
1755
1756         def evaluate_params(self, params, sort_extractor):
1757             self._use_free_order = params.get('prefer_free_formats', False)
1758             self._sort_user = params.get('format_sort', [])
1759             self._sort_extractor = sort_extractor
1760
1761             def add_item(field, reverse, closest, limit_text):
1762                 field = field.lower()
1763                 if field in self._order:
1764                     return
1765                 self._order.append(field)
1766                 limit = self._resolve_field_value(field, limit_text)
1767                 data = {
1768                     'reverse': reverse,
1769                     'closest': False if limit is None else closest,
1770                     'limit_text': limit_text,
1771                     'limit': limit}
1772                 if field in self.settings:
1773                     self.settings[field].update(data)
1774                 else:
1775                     self.settings[field] = data
1776
1777             sort_list = (
1778                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1779                 + (tuple() if params.get('format_sort_force', False)
1780                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1781                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1782
1783             for item in sort_list:
1784                 match = re.match(self.regex, item)
1785                 if match is None:
1786                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1787                 field = match.group('field')
1788                 if field is None:
1789                     continue
1790                 if self._get_field_setting(field, 'type') == 'alias':
1791                     alias, field = field, self._get_field_setting(field, 'field')
1792                     if self._get_field_setting(alias, 'deprecated'):
1793                         self.ydl.deprecation_warning(
1794                             f'Format sorting alias {alias} is deprecated '
1795                             f'and may be removed in a future version. Please use {field} instead')
1796                 reverse = match.group('reverse') is not None
1797                 closest = match.group('separator') == '~'
1798                 limit_text = match.group('limit')
1799
1800                 has_limit = limit_text is not None
1801                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1802                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1803
1804                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1805                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1806                 limit_count = len(limits)
1807                 for (i, f) in enumerate(fields):
1808                     add_item(f, reverse, closest,
1809                              limits[i] if i < limit_count
1810                              else limits[0] if has_limit and not has_multiple_limits
1811                              else None)
1812
1813         def print_verbose_info(self, write_debug):
1814             if self._sort_user:
1815                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1816             if self._sort_extractor:
1817                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1818             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1819                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1820                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1821                               self._get_field_setting(field, 'limit_text'),
1822                               self._get_field_setting(field, 'limit'))
1823                 if self._get_field_setting(field, 'limit_text') is not None else '')
1824                 for field in self._order if self._get_field_setting(field, 'visible')]))
1825
1826         def _calculate_field_preference_from_value(self, format, field, type, value):
1827             reverse = self._get_field_setting(field, 'reverse')
1828             closest = self._get_field_setting(field, 'closest')
1829             limit = self._get_field_setting(field, 'limit')
1830
1831             if type == 'extractor':
1832                 maximum = self._get_field_setting(field, 'max')
1833                 if value is None or (maximum is not None and value >= maximum):
1834                     value = -1
1835             elif type == 'boolean':
1836                 in_list = self._get_field_setting(field, 'in_list')
1837                 not_in_list = self._get_field_setting(field, 'not_in_list')
1838                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1839             elif type == 'ordered':
1840                 value = self._resolve_field_value(field, value, True)
1841
1842             # try to convert to number
1843             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1844             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1845             if is_num:
1846                 value = val_num
1847
1848             return ((-10, 0) if value is None
1849                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1850                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1851                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1852                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1853                     else (-1, value, 0))
1854
1855         def _calculate_field_preference(self, format, field):
1856             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1857             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1858             if type == 'multiple':
1859                 type = 'field'  # Only 'field' is allowed in multiple for now
1860                 actual_fields = self._get_field_setting(field, 'field')
1861
1862                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1863             else:
1864                 value = get_value(field)
1865             return self._calculate_field_preference_from_value(format, field, type, value)
1866
1867         def calculate_preference(self, format):
1868             # Determine missing protocol
1869             if not format.get('protocol'):
1870                 format['protocol'] = determine_protocol(format)
1871
1872             # Determine missing ext
1873             if not format.get('ext') and 'url' in format:
1874                 format['ext'] = determine_ext(format['url'])
1875             if format.get('vcodec') == 'none':
1876                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1877                 format['video_ext'] = 'none'
1878             else:
1879                 format['video_ext'] = format['ext']
1880                 format['audio_ext'] = 'none'
1881             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1882             #    format['preference'] = -1000
1883
1884             # Determine missing bitrates
1885             if format.get('tbr') is None:
1886                 if format.get('vbr') is not None and format.get('abr') is not None:
1887                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1888             else:
1889                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1890                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1891                 if format.get('acodec') != 'none' and format.get('abr') is None:
1892                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1893
1894             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1895
1896     def _sort_formats(self, formats, field_preference=[]):
1897         if not formats:
1898             return
1899         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1900
1901     def _check_formats(self, formats, video_id):
1902         if formats:
1903             formats[:] = filter(
1904                 lambda f: self._is_valid_url(
1905                     f['url'], video_id,
1906                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1907                 formats)
1908
1909     @staticmethod
1910     def _remove_duplicate_formats(formats):
1911         format_urls = set()
1912         unique_formats = []
1913         for f in formats:
1914             if f['url'] not in format_urls:
1915                 format_urls.add(f['url'])
1916                 unique_formats.append(f)
1917         formats[:] = unique_formats
1918
1919     def _is_valid_url(self, url, video_id, item='video', headers={}):
1920         url = self._proto_relative_url(url, scheme='http:')
1921         # For now assume non HTTP(S) URLs always valid
1922         if not (url.startswith('http://') or url.startswith('https://')):
1923             return True
1924         try:
1925             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1926             return True
1927         except ExtractorError as e:
1928             self.to_screen(
1929                 '%s: %s URL is invalid, skipping: %s'
1930                 % (video_id, item, error_to_compat_str(e.cause)))
1931             return False
1932
1933     def http_scheme(self):
1934         """ Either "http:" or "https:", depending on the user's preferences """
1935         return (
1936             'http:'
1937             if self.get_param('prefer_insecure', False)
1938             else 'https:')
1939
1940     def _proto_relative_url(self, url, scheme=None):
1941         if url is None:
1942             return url
1943         if url.startswith('//'):
1944             if scheme is None:
1945                 scheme = self.http_scheme()
1946             return scheme + url
1947         else:
1948             return url
1949
1950     def _sleep(self, timeout, video_id, msg_template=None):
1951         if msg_template is None:
1952             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1953         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1954         self.to_screen(msg)
1955         time.sleep(timeout)
1956
1957     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1958                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1959                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1960         res = self._download_xml_handle(
1961             manifest_url, video_id, 'Downloading f4m manifest',
1962             'Unable to download f4m manifest',
1963             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1964             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1965             transform_source=transform_source,
1966             fatal=fatal, data=data, headers=headers, query=query)
1967         if res is False:
1968             return []
1969
1970         manifest, urlh = res
1971         manifest_url = urlh.geturl()
1972
1973         return self._parse_f4m_formats(
1974             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1975             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1976
1977     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1978                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1979                            fatal=True, m3u8_id=None):
1980         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1981             return []
1982
1983         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1984         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1985         if akamai_pv is not None and ';' in akamai_pv.text:
1986             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1987             if playerVerificationChallenge.strip() != '':
1988                 return []
1989
1990         formats = []
1991         manifest_version = '1.0'
1992         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1993         if not media_nodes:
1994             manifest_version = '2.0'
1995             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1996         # Remove unsupported DRM protected media from final formats
1997         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1998         media_nodes = remove_encrypted_media(media_nodes)
1999         if not media_nodes:
2000             return formats
2001
2002         manifest_base_url = get_base_url(manifest)
2003
2004         bootstrap_info = xpath_element(
2005             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2006             'bootstrap info', default=None)
2007
2008         vcodec = None
2009         mime_type = xpath_text(
2010             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2011             'base URL', default=None)
2012         if mime_type and mime_type.startswith('audio/'):
2013             vcodec = 'none'
2014
2015         for i, media_el in enumerate(media_nodes):
2016             tbr = int_or_none(media_el.attrib.get('bitrate'))
2017             width = int_or_none(media_el.attrib.get('width'))
2018             height = int_or_none(media_el.attrib.get('height'))
2019             format_id = join_nonempty(f4m_id, tbr or i)
2020             # If <bootstrapInfo> is present, the specified f4m is a
2021             # stream-level manifest, and only set-level manifests may refer to
2022             # external resources.  See section 11.4 and section 4 of F4M spec
2023             if bootstrap_info is None:
2024                 media_url = None
2025                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2026                 if manifest_version == '2.0':
2027                     media_url = media_el.attrib.get('href')
2028                 if media_url is None:
2029                     media_url = media_el.attrib.get('url')
2030                 if not media_url:
2031                     continue
2032                 manifest_url = (
2033                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2034                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2035                 # If media_url is itself a f4m manifest do the recursive extraction
2036                 # since bitrates in parent manifest (this one) and media_url manifest
2037                 # may differ leading to inability to resolve the format by requested
2038                 # bitrate in f4m downloader
2039                 ext = determine_ext(manifest_url)
2040                 if ext == 'f4m':
2041                     f4m_formats = self._extract_f4m_formats(
2042                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2043                         transform_source=transform_source, fatal=fatal)
2044                     # Sometimes stream-level manifest contains single media entry that
2045                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2046                     # At the same time parent's media entry in set-level manifest may
2047                     # contain it. We will copy it from parent in such cases.
2048                     if len(f4m_formats) == 1:
2049                         f = f4m_formats[0]
2050                         f.update({
2051                             'tbr': f.get('tbr') or tbr,
2052                             'width': f.get('width') or width,
2053                             'height': f.get('height') or height,
2054                             'format_id': f.get('format_id') if not tbr else format_id,
2055                             'vcodec': vcodec,
2056                         })
2057                     formats.extend(f4m_formats)
2058                     continue
2059                 elif ext == 'm3u8':
2060                     formats.extend(self._extract_m3u8_formats(
2061                         manifest_url, video_id, 'mp4', preference=preference,
2062                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2063                     continue
2064             formats.append({
2065                 'format_id': format_id,
2066                 'url': manifest_url,
2067                 'manifest_url': manifest_url,
2068                 'ext': 'flv' if bootstrap_info is not None else None,
2069                 'protocol': 'f4m',
2070                 'tbr': tbr,
2071                 'width': width,
2072                 'height': height,
2073                 'vcodec': vcodec,
2074                 'preference': preference,
2075                 'quality': quality,
2076             })
2077         return formats
2078
2079     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2080         return {
2081             'format_id': join_nonempty(m3u8_id, 'meta'),
2082             'url': m3u8_url,
2083             'ext': ext,
2084             'protocol': 'm3u8',
2085             'preference': preference - 100 if preference else -100,
2086             'quality': quality,
2087             'resolution': 'multiple',
2088             'format_note': 'Quality selection URL',
2089         }
2090
2091     def _report_ignoring_subs(self, name):
2092         self.report_warning(bug_reports_message(
2093             f'Ignoring subtitle tracks found in the {name} manifest; '
2094             'if any subtitle tracks are missing,'
2095         ), only_once=True)
2096
2097     def _extract_m3u8_formats(self, *args, **kwargs):
2098         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2099         if subs:
2100             self._report_ignoring_subs('HLS')
2101         return fmts
2102
2103     def _extract_m3u8_formats_and_subtitles(
2104             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2105             preference=None, quality=None, m3u8_id=None, note=None,
2106             errnote=None, fatal=True, live=False, data=None, headers={},
2107             query={}):
2108
2109         res = self._download_webpage_handle(
2110             m3u8_url, video_id,
2111             note='Downloading m3u8 information' if note is None else note,
2112             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2113             fatal=fatal, data=data, headers=headers, query=query)
2114
2115         if res is False:
2116             return [], {}
2117
2118         m3u8_doc, urlh = res
2119         m3u8_url = urlh.geturl()
2120
2121         return self._parse_m3u8_formats_and_subtitles(
2122             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2123             preference=preference, quality=quality, m3u8_id=m3u8_id,
2124             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2125             headers=headers, query=query, video_id=video_id)
2126
2127     def _parse_m3u8_formats_and_subtitles(
2128             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2129             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2130             errnote=None, fatal=True, data=None, headers={}, query={},
2131             video_id=None):
2132         formats, subtitles = [], {}
2133
2134         has_drm = re.search('|'.join([
2135             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2136             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2137         ]), m3u8_doc)
2138
2139         def format_url(url):
2140             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2141
2142         if self.get_param('hls_split_discontinuity', False):
2143             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2144                 if not m3u8_doc:
2145                     if not manifest_url:
2146                         return []
2147                     m3u8_doc = self._download_webpage(
2148                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2149                         note=False, errnote='Failed to download m3u8 playlist information')
2150                     if m3u8_doc is False:
2151                         return []
2152                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2153
2154         else:
2155             def _extract_m3u8_playlist_indices(*args, **kwargs):
2156                 return [None]
2157
2158         # References:
2159         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2160         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2161         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2162
2163         # We should try extracting formats only from master playlists [1, 4.3.4],
2164         # i.e. playlists that describe available qualities. On the other hand
2165         # media playlists [1, 4.3.3] should be returned as is since they contain
2166         # just the media without qualities renditions.
2167         # Fortunately, master playlist can be easily distinguished from media
2168         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2169         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2170         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2171         # media playlist and MUST NOT appear in master playlist thus we can
2172         # clearly detect media playlist with this criterion.
2173
2174         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2175             formats = [{
2176                 'format_id': join_nonempty(m3u8_id, idx),
2177                 'format_index': idx,
2178                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2179                 'ext': ext,
2180                 'protocol': entry_protocol,
2181                 'preference': preference,
2182                 'quality': quality,
2183                 'has_drm': has_drm,
2184             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2185
2186             return formats, subtitles
2187
2188         groups = {}
2189         last_stream_inf = {}
2190
2191         def extract_media(x_media_line):
2192             media = parse_m3u8_attributes(x_media_line)
2193             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2194             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2195             if not (media_type and group_id and name):
2196                 return
2197             groups.setdefault(group_id, []).append(media)
2198             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2199             if media_type == 'SUBTITLES':
2200                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2201                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2202                 # However, lack of URI has been spotted in the wild.
2203                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2204                 if not media.get('URI'):
2205                     return
2206                 url = format_url(media['URI'])
2207                 sub_info = {
2208                     'url': url,
2209                     'ext': determine_ext(url),
2210                 }
2211                 if sub_info['ext'] == 'm3u8':
2212                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2213                     # files may contain is WebVTT:
2214                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2215                     sub_info['ext'] = 'vtt'
2216                     sub_info['protocol'] = 'm3u8_native'
2217                 lang = media.get('LANGUAGE') or 'und'
2218                 subtitles.setdefault(lang, []).append(sub_info)
2219             if media_type not in ('VIDEO', 'AUDIO'):
2220                 return
2221             media_url = media.get('URI')
2222             if media_url:
2223                 manifest_url = format_url(media_url)
2224                 formats.extend({
2225                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2226                     'format_note': name,
2227                     'format_index': idx,
2228                     'url': manifest_url,
2229                     'manifest_url': m3u8_url,
2230                     'language': media.get('LANGUAGE'),
2231                     'ext': ext,
2232                     'protocol': entry_protocol,
2233                     'preference': preference,
2234                     'quality': quality,
2235                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2236                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2237
2238         def build_stream_name():
2239             # Despite specification does not mention NAME attribute for
2240             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2241             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2242             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2243             stream_name = last_stream_inf.get('NAME')
2244             if stream_name:
2245                 return stream_name
2246             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2247             # from corresponding rendition group
2248             stream_group_id = last_stream_inf.get('VIDEO')
2249             if not stream_group_id:
2250                 return
2251             stream_group = groups.get(stream_group_id)
2252             if not stream_group:
2253                 return stream_group_id
2254             rendition = stream_group[0]
2255             return rendition.get('NAME') or stream_group_id
2256
2257         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2258         # chance to detect video only formats when EXT-X-STREAM-INF tags
2259         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2260         for line in m3u8_doc.splitlines():
2261             if line.startswith('#EXT-X-MEDIA:'):
2262                 extract_media(line)
2263
2264         for line in m3u8_doc.splitlines():
2265             if line.startswith('#EXT-X-STREAM-INF:'):
2266                 last_stream_inf = parse_m3u8_attributes(line)
2267             elif line.startswith('#') or not line.strip():
2268                 continue
2269             else:
2270                 tbr = float_or_none(
2271                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2272                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2273                 manifest_url = format_url(line.strip())
2274
2275                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2276                     format_id = [m3u8_id, None, idx]
2277                     # Bandwidth of live streams may differ over time thus making
2278                     # format_id unpredictable. So it's better to keep provided
2279                     # format_id intact.
2280                     if not live:
2281                         stream_name = build_stream_name()
2282                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2283                     f = {
2284                         'format_id': join_nonempty(*format_id),
2285                         'format_index': idx,
2286                         'url': manifest_url,
2287                         'manifest_url': m3u8_url,
2288                         'tbr': tbr,
2289                         'ext': ext,
2290                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2291                         'protocol': entry_protocol,
2292                         'preference': preference,
2293                         'quality': quality,
2294                     }
2295                     resolution = last_stream_inf.get('RESOLUTION')
2296                     if resolution:
2297                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2298                         if mobj:
2299                             f['width'] = int(mobj.group('width'))
2300                             f['height'] = int(mobj.group('height'))
2301                     # Unified Streaming Platform
2302                     mobj = re.search(
2303                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2304                     if mobj:
2305                         abr, vbr = mobj.groups()
2306                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2307                         f.update({
2308                             'vbr': vbr,
2309                             'abr': abr,
2310                         })
2311                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2312                     f.update(codecs)
2313                     audio_group_id = last_stream_inf.get('AUDIO')
2314                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2315                     # references a rendition group MUST have a CODECS attribute.
2316                     # However, this is not always respected, for example, [2]
2317                     # contains EXT-X-STREAM-INF tag which references AUDIO
2318                     # rendition group but does not have CODECS and despite
2319                     # referencing an audio group it represents a complete
2320                     # (with audio and video) format. So, for such cases we will
2321                     # ignore references to rendition groups and treat them
2322                     # as complete formats.
2323                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2324                         audio_group = groups.get(audio_group_id)
2325                         if audio_group and audio_group[0].get('URI'):
2326                             # TODO: update acodec for audio only formats with
2327                             # the same GROUP-ID
2328                             f['acodec'] = 'none'
2329                     if not f.get('ext'):
2330                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2331                     formats.append(f)
2332
2333                     # for DailyMotion
2334                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2335                     if progressive_uri:
2336                         http_f = f.copy()
2337                         del http_f['manifest_url']
2338                         http_f.update({
2339                             'format_id': f['format_id'].replace('hls-', 'http-'),
2340                             'protocol': 'http',
2341                             'url': progressive_uri,
2342                         })
2343                         formats.append(http_f)
2344
2345                 last_stream_inf = {}
2346         return formats, subtitles
2347
2348     def _extract_m3u8_vod_duration(
2349             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2350
2351         m3u8_vod = self._download_webpage(
2352             m3u8_vod_url, video_id,
2353             note='Downloading m3u8 VOD manifest' if note is None else note,
2354             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2355             fatal=False, data=data, headers=headers, query=query)
2356
2357         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2358
2359     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2360         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2361             return None
2362
2363         return int(sum(
2364             float(line[len('#EXTINF:'):].split(',')[0])
2365             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2366
2367     @staticmethod
2368     def _xpath_ns(path, namespace=None):
2369         if not namespace:
2370             return path
2371         out = []
2372         for c in path.split('/'):
2373             if not c or c == '.':
2374                 out.append(c)
2375             else:
2376                 out.append('{%s}%s' % (namespace, c))
2377         return '/'.join(out)
2378
2379     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2380         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2381         if res is False:
2382             assert not fatal
2383             return [], {}
2384
2385         smil, urlh = res
2386         smil_url = urlh.geturl()
2387
2388         namespace = self._parse_smil_namespace(smil)
2389
2390         fmts = self._parse_smil_formats(
2391             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2392         subs = self._parse_smil_subtitles(
2393             smil, namespace=namespace)
2394
2395         return fmts, subs
2396
2397     def _extract_smil_formats(self, *args, **kwargs):
2398         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2399         if subs:
2400             self._report_ignoring_subs('SMIL')
2401         return fmts
2402
2403     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2404         res = self._download_smil(smil_url, video_id, fatal=fatal)
2405         if res is False:
2406             return {}
2407
2408         smil, urlh = res
2409         smil_url = urlh.geturl()
2410
2411         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2412
2413     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2414         return self._download_xml_handle(
2415             smil_url, video_id, 'Downloading SMIL file',
2416             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2417
2418     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2419         namespace = self._parse_smil_namespace(smil)
2420
2421         formats = self._parse_smil_formats(
2422             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2423         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2424
2425         video_id = os.path.splitext(url_basename(smil_url))[0]
2426         title = None
2427         description = None
2428         upload_date = None
2429         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2430             name = meta.attrib.get('name')
2431             content = meta.attrib.get('content')
2432             if not name or not content:
2433                 continue
2434             if not title and name == 'title':
2435                 title = content
2436             elif not description and name in ('description', 'abstract'):
2437                 description = content
2438             elif not upload_date and name == 'date':
2439                 upload_date = unified_strdate(content)
2440
2441         thumbnails = [{
2442             'id': image.get('type'),
2443             'url': image.get('src'),
2444             'width': int_or_none(image.get('width')),
2445             'height': int_or_none(image.get('height')),
2446         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2447
2448         return {
2449             'id': video_id,
2450             'title': title or video_id,
2451             'description': description,
2452             'upload_date': upload_date,
2453             'thumbnails': thumbnails,
2454             'formats': formats,
2455             'subtitles': subtitles,
2456         }
2457
2458     def _parse_smil_namespace(self, smil):
2459         return self._search_regex(
2460             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2461
2462     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2463         base = smil_url
2464         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2465             b = meta.get('base') or meta.get('httpBase')
2466             if b:
2467                 base = b
2468                 break
2469
2470         formats = []
2471         rtmp_count = 0
2472         http_count = 0
2473         m3u8_count = 0
2474         imgs_count = 0
2475
2476         srcs = set()
2477         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2478         for medium in media:
2479             src = medium.get('src')
2480             if not src or src in srcs:
2481                 continue
2482             srcs.add(src)
2483
2484             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2485             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2486             width = int_or_none(medium.get('width'))
2487             height = int_or_none(medium.get('height'))
2488             proto = medium.get('proto')
2489             ext = medium.get('ext')
2490             src_ext = determine_ext(src)
2491             streamer = medium.get('streamer') or base
2492
2493             if proto == 'rtmp' or streamer.startswith('rtmp'):
2494                 rtmp_count += 1
2495                 formats.append({
2496                     'url': streamer,
2497                     'play_path': src,
2498                     'ext': 'flv',
2499                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2500                     'tbr': bitrate,
2501                     'filesize': filesize,
2502                     'width': width,
2503                     'height': height,
2504                 })
2505                 if transform_rtmp_url:
2506                     streamer, src = transform_rtmp_url(streamer, src)
2507                     formats[-1].update({
2508                         'url': streamer,
2509                         'play_path': src,
2510                     })
2511                 continue
2512
2513             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2514             src_url = src_url.strip()
2515
2516             if proto == 'm3u8' or src_ext == 'm3u8':
2517                 m3u8_formats = self._extract_m3u8_formats(
2518                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2519                 if len(m3u8_formats) == 1:
2520                     m3u8_count += 1
2521                     m3u8_formats[0].update({
2522                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2523                         'tbr': bitrate,
2524                         'width': width,
2525                         'height': height,
2526                     })
2527                 formats.extend(m3u8_formats)
2528             elif src_ext == 'f4m':
2529                 f4m_url = src_url
2530                 if not f4m_params:
2531                     f4m_params = {
2532                         'hdcore': '3.2.0',
2533                         'plugin': 'flowplayer-3.2.0.1',
2534                     }
2535                 f4m_url += '&' if '?' in f4m_url else '?'
2536                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2537                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2538             elif src_ext == 'mpd':
2539                 formats.extend(self._extract_mpd_formats(
2540                     src_url, video_id, mpd_id='dash', fatal=False))
2541             elif re.search(r'\.ism/[Mm]anifest', src_url):
2542                 formats.extend(self._extract_ism_formats(
2543                     src_url, video_id, ism_id='mss', fatal=False))
2544             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2545                 http_count += 1
2546                 formats.append({
2547                     'url': src_url,
2548                     'ext': ext or src_ext or 'flv',
2549                     'format_id': 'http-%d' % (bitrate or http_count),
2550                     'tbr': bitrate,
2551                     'filesize': filesize,
2552                     'width': width,
2553                     'height': height,
2554                 })
2555
2556         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2557             src = medium.get('src')
2558             if not src or src in srcs:
2559                 continue
2560             srcs.add(src)
2561
2562             imgs_count += 1
2563             formats.append({
2564                 'format_id': 'imagestream-%d' % (imgs_count),
2565                 'url': src,
2566                 'ext': mimetype2ext(medium.get('type')),
2567                 'acodec': 'none',
2568                 'vcodec': 'none',
2569                 'width': int_or_none(medium.get('width')),
2570                 'height': int_or_none(medium.get('height')),
2571                 'format_note': 'SMIL storyboards',
2572             })
2573
2574         return formats
2575
2576     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2577         urls = []
2578         subtitles = {}
2579         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2580             src = textstream.get('src')
2581             if not src or src in urls:
2582                 continue
2583             urls.append(src)
2584             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2585             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2586             subtitles.setdefault(lang, []).append({
2587                 'url': src,
2588                 'ext': ext,
2589             })
2590         return subtitles
2591
2592     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2593         res = self._download_xml_handle(
2594             xspf_url, playlist_id, 'Downloading xpsf playlist',
2595             'Unable to download xspf manifest', fatal=fatal)
2596         if res is False:
2597             return []
2598
2599         xspf, urlh = res
2600         xspf_url = urlh.geturl()
2601
2602         return self._parse_xspf(
2603             xspf, playlist_id, xspf_url=xspf_url,
2604             xspf_base_url=base_url(xspf_url))
2605
2606     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2607         NS_MAP = {
2608             'xspf': 'http://xspf.org/ns/0/',
2609             's1': 'http://static.streamone.nl/player/ns/0',
2610         }
2611
2612         entries = []
2613         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2614             title = xpath_text(
2615                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2616             description = xpath_text(
2617                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2618             thumbnail = xpath_text(
2619                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2620             duration = float_or_none(
2621                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2622
2623             formats = []
2624             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2625                 format_url = urljoin(xspf_base_url, location.text)
2626                 if not format_url:
2627                     continue
2628                 formats.append({
2629                     'url': format_url,
2630                     'manifest_url': xspf_url,
2631                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2632                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2633                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2634                 })
2635             self._sort_formats(formats)
2636
2637             entries.append({
2638                 'id': playlist_id,
2639                 'title': title,
2640                 'description': description,
2641                 'thumbnail': thumbnail,
2642                 'duration': duration,
2643                 'formats': formats,
2644             })
2645         return entries
2646
2647     def _extract_mpd_formats(self, *args, **kwargs):
2648         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2649         if subs:
2650             self._report_ignoring_subs('DASH')
2651         return fmts
2652
2653     def _extract_mpd_formats_and_subtitles(
2654             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2655             fatal=True, data=None, headers={}, query={}):
2656         res = self._download_xml_handle(
2657             mpd_url, video_id,
2658             note='Downloading MPD manifest' if note is None else note,
2659             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2660             fatal=fatal, data=data, headers=headers, query=query)
2661         if res is False:
2662             return [], {}
2663         mpd_doc, urlh = res
2664         if mpd_doc is None:
2665             return [], {}
2666
2667         # We could have been redirected to a new url when we retrieved our mpd file.
2668         mpd_url = urlh.geturl()
2669         mpd_base_url = base_url(mpd_url)
2670
2671         return self._parse_mpd_formats_and_subtitles(
2672             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2673
2674     def _parse_mpd_formats(self, *args, **kwargs):
2675         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2676         if subs:
2677             self._report_ignoring_subs('DASH')
2678         return fmts
2679
2680     def _parse_mpd_formats_and_subtitles(
2681             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2682         """
2683         Parse formats from MPD manifest.
2684         References:
2685          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2686             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2687          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2688         """
2689         if not self.get_param('dynamic_mpd', True):
2690             if mpd_doc.get('type') == 'dynamic':
2691                 return [], {}
2692
2693         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2694
2695         def _add_ns(path):
2696             return self._xpath_ns(path, namespace)
2697
2698         def is_drm_protected(element):
2699             return element.find(_add_ns('ContentProtection')) is not None
2700
2701         def extract_multisegment_info(element, ms_parent_info):
2702             ms_info = ms_parent_info.copy()
2703
2704             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2705             # common attributes and elements.  We will only extract relevant
2706             # for us.
2707             def extract_common(source):
2708                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2709                 if segment_timeline is not None:
2710                     s_e = segment_timeline.findall(_add_ns('S'))
2711                     if s_e:
2712                         ms_info['total_number'] = 0
2713                         ms_info['s'] = []
2714                         for s in s_e:
2715                             r = int(s.get('r', 0))
2716                             ms_info['total_number'] += 1 + r
2717                             ms_info['s'].append({
2718                                 't': int(s.get('t', 0)),
2719                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2720                                 'd': int(s.attrib['d']),
2721                                 'r': r,
2722                             })
2723                 start_number = source.get('startNumber')
2724                 if start_number:
2725                     ms_info['start_number'] = int(start_number)
2726                 timescale = source.get('timescale')
2727                 if timescale:
2728                     ms_info['timescale'] = int(timescale)
2729                 segment_duration = source.get('duration')
2730                 if segment_duration:
2731                     ms_info['segment_duration'] = float(segment_duration)
2732
2733             def extract_Initialization(source):
2734                 initialization = source.find(_add_ns('Initialization'))
2735                 if initialization is not None:
2736                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2737
2738             segment_list = element.find(_add_ns('SegmentList'))
2739             if segment_list is not None:
2740                 extract_common(segment_list)
2741                 extract_Initialization(segment_list)
2742                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2743                 if segment_urls_e:
2744                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2745             else:
2746                 segment_template = element.find(_add_ns('SegmentTemplate'))
2747                 if segment_template is not None:
2748                     extract_common(segment_template)
2749                     media = segment_template.get('media')
2750                     if media:
2751                         ms_info['media'] = media
2752                     initialization = segment_template.get('initialization')
2753                     if initialization:
2754                         ms_info['initialization'] = initialization
2755                     else:
2756                         extract_Initialization(segment_template)
2757             return ms_info
2758
2759         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2760         formats, subtitles = [], {}
2761         stream_numbers = collections.defaultdict(int)
2762         for period in mpd_doc.findall(_add_ns('Period')):
2763             period_duration = parse_duration(period.get('duration')) or mpd_duration
2764             period_ms_info = extract_multisegment_info(period, {
2765                 'start_number': 1,
2766                 'timescale': 1,
2767             })
2768             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2769                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2770                 for representation in adaptation_set.findall(_add_ns('Representation')):
2771                     representation_attrib = adaptation_set.attrib.copy()
2772                     representation_attrib.update(representation.attrib)
2773                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2774                     mime_type = representation_attrib['mimeType']
2775                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2776
2777                     codec_str = representation_attrib.get('codecs', '')
2778                     # Some kind of binary subtitle found in some youtube livestreams
2779                     if mime_type == 'application/x-rawcc':
2780                         codecs = {'scodec': codec_str}
2781                     else:
2782                         codecs = parse_codecs(codec_str)
2783                     if content_type not in ('video', 'audio', 'text'):
2784                         if mime_type == 'image/jpeg':
2785                             content_type = mime_type
2786                         elif codecs.get('vcodec', 'none') != 'none':
2787                             content_type = 'video'
2788                         elif codecs.get('acodec', 'none') != 'none':
2789                             content_type = 'audio'
2790                         elif codecs.get('scodec', 'none') != 'none':
2791                             content_type = 'text'
2792                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2793                             content_type = 'text'
2794                         else:
2795                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2796                             continue
2797
2798                     base_url = ''
2799                     for element in (representation, adaptation_set, period, mpd_doc):
2800                         base_url_e = element.find(_add_ns('BaseURL'))
2801                         if base_url_e is not None:
2802                             base_url = base_url_e.text + base_url
2803                             if re.match(r'^https?://', base_url):
2804                                 break
2805                     if mpd_base_url and base_url.startswith('/'):
2806                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2807                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2808                         if not mpd_base_url.endswith('/'):
2809                             mpd_base_url += '/'
2810                         base_url = mpd_base_url + base_url
2811                     representation_id = representation_attrib.get('id')
2812                     lang = representation_attrib.get('lang')
2813                     url_el = representation.find(_add_ns('BaseURL'))
2814                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2815                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2816                     if representation_id is not None:
2817                         format_id = representation_id
2818                     else:
2819                         format_id = content_type
2820                     if mpd_id:
2821                         format_id = mpd_id + '-' + format_id
2822                     if content_type in ('video', 'audio'):
2823                         f = {
2824                             'format_id': format_id,
2825                             'manifest_url': mpd_url,
2826                             'ext': mimetype2ext(mime_type),
2827                             'width': int_or_none(representation_attrib.get('width')),
2828                             'height': int_or_none(representation_attrib.get('height')),
2829                             'tbr': float_or_none(bandwidth, 1000),
2830                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2831                             'fps': int_or_none(representation_attrib.get('frameRate')),
2832                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2833                             'format_note': 'DASH %s' % content_type,
2834                             'filesize': filesize,
2835                             'container': mimetype2ext(mime_type) + '_dash',
2836                             **codecs
2837                         }
2838                     elif content_type == 'text':
2839                         f = {
2840                             'ext': mimetype2ext(mime_type),
2841                             'manifest_url': mpd_url,
2842                             'filesize': filesize,
2843                         }
2844                     elif content_type == 'image/jpeg':
2845                         # See test case in VikiIE
2846                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2847                         f = {
2848                             'format_id': format_id,
2849                             'ext': 'mhtml',
2850                             'manifest_url': mpd_url,
2851                             'format_note': 'DASH storyboards (jpeg)',
2852                             'acodec': 'none',
2853                             'vcodec': 'none',
2854                         }
2855                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2856                         f['has_drm'] = True
2857                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2858
2859                     def prepare_template(template_name, identifiers):
2860                         tmpl = representation_ms_info[template_name]
2861                         # First of, % characters outside $...$ templates
2862                         # must be escaped by doubling for proper processing
2863                         # by % operator string formatting used further (see
2864                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2865                         t = ''
2866                         in_template = False
2867                         for c in tmpl:
2868                             t += c
2869                             if c == '$':
2870                                 in_template = not in_template
2871                             elif c == '%' and not in_template:
2872                                 t += c
2873                         # Next, $...$ templates are translated to their
2874                         # %(...) counterparts to be used with % operator
2875                         if representation_id is not None:
2876                             t = t.replace('$RepresentationID$', representation_id)
2877                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2878                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2879                         t.replace('$$', '$')
2880                         return t
2881
2882                     # @initialization is a regular template like @media one
2883                     # so it should be handled just the same way (see
2884                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2885                     if 'initialization' in representation_ms_info:
2886                         initialization_template = prepare_template(
2887                             'initialization',
2888                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2889                             # $Time$ shall not be included for @initialization thus
2890                             # only $Bandwidth$ remains
2891                             ('Bandwidth', ))
2892                         representation_ms_info['initialization_url'] = initialization_template % {
2893                             'Bandwidth': bandwidth,
2894                         }
2895
2896                     def location_key(location):
2897                         return 'url' if re.match(r'^https?://', location) else 'path'
2898
2899                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2900
2901                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2902                         media_location_key = location_key(media_template)
2903
2904                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2905                         # can't be used at the same time
2906                         if '%(Number' in media_template and 's' not in representation_ms_info:
2907                             segment_duration = None
2908                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2909                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2910                                 representation_ms_info['total_number'] = int(math.ceil(
2911                                     float_or_none(period_duration, segment_duration, default=0)))
2912                             representation_ms_info['fragments'] = [{
2913                                 media_location_key: media_template % {
2914                                     'Number': segment_number,
2915                                     'Bandwidth': bandwidth,
2916                                 },
2917                                 'duration': segment_duration,
2918                             } for segment_number in range(
2919                                 representation_ms_info['start_number'],
2920                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2921                         else:
2922                             # $Number*$ or $Time$ in media template with S list available
2923                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2924                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2925                             representation_ms_info['fragments'] = []
2926                             segment_time = 0
2927                             segment_d = None
2928                             segment_number = representation_ms_info['start_number']
2929
2930                             def add_segment_url():
2931                                 segment_url = media_template % {
2932                                     'Time': segment_time,
2933                                     'Bandwidth': bandwidth,
2934                                     'Number': segment_number,
2935                                 }
2936                                 representation_ms_info['fragments'].append({
2937                                     media_location_key: segment_url,
2938                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2939                                 })
2940
2941                             for num, s in enumerate(representation_ms_info['s']):
2942                                 segment_time = s.get('t') or segment_time
2943                                 segment_d = s['d']
2944                                 add_segment_url()
2945                                 segment_number += 1
2946                                 for r in range(s.get('r', 0)):
2947                                     segment_time += segment_d
2948                                     add_segment_url()
2949                                     segment_number += 1
2950                                 segment_time += segment_d
2951                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2952                         # No media template
2953                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2954                         # or any YouTube dashsegments video
2955                         fragments = []
2956                         segment_index = 0
2957                         timescale = representation_ms_info['timescale']
2958                         for s in representation_ms_info['s']:
2959                             duration = float_or_none(s['d'], timescale)
2960                             for r in range(s.get('r', 0) + 1):
2961                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2962                                 fragments.append({
2963                                     location_key(segment_uri): segment_uri,
2964                                     'duration': duration,
2965                                 })
2966                                 segment_index += 1
2967                         representation_ms_info['fragments'] = fragments
2968                     elif 'segment_urls' in representation_ms_info:
2969                         # Segment URLs with no SegmentTimeline
2970                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2971                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2972                         fragments = []
2973                         segment_duration = float_or_none(
2974                             representation_ms_info['segment_duration'],
2975                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2976                         for segment_url in representation_ms_info['segment_urls']:
2977                             fragment = {
2978                                 location_key(segment_url): segment_url,
2979                             }
2980                             if segment_duration:
2981                                 fragment['duration'] = segment_duration
2982                             fragments.append(fragment)
2983                         representation_ms_info['fragments'] = fragments
2984                     # If there is a fragments key available then we correctly recognized fragmented media.
2985                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2986                     # assumption is not necessarily correct since we may simply have no support for
2987                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2988                     if 'fragments' in representation_ms_info:
2989                         f.update({
2990                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2991                             'url': mpd_url or base_url,
2992                             'fragment_base_url': base_url,
2993                             'fragments': [],
2994                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2995                         })
2996                         if 'initialization_url' in representation_ms_info:
2997                             initialization_url = representation_ms_info['initialization_url']
2998                             if not f.get('url'):
2999                                 f['url'] = initialization_url
3000                             f['fragments'].append({location_key(initialization_url): initialization_url})
3001                         f['fragments'].extend(representation_ms_info['fragments'])
3002                         if not period_duration:
3003                             period_duration = try_get(
3004                                 representation_ms_info,
3005                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3006                     else:
3007                         # Assuming direct URL to unfragmented media.
3008                         f['url'] = base_url
3009                     if content_type in ('video', 'audio', 'image/jpeg'):
3010                         f['manifest_stream_number'] = stream_numbers[f['url']]
3011                         stream_numbers[f['url']] += 1
3012                         formats.append(f)
3013                     elif content_type == 'text':
3014                         subtitles.setdefault(lang or 'und', []).append(f)
3015
3016         return formats, subtitles
3017
3018     def _extract_ism_formats(self, *args, **kwargs):
3019         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3020         if subs:
3021             self._report_ignoring_subs('ISM')
3022         return fmts
3023
3024     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3025         res = self._download_xml_handle(
3026             ism_url, video_id,
3027             note='Downloading ISM manifest' if note is None else note,
3028             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3029             fatal=fatal, data=data, headers=headers, query=query)
3030         if res is False:
3031             return [], {}
3032         ism_doc, urlh = res
3033         if ism_doc is None:
3034             return [], {}
3035
3036         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3037
3038     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3039         """
3040         Parse formats from ISM manifest.
3041         References:
3042          1. [MS-SSTR]: Smooth Streaming Protocol,
3043             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3044         """
3045         if ism_doc.get('IsLive') == 'TRUE':
3046             return [], {}
3047
3048         duration = int(ism_doc.attrib['Duration'])
3049         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3050
3051         formats = []
3052         subtitles = {}
3053         for stream in ism_doc.findall('StreamIndex'):
3054             stream_type = stream.get('Type')
3055             if stream_type not in ('video', 'audio', 'text'):
3056                 continue
3057             url_pattern = stream.attrib['Url']
3058             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3059             stream_name = stream.get('Name')
3060             stream_language = stream.get('Language', 'und')
3061             for track in stream.findall('QualityLevel'):
3062                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3063                 # TODO: add support for WVC1 and WMAP
3064                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3065                     self.report_warning('%s is not a supported codec' % fourcc)
3066                     continue
3067                 tbr = int(track.attrib['Bitrate']) // 1000
3068                 # [1] does not mention Width and Height attributes. However,
3069                 # they're often present while MaxWidth and MaxHeight are
3070                 # missing, so should be used as fallbacks
3071                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3072                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3073                 sampling_rate = int_or_none(track.get('SamplingRate'))
3074
3075                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3076                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3077
3078                 fragments = []
3079                 fragment_ctx = {
3080                     'time': 0,
3081                 }
3082                 stream_fragments = stream.findall('c')
3083                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3084                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3085                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3086                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3087                     if not fragment_ctx['duration']:
3088                         try:
3089                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3090                         except IndexError:
3091                             next_fragment_time = duration
3092                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3093                     for _ in range(fragment_repeat):
3094                         fragments.append({
3095                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3096                             'duration': fragment_ctx['duration'] / stream_timescale,
3097                         })
3098                         fragment_ctx['time'] += fragment_ctx['duration']
3099
3100                 if stream_type == 'text':
3101                     subtitles.setdefault(stream_language, []).append({
3102                         'ext': 'ismt',
3103                         'protocol': 'ism',
3104                         'url': ism_url,
3105                         'manifest_url': ism_url,
3106                         'fragments': fragments,
3107                         '_download_params': {
3108                             'stream_type': stream_type,
3109                             'duration': duration,
3110                             'timescale': stream_timescale,
3111                             'fourcc': fourcc,
3112                             'language': stream_language,
3113                             'codec_private_data': track.get('CodecPrivateData'),
3114                         }
3115                     })
3116                 elif stream_type in ('video', 'audio'):
3117                     formats.append({
3118                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3119                         'url': ism_url,
3120                         'manifest_url': ism_url,
3121                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3122                         'width': width,
3123                         'height': height,
3124                         'tbr': tbr,
3125                         'asr': sampling_rate,
3126                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3127                         'acodec': 'none' if stream_type == 'video' else fourcc,
3128                         'protocol': 'ism',
3129                         'fragments': fragments,
3130                         'has_drm': ism_doc.find('Protection') is not None,
3131                         '_download_params': {
3132                             'stream_type': stream_type,
3133                             'duration': duration,
3134                             'timescale': stream_timescale,
3135                             'width': width or 0,
3136                             'height': height or 0,
3137                             'fourcc': fourcc,
3138                             'language': stream_language,
3139                             'codec_private_data': track.get('CodecPrivateData'),
3140                             'sampling_rate': sampling_rate,
3141                             'channels': int_or_none(track.get('Channels', 2)),
3142                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3143                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3144                         },
3145                     })
3146         return formats, subtitles
3147
3148     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3149         def absolute_url(item_url):
3150             return urljoin(base_url, item_url)
3151
3152         def parse_content_type(content_type):
3153             if not content_type:
3154                 return {}
3155             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3156             if ctr:
3157                 mimetype, codecs = ctr.groups()
3158                 f = parse_codecs(codecs)
3159                 f['ext'] = mimetype2ext(mimetype)
3160                 return f
3161             return {}
3162
3163         def _media_formats(src, cur_media_type, type_info=None):
3164             type_info = type_info or {}
3165             full_url = absolute_url(src)
3166             ext = type_info.get('ext') or determine_ext(full_url)
3167             if ext == 'm3u8':
3168                 is_plain_url = False
3169                 formats = self._extract_m3u8_formats(
3170                     full_url, video_id, ext='mp4',
3171                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3172                     preference=preference, quality=quality, fatal=False)
3173             elif ext == 'mpd':
3174                 is_plain_url = False
3175                 formats = self._extract_mpd_formats(
3176                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3177             else:
3178                 is_plain_url = True
3179                 formats = [{
3180                     'url': full_url,
3181                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3182                     'ext': ext,
3183                 }]
3184             return is_plain_url, formats
3185
3186         entries = []
3187         # amp-video and amp-audio are very similar to their HTML5 counterparts
3188         # so we wll include them right here (see
3189         # https://www.ampproject.org/docs/reference/components/amp-video)
3190         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3191         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3192         media_tags = [(media_tag, media_tag_name, media_type, '')
3193                       for media_tag, media_tag_name, media_type
3194                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3195         media_tags.extend(re.findall(
3196             # We only allow video|audio followed by a whitespace or '>'.
3197             # Allowing more characters may end up in significant slow down (see
3198             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3199             # http://www.porntrex.com/maps/videositemap.xml).
3200             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3201         for media_tag, _, media_type, media_content in media_tags:
3202             media_info = {
3203                 'formats': [],
3204                 'subtitles': {},
3205             }
3206             media_attributes = extract_attributes(media_tag)
3207             src = strip_or_none(media_attributes.get('src'))
3208             if src:
3209                 f = parse_content_type(media_attributes.get('type'))
3210                 _, formats = _media_formats(src, media_type, f)
3211                 media_info['formats'].extend(formats)
3212             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3213             if media_content:
3214                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3215                     s_attr = extract_attributes(source_tag)
3216                     # data-video-src and data-src are non standard but seen
3217                     # several times in the wild
3218                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3219                     if not src:
3220                         continue
3221                     f = parse_content_type(s_attr.get('type'))
3222                     is_plain_url, formats = _media_formats(src, media_type, f)
3223                     if is_plain_url:
3224                         # width, height, res, label and title attributes are
3225                         # all not standard but seen several times in the wild
3226                         labels = [
3227                             s_attr.get(lbl)
3228                             for lbl in ('label', 'title')
3229                             if str_or_none(s_attr.get(lbl))
3230                         ]
3231                         width = int_or_none(s_attr.get('width'))
3232                         height = (int_or_none(s_attr.get('height'))
3233                                   or int_or_none(s_attr.get('res')))
3234                         if not width or not height:
3235                             for lbl in labels:
3236                                 resolution = parse_resolution(lbl)
3237                                 if not resolution:
3238                                     continue
3239                                 width = width or resolution.get('width')
3240                                 height = height or resolution.get('height')
3241                         for lbl in labels:
3242                             tbr = parse_bitrate(lbl)
3243                             if tbr:
3244                                 break
3245                         else:
3246                             tbr = None
3247                         f.update({
3248                             'width': width,
3249                             'height': height,
3250                             'tbr': tbr,
3251                             'format_id': s_attr.get('label') or s_attr.get('title'),
3252                         })
3253                         f.update(formats[0])
3254                         media_info['formats'].append(f)
3255                     else:
3256                         media_info['formats'].extend(formats)
3257                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3258                     track_attributes = extract_attributes(track_tag)
3259                     kind = track_attributes.get('kind')
3260                     if not kind or kind in ('subtitles', 'captions'):
3261                         src = strip_or_none(track_attributes.get('src'))
3262                         if not src:
3263                             continue
3264                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3265                         media_info['subtitles'].setdefault(lang, []).append({
3266                             'url': absolute_url(src),
3267                         })
3268             for f in media_info['formats']:
3269                 f.setdefault('http_headers', {})['Referer'] = base_url
3270             if media_info['formats'] or media_info['subtitles']:
3271                 entries.append(media_info)
3272         return entries
3273
3274     def _extract_akamai_formats(self, *args, **kwargs):
3275         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3276         if subs:
3277             self._report_ignoring_subs('akamai')
3278         return fmts
3279
3280     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3281         signed = 'hdnea=' in manifest_url
3282         if not signed:
3283             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3284             manifest_url = re.sub(
3285                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3286                 '', manifest_url).strip('?')
3287
3288         formats = []
3289         subtitles = {}
3290
3291         hdcore_sign = 'hdcore=3.7.0'
3292         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3293         hds_host = hosts.get('hds')
3294         if hds_host:
3295             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3296         if 'hdcore=' not in f4m_url:
3297             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3298         f4m_formats = self._extract_f4m_formats(
3299             f4m_url, video_id, f4m_id='hds', fatal=False)
3300         for entry in f4m_formats:
3301             entry.update({'extra_param_to_segment_url': hdcore_sign})
3302         formats.extend(f4m_formats)
3303
3304         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3305         hls_host = hosts.get('hls')
3306         if hls_host:
3307             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3308         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3309             m3u8_url, video_id, 'mp4', 'm3u8_native',
3310             m3u8_id='hls', fatal=False)
3311         formats.extend(m3u8_formats)
3312         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3313
3314         http_host = hosts.get('http')
3315         if http_host and m3u8_formats and not signed:
3316             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3317             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3318             qualities_length = len(qualities)
3319             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3320                 i = 0
3321                 for f in m3u8_formats:
3322                     if f['vcodec'] != 'none':
3323                         for protocol in ('http', 'https'):
3324                             http_f = f.copy()
3325                             del http_f['manifest_url']
3326                             http_url = re.sub(
3327                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3328                             http_f.update({
3329                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3330                                 'url': http_url,
3331                                 'protocol': protocol,
3332                             })
3333                             formats.append(http_f)
3334                         i += 1
3335
3336         return formats, subtitles
3337
3338     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3339         query = compat_urlparse.urlparse(url).query
3340         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3341         mobj = re.search(
3342             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3343         url_base = mobj.group('url')
3344         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3345         formats = []
3346
3347         def manifest_url(manifest):
3348             m_url = f'{http_base_url}/{manifest}'
3349             if query:
3350                 m_url += '?%s' % query
3351             return m_url
3352
3353         if 'm3u8' not in skip_protocols:
3354             formats.extend(self._extract_m3u8_formats(
3355                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3356                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3357         if 'f4m' not in skip_protocols:
3358             formats.extend(self._extract_f4m_formats(
3359                 manifest_url('manifest.f4m'),
3360                 video_id, f4m_id='hds', fatal=False))
3361         if 'dash' not in skip_protocols:
3362             formats.extend(self._extract_mpd_formats(
3363                 manifest_url('manifest.mpd'),
3364                 video_id, mpd_id='dash', fatal=False))
3365         if re.search(r'(?:/smil:|\.smil)', url_base):
3366             if 'smil' not in skip_protocols:
3367                 rtmp_formats = self._extract_smil_formats(
3368                     manifest_url('jwplayer.smil'),
3369                     video_id, fatal=False)
3370                 for rtmp_format in rtmp_formats:
3371                     rtsp_format = rtmp_format.copy()
3372                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3373                     del rtsp_format['play_path']
3374                     del rtsp_format['ext']
3375                     rtsp_format.update({
3376                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3377                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3378                         'protocol': 'rtsp',
3379                     })
3380                     formats.extend([rtmp_format, rtsp_format])
3381         else:
3382             for protocol in ('rtmp', 'rtsp'):
3383                 if protocol not in skip_protocols:
3384                     formats.append({
3385                         'url': f'{protocol}:{url_base}',
3386                         'format_id': protocol,
3387                         'protocol': protocol,
3388                     })
3389         return formats
3390
3391     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3392         mobj = re.search(
3393             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3394             webpage)
3395         if mobj:
3396             try:
3397                 jwplayer_data = self._parse_json(mobj.group('options'),
3398                                                  video_id=video_id,
3399                                                  transform_source=transform_source)
3400             except ExtractorError:
3401                 pass
3402             else:
3403                 if isinstance(jwplayer_data, dict):
3404                     return jwplayer_data
3405
3406     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3407         jwplayer_data = self._find_jwplayer_data(
3408             webpage, video_id, transform_source=js_to_json)
3409         return self._parse_jwplayer_data(
3410             jwplayer_data, video_id, *args, **kwargs)
3411
3412     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3413                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3414         # JWPlayer backward compatibility: flattened playlists
3415         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3416         if 'playlist' not in jwplayer_data:
3417             jwplayer_data = {'playlist': [jwplayer_data]}
3418
3419         entries = []
3420
3421         # JWPlayer backward compatibility: single playlist item
3422         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3423         if not isinstance(jwplayer_data['playlist'], list):
3424             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3425
3426         for video_data in jwplayer_data['playlist']:
3427             # JWPlayer backward compatibility: flattened sources
3428             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3429             if 'sources' not in video_data:
3430                 video_data['sources'] = [video_data]
3431
3432             this_video_id = video_id or video_data['mediaid']
3433
3434             formats = self._parse_jwplayer_formats(
3435                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3436                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3437
3438             subtitles = {}
3439             tracks = video_data.get('tracks')
3440             if tracks and isinstance(tracks, list):
3441                 for track in tracks:
3442                     if not isinstance(track, dict):
3443                         continue
3444                     track_kind = track.get('kind')
3445                     if not track_kind or not isinstance(track_kind, compat_str):
3446                         continue
3447                     if track_kind.lower() not in ('captions', 'subtitles'):
3448                         continue
3449                     track_url = urljoin(base_url, track.get('file'))
3450                     if not track_url:
3451                         continue
3452                     subtitles.setdefault(track.get('label') or 'en', []).append({
3453                         'url': self._proto_relative_url(track_url)
3454                     })
3455
3456             entry = {
3457                 'id': this_video_id,
3458                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3459                 'description': clean_html(video_data.get('description')),
3460                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3461                 'timestamp': int_or_none(video_data.get('pubdate')),
3462                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3463                 'subtitles': subtitles,
3464             }
3465             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3466             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3467                 entry.update({
3468                     '_type': 'url_transparent',
3469                     'url': formats[0]['url'],
3470                 })
3471             else:
3472                 self._sort_formats(formats)
3473                 entry['formats'] = formats
3474             entries.append(entry)
3475         if len(entries) == 1:
3476             return entries[0]
3477         else:
3478             return self.playlist_result(entries)
3479
3480     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3481                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3482         urls = []
3483         formats = []
3484         for source in jwplayer_sources_data:
3485             if not isinstance(source, dict):
3486                 continue
3487             source_url = urljoin(
3488                 base_url, self._proto_relative_url(source.get('file')))
3489             if not source_url or source_url in urls:
3490                 continue
3491             urls.append(source_url)
3492             source_type = source.get('type') or ''
3493             ext = mimetype2ext(source_type) or determine_ext(source_url)
3494             if source_type == 'hls' or ext == 'm3u8':
3495                 formats.extend(self._extract_m3u8_formats(
3496                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3497                     m3u8_id=m3u8_id, fatal=False))
3498             elif source_type == 'dash' or ext == 'mpd':
3499                 formats.extend(self._extract_mpd_formats(
3500                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3501             elif ext == 'smil':
3502                 formats.extend(self._extract_smil_formats(
3503                     source_url, video_id, fatal=False))
3504             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3505             elif source_type.startswith('audio') or ext in (
3506                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3507                 formats.append({
3508                     'url': source_url,
3509                     'vcodec': 'none',
3510                     'ext': ext,
3511                 })
3512             else:
3513                 height = int_or_none(source.get('height'))
3514                 if height is None:
3515                     # Often no height is provided but there is a label in
3516                     # format like "1080p", "720p SD", or 1080.
3517                     height = int_or_none(self._search_regex(
3518                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3519                         'height', default=None))
3520                 a_format = {
3521                     'url': source_url,
3522                     'width': int_or_none(source.get('width')),
3523                     'height': height,
3524                     'tbr': int_or_none(source.get('bitrate')),
3525                     'ext': ext,
3526                 }
3527                 if source_url.startswith('rtmp'):
3528                     a_format['ext'] = 'flv'
3529                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3530                     # of jwplayer.flash.swf
3531                     rtmp_url_parts = re.split(
3532                         r'((?:mp4|mp3|flv):)', source_url, 1)
3533                     if len(rtmp_url_parts) == 3:
3534                         rtmp_url, prefix, play_path = rtmp_url_parts
3535                         a_format.update({
3536                             'url': rtmp_url,
3537                             'play_path': prefix + play_path,
3538                         })
3539                     if rtmp_params:
3540                         a_format.update(rtmp_params)
3541                 formats.append(a_format)
3542         return formats
3543
3544     def _live_title(self, name):
3545         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3546         return name
3547
3548     def _int(self, v, name, fatal=False, **kwargs):
3549         res = int_or_none(v, **kwargs)
3550         if res is None:
3551             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3552             if fatal:
3553                 raise ExtractorError(msg)
3554             else:
3555                 self.report_warning(msg)
3556         return res
3557
3558     def _float(self, v, name, fatal=False, **kwargs):
3559         res = float_or_none(v, **kwargs)
3560         if res is None:
3561             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3562             if fatal:
3563                 raise ExtractorError(msg)
3564             else:
3565                 self.report_warning(msg)
3566         return res
3567
3568     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3569                     path='/', secure=False, discard=False, rest={}, **kwargs):
3570         cookie = compat_cookiejar_Cookie(
3571             0, name, value, port, port is not None, domain, True,
3572             domain.startswith('.'), path, True, secure, expire_time,
3573             discard, None, None, rest)
3574         self._downloader.cookiejar.set_cookie(cookie)
3575
3576     def _get_cookies(self, url):
3577         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3578         return compat_cookies_SimpleCookie(self._downloader._calc_cookies(url))
3579
3580     def _apply_first_set_cookie_header(self, url_handle, cookie):
3581         """
3582         Apply first Set-Cookie header instead of the last. Experimental.
3583
3584         Some sites (e.g. [1-3]) may serve two cookies under the same name
3585         in Set-Cookie header and expect the first (old) one to be set rather
3586         than second (new). However, as of RFC6265 the newer one cookie
3587         should be set into cookie store what actually happens.
3588         We will workaround this issue by resetting the cookie to
3589         the first one manually.
3590         1. https://new.vk.com/
3591         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3592         3. https://learning.oreilly.com/
3593         """
3594         for header, cookies in url_handle.headers.items():
3595             if header.lower() != 'set-cookie':
3596                 continue
3597             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3598             cookie_value = re.search(
3599                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3600             if cookie_value:
3601                 value, domain = cookie_value.groups()
3602                 self._set_cookie(domain, cookie, value)
3603                 break
3604
3605     @classmethod
3606     def get_testcases(cls, include_onlymatching=False):
3607         t = getattr(cls, '_TEST', None)
3608         if t:
3609             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3610             tests = [t]
3611         else:
3612             tests = getattr(cls, '_TESTS', [])
3613         for t in tests:
3614             if not include_onlymatching and t.get('only_matching', False):
3615                 continue
3616             t['name'] = cls.ie_key()
3617             yield t
3618
3619     @classproperty
3620     def age_limit(cls):
3621         """Get age limit from the testcases"""
3622         return max(traverse_obj(
3623             tuple(cls.get_testcases(include_onlymatching=False)),
3624             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3625
3626     @classmethod
3627     def is_suitable(cls, age_limit):
3628         """Test whether the extractor is generally suitable for the given age limit"""
3629         return not age_restricted(cls.age_limit, age_limit)
3630
3631     @classmethod
3632     def description(cls, *, markdown=True, search_examples=None):
3633         """Description of the extractor"""
3634         desc = ''
3635         if cls._NETRC_MACHINE:
3636             if markdown:
3637                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3638             else:
3639                 desc += f' [{cls._NETRC_MACHINE}]'
3640         if cls.IE_DESC is False:
3641             desc += ' [HIDDEN]'
3642         elif cls.IE_DESC:
3643             desc += f' {cls.IE_DESC}'
3644         if cls.SEARCH_KEY:
3645             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3646             if search_examples:
3647                 _COUNTS = ('', '5', '10', 'all')
3648                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3649         if not cls.working():
3650             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3651
3652         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3653         return f'{name}:{desc}' if desc else name
3654
3655     def extract_subtitles(self, *args, **kwargs):
3656         if (self.get_param('writesubtitles', False)
3657                 or self.get_param('listsubtitles')):
3658             return self._get_subtitles(*args, **kwargs)
3659         return {}
3660
3661     def _get_subtitles(self, *args, **kwargs):
3662         raise NotImplementedError('This method must be implemented by subclasses')
3663
3664     def extract_comments(self, *args, **kwargs):
3665         if not self.get_param('getcomments'):
3666             return None
3667         generator = self._get_comments(*args, **kwargs)
3668
3669         def extractor():
3670             comments = []
3671             interrupted = True
3672             try:
3673                 while True:
3674                     comments.append(next(generator))
3675             except StopIteration:
3676                 interrupted = False
3677             except KeyboardInterrupt:
3678                 self.to_screen('Interrupted by user')
3679             except Exception as e:
3680                 if self.get_param('ignoreerrors') is not True:
3681                     raise
3682                 self._downloader.report_error(e)
3683             comment_count = len(comments)
3684             self.to_screen(f'Extracted {comment_count} comments')
3685             return {
3686                 'comments': comments,
3687                 'comment_count': None if interrupted else comment_count
3688             }
3689         return extractor
3690
3691     def _get_comments(self, *args, **kwargs):
3692         raise NotImplementedError('This method must be implemented by subclasses')
3693
3694     @staticmethod
3695     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3696         """ Merge subtitle items for one language. Items with duplicated URLs/data
3697         will be dropped. """
3698         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3699         ret = list(subtitle_list1)
3700         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3701         return ret
3702
3703     @classmethod
3704     def _merge_subtitles(cls, *dicts, target=None):
3705         """ Merge subtitle dictionaries, language by language. """
3706         if target is None:
3707             target = {}
3708         for d in dicts:
3709             for lang, subs in d.items():
3710                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3711         return target
3712
3713     def extract_automatic_captions(self, *args, **kwargs):
3714         if (self.get_param('writeautomaticsub', False)
3715                 or self.get_param('listsubtitles')):
3716             return self._get_automatic_captions(*args, **kwargs)
3717         return {}
3718
3719     def _get_automatic_captions(self, *args, **kwargs):
3720         raise NotImplementedError('This method must be implemented by subclasses')
3721
3722     @functools.cached_property
3723     def _cookies_passed(self):
3724         """Whether cookies have been passed to YoutubeDL"""
3725         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3726
3727     def mark_watched(self, *args, **kwargs):
3728         if not self.get_param('mark_watched', False):
3729             return
3730         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3731             self._mark_watched(*args, **kwargs)
3732
3733     def _mark_watched(self, *args, **kwargs):
3734         raise NotImplementedError('This method must be implemented by subclasses')
3735
3736     def geo_verification_headers(self):
3737         headers = {}
3738         geo_verification_proxy = self.get_param('geo_verification_proxy')
3739         if geo_verification_proxy:
3740             headers['Ytdl-request-proxy'] = geo_verification_proxy
3741         return headers
3742
3743     def _generic_id(self, url):
3744         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3745
3746     def _generic_title(self, url):
3747         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3748
3749     @staticmethod
3750     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3751         all_known = all(map(
3752             lambda x: x is not None,
3753             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3754         return (
3755             'private' if is_private
3756             else 'premium_only' if needs_premium
3757             else 'subscriber_only' if needs_subscription
3758             else 'needs_auth' if needs_auth
3759             else 'unlisted' if is_unlisted
3760             else 'public' if all_known
3761             else None)
3762
3763     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3764         '''
3765         @returns            A list of values for the extractor argument given by "key"
3766                             or "default" if no such key is present
3767         @param default      The default value to return when the key is not present (default: [])
3768         @param casesense    When false, the values are converted to lower case
3769         '''
3770         val = traverse_obj(
3771             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3772         if val is None:
3773             return [] if default is NO_DEFAULT else default
3774         return list(val) if casesense else [x.lower() for x in val]
3775
3776     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3777         if not playlist_id or not video_id:
3778             return not video_id
3779
3780         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3781         if no_playlist is not None:
3782             return not no_playlist
3783
3784         video_id = '' if video_id is True else f' {video_id}'
3785         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3786         if self.get_param('noplaylist'):
3787             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3788             return False
3789         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3790         return True
3791
3792
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    where an empty prefix requests a single result and "all" requests _MAX_RESULTS.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Regex for search "URLs": the search key, an optional count prefix and the query"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the search "URL" into prefix and query and fetch the requested number of results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        if prefix == '':
            return self._get_n_results(query, 1)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError(f'invalid download number {requested} for query "{query}"')
        if requested > self._MAX_RESULTS:
            # Cap the request at what the extractor declares it can return
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice() takes None, not infinity, to mean "no upper bound"
        limit = None if n == float('inf') else n
        entries = itertools.islice(results, 0, limit)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        """The class's _SEARCH_KEY"""
        return cls._SEARCH_KEY