yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import hashlib
   4 import itertools
   5 import json
   6 import math
   7 import netrc
   8 import os
   9 import random
  10 import sys
  11 import time
  12 import xml.etree.ElementTree
  13
  14 from ..compat import functools, re
  15 from ..compat import (
  16     compat_cookiejar_Cookie,
  17     compat_cookies_SimpleCookie,
  18     compat_etree_fromstring,
  19     compat_expanduser,
  20     compat_getpass,
  21     compat_http_client,
  22     compat_os_name,
  23     compat_str,
  24     compat_urllib_error,
  25     compat_urllib_parse_unquote,
  26     compat_urllib_parse_urlencode,
  27     compat_urllib_request,
  28     compat_urlparse,
  29 )
  30 from ..downloader import FileDownloader
  31 from ..downloader.f4m import get_base_url, remove_encrypted_media
  32 from ..utils import (
  33     JSON_LD_RE,
  34     NO_DEFAULT,
  35     ExtractorError,
  36     GeoRestrictedError,
  37     GeoUtils,
  38     RegexNotFoundError,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     classproperty,
  44     clean_html,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     filter_dict,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     int_or_none,
  56     join_nonempty,
  57     js_to_json,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     sanitize_filename,
  68     sanitized_Request,
  69     str_or_none,
  70     str_to_int,
  71     strip_or_none,
  72     traverse_obj,
  73     try_get,
  74     unescapeHTML,
  75     unified_strdate,
  76     unified_timestamp,
  77     update_Request,
  78     update_url_query,
  79     url_basename,
  80     url_or_none,
  81     urljoin,
  82     variadic,
  83     xpath_element,
  84     xpath_text,
  85     xpath_with_ns,
  86 )
  87
  88
  89 class InfoExtractor:
  90     """Information Extractor class.
  91
  92     Information extractors are the classes that, given a URL, extract
  93     information about the video (or videos) the URL refers to. This
  94     information includes the real video URL, the video title, author and
  95     others. The information is stored in a dictionary which is then
  96     passed to the YoutubeDL. The YoutubeDL processes this
  97     information possibly downloading the video to the file system, among
  98     other possible outcomes.
  99
 100     The type field determines the type of the result.
 101     By far the most common value (and the default if _type is missing) is
 102     "video", which indicates a single video.
 103
 104     For a video, the dictionaries must include the following fields:
 105
 106     id:             Video identifier.
 107     title:          Video title, unescaped. Set to an empty string if video has
 108                     no title as opposed to "None" which signifies that the
 109                     extractor failed to obtain a title
 110
 111     Additionally, it must contain either a formats entry or a url one:
 112
 113     formats:        A list of dictionaries for each format available, ordered
 114                     from worst to best quality.
 115
 116                     Potential fields:
 117                     * url        The mandatory URL representing the media:
 118                                    for plain file media - HTTP URL of this file,
 119                                    for RTMP - RTMP URL,
 120                                    for HLS - URL of the M3U8 media playlist,
 121                                    for HDS - URL of the F4M manifest,
 122                                    for DASH
 123                                      - HTTP URL to plain file media (in case of
 124                                        unfragmented media)
 125                                      - URL of the MPD manifest or base URL
 126                                        representing the media if MPD manifest
 127                                        is parsed from a string (in case of
 128                                        fragmented media)
 129                                    for MSS - URL of the ISM manifest.
 130                     * manifest_url
 131                                  The URL of the manifest file in case of
 132                                  fragmented media:
 133                                    for HLS - URL of the M3U8 master playlist,
 134                                    for HDS - URL of the F4M manifest,
 135                                    for DASH - URL of the MPD manifest,
 136                                    for MSS - URL of the ISM manifest.
 137                     * manifest_stream_number  (For internal use only)
 138                                  The index of the stream in the manifest file
 139                     * ext        Will be calculated from URL if missing
 140                     * format     A human-readable description of the format
 141                                  ("mp4 container with h264/opus").
  142                                  Calculated from the format_id, width, height,
 143                                  and format_note fields if missing.
 144                     * format_id  A short description of the format
 145                                  ("mp4_h264_opus" or "19").
 146                                 Technically optional, but strongly recommended.
 147                     * format_note Additional info about the format
 148                                  ("3D" or "DASH video")
 149                     * width      Width of the video, if known
 150                     * height     Height of the video, if known
 151                     * resolution Textual description of width and height
 152                     * dynamic_range The dynamic range of the video. One of:
  153                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 154                     * tbr        Average bitrate of audio and video in KBit/s
 155                     * abr        Average audio bitrate in KBit/s
 156                     * acodec     Name of the audio codec in use
 157                     * asr        Audio sampling rate in Hertz
 158                     * vbr        Average video bitrate in KBit/s
 159                     * fps        Frame rate
 160                     * vcodec     Name of the video codec in use
 161                     * container  Name of the container format
 162                     * filesize   The number of bytes, if known in advance
 163                     * filesize_approx  An estimate for the number of bytes
 164                     * player_url SWF Player URL (used for rtmpdump).
 165                     * protocol   The protocol that will be used for the actual
 166                                  download, lower-case. One of "http", "https" or
 167                                  one of the protocols defined in downloader.PROTOCOL_MAP
 168                     * fragment_base_url
 169                                  Base URL for fragments. Each fragment's path
 170                                  value (if present) will be relative to
 171                                  this URL.
 172                     * fragments  A list of fragments of a fragmented media.
 173                                  Each fragment entry must contain either an url
 174                                  or a path. If an url is present it should be
 175                                  considered by a client. Otherwise both path and
 176                                  fragment_base_url must be present. Here is
 177                                  the list of all potential fields:
 178                                  * "url" - fragment's URL
 179                                  * "path" - fragment's path relative to
 180                                             fragment_base_url
 181                                  * "duration" (optional, int or float)
 182                                  * "filesize" (optional, int)
 183                     * is_from_start  Is a live format that can be downloaded
 184                                 from the start. Boolean
 185                     * preference Order number of this format. If this field is
 186                                  present and not None, the formats get sorted
 187                                  by this field, regardless of all other values.
 188                                  -1 for default (order by other properties),
 189                                  -2 or smaller for less than default.
 190                                  < -1000 to hide the format (if there is
 191                                     another one which is strictly better)
 192                     * language   Language code, e.g. "de" or "en-US".
 193                     * language_preference  Is this in the language mentioned in
 194                                  the URL?
 195                                  10 if it's what the URL is about,
 196                                  -1 for default (don't know),
 197                                  -10 otherwise, other values reserved for now.
 198                     * quality    Order number of the video quality of this
 199                                  format, irrespective of the file format.
 200                                  -1 for default (order by other properties),
 201                                  -2 or smaller for less than default.
 202                     * source_preference  Order number for this video source
 203                                   (quality takes higher priority)
 204                                  -1 for default (order by other properties),
 205                                  -2 or smaller for less than default.
 206                     * http_headers  A dictionary of additional HTTP headers
 207                                  to add to the request.
 208                     * stretched_ratio  If given and not 1, indicates that the
 209                                  video's pixels are not square.
 210                                  width : height ratio as float.
 211                     * no_resume  The server does not support resuming the
 212                                  (HTTP or RTMP) download. Boolean.
 213                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 214                     * downloader_options  A dictionary of downloader options
 215                                  (For internal use only)
 216                                  * http_chunk_size Chunk size for HTTP downloads
 217                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 218                     RTMP formats can also have the additional fields: page_url,
 219                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 220                     rtmp_protocol, rtmp_real_time
 221
 222     url:            Final video URL.
 223     ext:            Video filename extension.
 224     format:         The video format, defaults to ext (used for --get-format)
 225     player_url:     SWF Player URL (used for rtmpdump).
 226
 227     The following fields are optional:
 228
 229     direct:         True if a direct video file was given (must only be set by GenericIE)
 230     alt_title:      A secondary title of the video.
 231     display_id      An alternative identifier for the video, not necessarily
 232                     unique, but available before title. Typically, id is
 233                     something like "4234987", title "Dancing naked mole rats",
 234                     and display_id "dancing-naked-mole-rats"
 235     thumbnails:     A list of dictionaries, with the following entries:
 236                         * "id" (optional, string) - Thumbnail format ID
 237                         * "url"
 238                         * "preference" (optional, int) - quality of the image
 239                         * "width" (optional, int)
 240                         * "height" (optional, int)
 241                         * "resolution" (optional, string "{width}x{height}",
 242                                         deprecated)
 243                         * "filesize" (optional, int)
 244                         * "http_headers" (dict) - HTTP headers for the request
 245     thumbnail:      Full URL to a video thumbnail image.
 246     description:    Full video description.
 247     uploader:       Full name of the video uploader.
 248     license:        License name the video is licensed under.
 249     creator:        The creator of the video.
 250     timestamp:      UNIX timestamp of the moment the video was uploaded
 251     upload_date:    Video upload date in UTC (YYYYMMDD).
 252                     If not explicitly set, calculated from timestamp
 253     release_timestamp: UNIX timestamp of the moment the video was released.
 254                     If it is not clear whether to use timestamp or this, use the former
 255     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 256                     If not explicitly set, calculated from release_timestamp
 257     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 258     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 259                     If not explicitly set, calculated from modified_timestamp
 260     uploader_id:    Nickname or id of the video uploader.
 261     uploader_url:   Full URL to a personal webpage of the video uploader.
 262     channel:        Full name of the channel the video is uploaded on.
 263                     Note that channel fields may or may not repeat uploader
 264                     fields. This depends on a particular extractor.
 265     channel_id:     Id of the channel.
 266     channel_url:    Full URL to a channel webpage.
 267     channel_follower_count: Number of followers of the channel.
 268     location:       Physical location where the video was filmed.
 269     subtitles:      The available subtitles as a dictionary in the format
 270                     {tag: subformats}. "tag" is usually a language code, and
 271                     "subformats" is a list sorted from lower to higher
 272                     preference, each element is a dictionary with the "ext"
 273                     entry and one of:
 274                         * "data": The subtitles file contents
 275                         * "url": A URL pointing to the subtitles file
 276                     It can optionally also have:
 277                         * "name": Name or description of the subtitles
 278                         * "http_headers": A dictionary of additional HTTP headers
 279                                   to add to the request.
 280                     "ext" will be calculated from URL if missing
 281     automatic_captions: Like 'subtitles'; contains automatically generated
 282                     captions instead of normal subtitles
 283     duration:       Length of the video in seconds, as an integer or float.
 284     view_count:     How many users have watched the video on the platform.
 285     like_count:     Number of positive ratings of the video
 286     dislike_count:  Number of negative ratings of the video
 287     repost_count:   Number of reposts of the video
  288     average_rating: Average rating given by users, the scale used depends on the webpage
 289     comment_count:  Number of comments on the video
 290     comments:       A list of comments, each with one or more of the following
 291                     properties (all but one of text or html optional):
 292                         * "author" - human-readable name of the comment author
 293                         * "author_id" - user ID of the comment author
 294                         * "author_thumbnail" - The thumbnail of the comment author
 295                         * "id" - Comment ID
 296                         * "html" - Comment as HTML
 297                         * "text" - Plain text of the comment
 298                         * "timestamp" - UNIX timestamp of comment
 299                         * "parent" - ID of the comment this one is replying to.
 300                                      Set to "root" to indicate that this is a
 301                                      comment to the original video.
 302                         * "like_count" - Number of positive ratings of the comment
 303                         * "dislike_count" - Number of negative ratings of the comment
 304                         * "is_favorited" - Whether the comment is marked as
 305                                            favorite by the video uploader
 306                         * "author_is_uploader" - Whether the comment is made by
 307                                                  the video uploader
 308     age_limit:      Age restriction for the video, as an integer (years)
 309     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 310                     should allow to get the same result again. (It will be set
 311                     by YoutubeDL if it's missing)
 312     categories:     A list of categories that the video falls in, for example
 313                     ["Sports", "Berlin"]
 314     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 315     cast:           A list of the video cast
 316     is_live:        True, False, or None (=unknown). Whether this video is a
 317                     live stream that goes on instead of a fixed-length video.
 318     was_live:       True, False, or None (=unknown). Whether this video was
 319                     originally a live stream.
 320     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 321                     If absent, automatically set from is_live, was_live
 322     start_time:     Time in seconds where the reproduction should start, as
 323                     specified in the URL.
 324     end_time:       Time in seconds where the reproduction should end, as
 325                     specified in the URL.
 326     chapters:       A list of dictionaries, with the following entries:
 327                         * "start_time" - The start time of the chapter in seconds
 328                         * "end_time" - The end time of the chapter in seconds
 329                         * "title" (optional, string)
 330     playable_in_embed: Whether this video is allowed to play in embedded
 331                     players on other sites. Can be True (=always allowed),
 332                     False (=never allowed), None (=unknown), or a string
 333                     specifying the criteria for embedability (Eg: 'whitelist')
 334     availability:   Under what condition the video is available. One of
 335                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 336                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 337                     to set it
 338     __post_extractor: A function to be called just before the metadata is
 339                     written to either disk, logger or console. The function
 340                     must return a dict which will be added to the info_dict.
  341                     This is useful for additional information that is
 342                     time-consuming to extract. Note that the fields thus
 343                     extracted will not be available to output template and
 344                     match_filter. So, only "comments" and "comment_count" are
 345                     currently allowed to be extracted via this method.
 346
 347     The following fields should only be used when the video belongs to some logical
 348     chapter or section:
 349
 350     chapter:        Name or title of the chapter the video belongs to.
 351     chapter_number: Number of the chapter the video belongs to, as an integer.
 352     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 353
 354     The following fields should only be used when the video is an episode of some
 355     series, programme or podcast:
 356
 357     series:         Title of the series or programme the video episode belongs to.
 358     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 359     season:         Title of the season the video episode belongs to.
 360     season_number:  Number of the season the video episode belongs to, as an integer.
 361     season_id:      Id of the season the video episode belongs to, as a unicode string.
 362     episode:        Title of the video episode. Unlike mandatory video title field,
 363                     this field should denote the exact title of the video episode
 364                     without any kind of decoration.
 365     episode_number: Number of the video episode within a season, as an integer.
 366     episode_id:     Id of the video episode, as a unicode string.
 367
 368     The following fields should only be used when the media is a track or a part of
 369     a music album:
 370
 371     track:          Title of the track.
 372     track_number:   Number of the track within an album or a disc, as an integer.
 373     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 374                     as a unicode string.
 375     artist:         Artist(s) of the track.
 376     genre:          Genre(s) of the track.
 377     album:          Title of the album the track belongs to.
 378     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 379     album_artist:   List of all artists appeared on the album (e.g.
 380                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 381                     and compilations).
 382     disc_number:    Number of the disc or other physical medium the track belongs to,
 383                     as an integer.
 384     release_year:   Year (YYYY) when the album was released.
 385     composer:       Composer of the piece
 386
 387     Unless mentioned otherwise, the fields should be Unicode strings.
 388
 389     Unless mentioned otherwise, None is equivalent to absence of information.
 390
 391
 392     _type "playlist" indicates multiple videos.
 393     There must be a key "entries", which is a list, an iterable, or a PagedList
 394     object, each element of which is a valid dictionary by this specification.
 395
  396     Additionally, playlists can have "id", "title", and any other relevant
 397     attributes with the same semantics as videos (see above).
 398
 399     It can also have the following optional fields:
 400
 401     playlist_count: The total number of videos in a playlist. If not given,
 402                     YoutubeDL tries to calculate it from "entries"
 403
 404
 405     _type "multi_video" indicates that there are multiple videos that
  406     form a single show, for example multiple acts of an opera or TV episode.
 407     It must have an entries key like a playlist and contain all the keys
 408     required for a video at the same time.
 409
 410
 411     _type "url" indicates that the video must be extracted from another
 412     location, possibly by a different extractor. Its only required key is:
 413     "url" - the next URL to extract.
 414     The key "ie_key" can be set to the class name (minus the trailing "IE",
 415     e.g. "Youtube") if the extractor class is known in advance.
 416     Additionally, the dictionary may have any properties of the resolved entity
 417     known in advance, for example "title" if the title of the referred video is
 418     known ahead of time.
 419
 420
 421     _type "url_transparent" entities have the same specification as "url", but
 422     indicate that the given additional information is more precise than the one
 423     associated with the resolved URL.
 424     This is useful when a site employs a video service that hosts the video and
 425     its technical metadata, but that video service does not embed a useful
 426     title, description etc.
 427
 428
 429     Subclasses of this should define a _VALID_URL regexp and, re-define the
 430     _real_extract() and (optionally) _real_initialize() methods.
 431     Probably, they should also be added to the list of extractors.
 432
 433     Subclasses may also override suitable() if necessary, but ensure the function
 434     signature is preserved and that this function imports everything it needs
 435     (except other extractors), so that lazy_extractors works correctly.
 436
 437     To support username + password (or netrc) login, the extractor must define a
 438     _NETRC_MACHINE and re-define _perform_login(username, password) and
 439     (optionally) _initialize_pre_login() methods. The _perform_login method will
 440     be called between _initialize_pre_login and _real_initialize if credentials
 441     are passed by the user. In cases where it is necessary to have the login
 442     process as part of the extraction rather than initialization, _perform_login
 443     can be left undefined.
 444
 445     _GEO_BYPASS attribute may be set to False in order to disable
 446     geo restriction bypass mechanisms for a particular extractor.
 447     Though it won't disable explicit geo restriction bypass based on
 448     country code provided with geo_bypass_country.
 449
 450     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 451     countries for this extractor. One of these countries will be used by
 452     geo restriction bypass mechanism right away in order to bypass
 453     geo restriction, of course, if the mechanism is not disabled.
 454
 455     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 456     IP blocks in CIDR notation for this extractor. One of these IP blocks
 457     will be used by geo restriction bypass mechanism similarly
 458     to _GEO_COUNTRIES.
 459
 460     The _WORKING attribute should be set to False for broken IEs
 461     in order to warn the users and skip the tests.
 462     """
 463
    # Class-level defaults; instances and subclasses override them as needed.

    # Set to True at the end of initialize() so the pre-login / login /
    # _real_initialize sequence is not repeated for the same instance
    _ready = False
    # Used by _initialize_geo_bypass() for debug output; __init__ hands the
    # downloader to set_downloader() (presumably what assigns this - confirm)
    _downloader = None
    # Fake IP picked by _initialize_geo_bypass(); while it is unset (falsy),
    # the geo bypass initialization is attempted
    _x_forwarded_for_ip = None
    # See the class docstring for _GEO_BYPASS, _GEO_COUNTRIES, _GEO_IP_BLOCKS
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Returned by working(); set to False for broken IEs (see class docstring)
    _WORKING = True
    # A truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    # initialize() skips its "login not supported" warning when either IE_DESC
    # or _NETRC_MACHINE is False. NOTE(review): presumably a human-readable
    # description of the extractor - confirm against its users
    IE_DESC = None
    # NOTE(review): not referenced in the visible part of the file; presumably
    # used by search extractors - confirm
    SEARCH_KEY = None
 474
 475     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 476         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 477         return {
 478             None: '',
 479             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 480             'password': f'Use {password_hint}',
 481             'cookies': (
 482                 'Use --cookies-from-browser or --cookies for the authentication. '
 483                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 484         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 485
 486     def __init__(self, downloader=None):
 487         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 488         If a downloader is not passed during initialization,
 489         it must be set using "set_downloader()" before "extract()" is called"""
 490         self._ready = False
 491         self._x_forwarded_for_ip = None
 492         self._printed_messages = set()
 493         self.set_downloader(downloader)
 494
 495     @classmethod
 496     def _match_valid_url(cls, url):
 497         # This does not use has/getattr intentionally - we want to know whether
 498         # we have cached the regexp for *this* class, whereas getattr would also
 499         # match the superclass
 500         if '_VALID_URL_RE' not in cls.__dict__:
 501             if '_VALID_URL' not in cls.__dict__:
 502                 cls._VALID_URL = cls._make_valid_url()
 503             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 504         return cls._VALID_URL_RE.match(url)
 505
 506     @classmethod
 507     def suitable(cls, url):
 508         """Receives a URL and returns True if suitable for this IE."""
 509         # This function must import everything it needs (except other extractors),
 510         # so that lazy_extractors works correctly
 511         return cls._match_valid_url(url) is not None
 512
 513     @classmethod
 514     def _match_id(cls, url):
 515         return cls._match_valid_url(url).group('id')
 516
 517     @classmethod
 518     def get_temp_id(cls, url):
 519         try:
 520             return cls._match_id(url)
 521         except (IndexError, AttributeError):
 522             return None
 523
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class's _WORKING flag, which defaults to True and is
        meant to be set to False on broken extractors.
        """
        return cls._WORKING
 528
 529     @classmethod
 530     def supports_login(cls):
 531         return bool(cls._NETRC_MACHINE)
 532
 533     def initialize(self):
 534         """Initializes an instance (authentication, etc)."""
 535         self._printed_messages = set()
 536         self._initialize_geo_bypass({
 537             'countries': self._GEO_COUNTRIES,
 538             'ip_blocks': self._GEO_IP_BLOCKS,
 539         })
 540         if not self._ready:
 541             self._initialize_pre_login()
 542             if self.supports_login():
 543                 username, password = self._get_login_info()
 544                 if username:
 545                     self._perform_login(username, password)
 546             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 547                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 548             self._real_initialize()
 549             self._ready = True
 550
 551     def _initialize_geo_bypass(self, geo_bypass_context):
 552         """
 553         Initialize geo restriction bypass mechanism.
 554
 555         This method is used to initialize geo bypass mechanism based on faking
 556         X-Forwarded-For HTTP header. A random country from provided country list
 557         is selected and a random IP belonging to this country is generated. This
 558         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 559         HTTP requests.
 560
 561         This method will be used for initial geo bypass mechanism initialization
 562         during the instance initialization with _GEO_COUNTRIES and
 563         _GEO_IP_BLOCKS.
 564
 565         You may also manually call it from extractor's code if geo bypass
 566         information is not available beforehand (e.g. obtained during
 567         extraction) or due to some other reason. In this case you should pass
 568         this information in geo bypass context passed as first argument. It may
 569         contain following fields:
 570
 571         countries:  List of geo unrestricted countries (similar
 572                     to _GEO_COUNTRIES)
 573         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 574                     (similar to _GEO_IP_BLOCKS)
 575
 576         """
 577         if not self._x_forwarded_for_ip:
 578
 579             # Geo bypass mechanism is explicitly disabled by user
 580             if not self.get_param('geo_bypass', True):
 581                 return
 582
 583             if not geo_bypass_context:
 584                 geo_bypass_context = {}
 585
 586             # Backward compatibility: previously _initialize_geo_bypass
 587             # expected a list of countries, some 3rd party code may still use
 588             # it this way
 589             if isinstance(geo_bypass_context, (list, tuple)):
 590                 geo_bypass_context = {
 591                     'countries': geo_bypass_context,
 592                 }
 593
 594             # The whole point of geo bypass mechanism is to fake IP
 595             # as X-Forwarded-For HTTP header based on some IP block or
 596             # country code.
 597
 598             # Path 1: bypassing based on IP block in CIDR notation
 599
 600             # Explicit IP block specified by user, use it right away
 601             # regardless of whether extractor is geo bypassable or not
 602             ip_block = self.get_param('geo_bypass_ip_block', None)
 603
 604             # Otherwise use random IP block from geo bypass context but only
 605             # if extractor is known as geo bypassable
 606             if not ip_block:
 607                 ip_blocks = geo_bypass_context.get('ip_blocks')
 608                 if self._GEO_BYPASS and ip_blocks:
 609                     ip_block = random.choice(ip_blocks)
 610
 611             if ip_block:
 612                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 613                 self._downloader.write_debug(
 614                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 615                 return
 616
 617             # Path 2: bypassing based on country code
 618
 619             # Explicit country code specified by user, use it right away
 620             # regardless of whether extractor is geo bypassable or not
 621             country = self.get_param('geo_bypass_country', None)
 622
 623             # Otherwise use random country code from geo bypass context but
 624             # only if extractor is known as geo bypassable
 625             if not country:
 626                 countries = geo_bypass_context.get('countries')
 627                 if self._GEO_BYPASS and countries:
 628                     country = random.choice(countries)
 629
 630             if country:
 631                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 632                 self._downloader.write_debug(
 633                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 634
    def extract(self, url):
        """
        Run the extractor on url and return its result.

        Returns whatever _real_extract returns (None is passed through
        unchanged). When a fake X-Forwarded-For IP is in use it is recorded in
        the result under '__x_forwarded_for_ip', and the 'live_chat' subtitles
        are dropped when the 'no-live-chat' compat option is set.

        On GeoRestrictedError the extraction is retried once if a fake IP
        could be selected. Other ExtractorErrors are re-raised as the same
        type with video id, extractor name and traceback filled in;
        IncompleteRead, KeyError and StopIteration are wrapped in
        ExtractorError. UnsupportedError is propagated untouched.
        """
        try:
            # At most two attempts: the second one only happens after a geo
            # restriction error for which a fake IP was successfully chosen
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            # Must come before the ExtractorError handler so that it is not
            # re-wrapped below
            raise
        except ExtractorError as e:
            # Rebuild the error as the same type, filling in the fields the
            # raising code may have left empty
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            # Only some error types carry a country list
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 673
 674     def __maybe_fake_ip_and_retry(self, countries):
 675         if (not self.get_param('geo_bypass_country', None)
 676                 and self._GEO_BYPASS
 677                 and self.get_param('geo_bypass', True)
 678                 and not self._x_forwarded_for_ip
 679                 and countries):
 680             country_code = random.choice(countries)
 681             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 682             if self._x_forwarded_for_ip:
 683                 self.report_warning(
 684                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 685                     % (self._x_forwarded_for_ip, country_code.upper()))
 686                 return True
 687         return False
 688
 689     def set_downloader(self, downloader):
 690         """Sets a YoutubeDL instance as the downloader for this IE."""
 691         self._downloader = downloader
 692
 693     def _initialize_pre_login(self):
 694         """ Intialization before login. Redefine in subclasses."""
 695         pass
 696
 697     def _perform_login(self, username, password):
 698         """ Login with username and password. Redefine in subclasses."""
 699         pass
 700
 701     def _real_initialize(self):
 702         """Real initialization process. Redefine in subclasses."""
 703         pass
 704
 705     def _real_extract(self, url):
 706         """Real extraction process. Redefine in subclasses."""
 707         raise NotImplementedError('This method must be implemented by subclasses')
 708
 709     @classmethod
 710     def ie_key(cls):
 711         """A string for getting the InfoExtractor with get_info_extractor"""
 712         return cls.__name__[:-2]
 713
 714     @classproperty
 715     def IE_NAME(cls):
 716         return cls.__name__[:-2]
 717
 718     @staticmethod
 719     def __can_accept_status_code(err, expected_status):
 720         assert isinstance(err, compat_urllib_error.HTTPError)
 721         if expected_status is None:
 722             return False
 723         elif callable(expected_status):
 724             return expected_status(err.code) is True
 725         else:
 726             return err.code in variadic(expected_status)
 727
 728     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 729         """
 730         Return the response handle.
 731
 732         See _download_webpage docstring for arguments specification.
 733         """
 734         if not self._downloader._first_webpage_request:
 735             sleep_interval = self.get_param('sleep_interval_requests') or 0
 736             if sleep_interval > 0:
 737                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 738                 time.sleep(sleep_interval)
 739         else:
 740             self._downloader._first_webpage_request = False
 741
 742         if note is None:
 743             self.report_download_webpage(video_id)
 744         elif note is not False:
 745             if video_id is None:
 746                 self.to_screen(str(note))
 747             else:
 748                 self.to_screen(f'{video_id}: {note}')
 749
 750         # Some sites check X-Forwarded-For HTTP header in order to figure out
 751         # the origin of the client behind proxy. This allows bypassing geo
 752         # restriction by faking this header's value to IP that belongs to some
 753         # geo unrestricted country. We will do so once we encounter any
 754         # geo restriction error.
 755         if self._x_forwarded_for_ip:
 756             if 'X-Forwarded-For' not in headers:
 757                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 758
 759         if isinstance(url_or_request, compat_urllib_request.Request):
 760             url_or_request = update_Request(
 761                 url_or_request, data=data, headers=headers, query=query)
 762         else:
 763             if query:
 764                 url_or_request = update_url_query(url_or_request, query)
 765             if data is not None or headers:
 766                 url_or_request = sanitized_Request(url_or_request, data, headers)
 767         try:
 768             return self._downloader.urlopen(url_or_request)
 769         except network_exceptions as err:
 770             if isinstance(err, compat_urllib_error.HTTPError):
 771                 if self.__can_accept_status_code(err, expected_status):
 772                     # Retain reference to error to prevent file object from
 773                     # being closed before it can be read. Works around the
 774                     # effects of <https://bugs.python.org/issue15002>
 775                     # introduced in Python 3.4.1.
 776                     err.fp._error = err
 777                     return err.fp
 778
 779             if errnote is False:
 780                 return False
 781             if errnote is None:
 782                 errnote = 'Unable to download webpage'
 783
 784             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 785             if fatal:
 786                 raise ExtractorError(errmsg, cause=err)
 787             else:
 788                 self.report_warning(errmsg)
 789                 return False
 790
 791     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 792         """
 793         Return a tuple (page content as string, URL handle).
 794
 795         See _download_webpage docstring for arguments specification.
 796         """
 797         # Strip hashes from the URL (#1038)
 798         if isinstance(url_or_request, (compat_str, str)):
 799             url_or_request = url_or_request.partition('#')[0]
 800
 801         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 802         if urlh is False:
 803             assert not fatal
 804             return False
 805         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 806         return (content, urlh)
 807
 808     @staticmethod
 809     def _guess_encoding_from_content(content_type, webpage_bytes):
 810         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 811         if m:
 812             encoding = m.group(1)
 813         else:
 814             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 815                           webpage_bytes[:1024])
 816             if m:
 817                 encoding = m.group(1).decode('ascii')
 818             elif webpage_bytes.startswith(b'\xff\xfe'):
 819                 encoding = 'utf-16'
 820             else:
 821                 encoding = 'utf-8'
 822
 823         return encoding
 824
 825     def __check_blocked(self, content):
 826         first_block = content[:512]
 827         if ('<title>Access to this site is blocked</title>' in content
 828                 and 'Websense' in first_block):
 829             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 830             blocked_iframe = self._html_search_regex(
 831                 r'<iframe src="([^"]+)"', content,
 832                 'Websense information URL', default=None)
 833             if blocked_iframe:
 834                 msg += ' Visit %s for more details' % blocked_iframe
 835             raise ExtractorError(msg, expected=True)
 836         if '<title>The URL you requested has been blocked</title>' in first_block:
 837             msg = (
 838                 'Access to this webpage has been blocked by Indian censorship. '
 839                 'Use a VPN or proxy server (with --proxy) to route around it.')
 840             block_msg = self._html_search_regex(
 841                 r'</h1><p>(.*?)</p>',
 842                 content, 'block message', default=None)
 843             if block_msg:
 844                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 845             raise ExtractorError(msg, expected=True)
 846         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 847                 and 'blocklist.rkn.gov.ru' in content):
 848             raise ExtractorError(
 849                 'Access to this webpage has been blocked by decision of the Russian government. '
 850                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 851                 expected=True)
 852
 853     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 854         content_type = urlh.headers.get('Content-Type', '')
 855         webpage_bytes = urlh.read()
 856         if prefix is not None:
 857             webpage_bytes = prefix + webpage_bytes
 858         if not encoding:
 859             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 860         if self.get_param('dump_intermediate_pages', False):
 861             self.to_screen('Dumping request to ' + urlh.geturl())
 862             dump = base64.b64encode(webpage_bytes).decode('ascii')
 863             self._downloader.to_screen(dump)
 864         if self.get_param('write_pages', False):
 865             basen = f'{video_id}_{urlh.geturl()}'
 866             trim_length = self.get_param('trim_file_name') or 240
 867             if len(basen) > trim_length:
 868                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 869                 basen = basen[:trim_length - len(h)] + h
 870             raw_filename = basen + '.dump'
 871             filename = sanitize_filename(raw_filename, restricted=True)
 872             self.to_screen('Saving request to ' + filename)
 873             # Working around MAX_PATH limitation on Windows (see
 874             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 875             if compat_os_name == 'nt':
 876                 absfilepath = os.path.abspath(filename)
 877                 if len(absfilepath) > 259:
 878                     filename = '\\\\?\\' + absfilepath
 879             with open(filename, 'wb') as outf:
 880                 outf.write(webpage_bytes)
 881
 882         try:
 883             content = webpage_bytes.decode(encoding, 'replace')
 884         except LookupError:
 885             content = webpage_bytes.decode('utf-8', 'replace')
 886
 887         self.__check_blocked(content)
 888
 889         return content
 890
 891     def _download_webpage(
 892             self, url_or_request, video_id, note=None, errnote=None,
 893             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 894             headers={}, query={}, expected_status=None):
 895         """
 896         Return the data of the page as a string.
 897
 898         Arguments:
 899         url_or_request -- plain text URL as a string or
 900             a compat_urllib_request.Requestobject
 901         video_id -- Video/playlist/item identifier (string)
 902
 903         Keyword arguments:
 904         note -- note printed before downloading (string)
 905         errnote -- note printed in case of an error (string)
 906         fatal -- flag denoting whether error should be considered fatal,
 907             i.e. whether it should cause ExtractionError to be raised,
 908             otherwise a warning will be reported and extraction continued
 909         tries -- number of tries
 910         timeout -- sleep interval between tries
 911         encoding -- encoding for a page content decoding, guessed automatically
 912             when not explicitly specified
 913         data -- POST data (bytes)
 914         headers -- HTTP headers (dict)
 915         query -- URL query (dict)
 916         expected_status -- allows to accept failed HTTP requests (non 2xx
 917             status code) by explicitly specifying a set of accepted status
 918             codes. Can be any of the following entities:
 919                 - an integer type specifying an exact failed status code to
 920                   accept
 921                 - a list or a tuple of integer types specifying a list of
 922                   failed status codes to accept
 923                 - a callable accepting an actual failed status code and
 924                   returning True if it should be accepted
 925             Note that this argument does not affect success status codes (2xx)
 926             which are always accepted.
 927         """
 928
 929         success = False
 930         try_count = 0
 931         while success is False:
 932             try:
 933                 res = self._download_webpage_handle(
 934                     url_or_request, video_id, note, errnote, fatal,
 935                     encoding=encoding, data=data, headers=headers, query=query,
 936                     expected_status=expected_status)
 937                 success = True
 938             except compat_http_client.IncompleteRead as e:
 939                 try_count += 1
 940                 if try_count >= tries:
 941                     raise e
 942                 self._sleep(timeout, video_id)
 943         if res is False:
 944             return res
 945         else:
 946             content, _ = res
 947             return content
 948
 949     def _download_xml_handle(
 950             self, url_or_request, video_id, note='Downloading XML',
 951             errnote='Unable to download XML', transform_source=None,
 952             fatal=True, encoding=None, data=None, headers={}, query={},
 953             expected_status=None):
 954         """
 955         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
 956
 957         See _download_webpage docstring for arguments specification.
 958         """
 959         res = self._download_webpage_handle(
 960             url_or_request, video_id, note, errnote, fatal=fatal,
 961             encoding=encoding, data=data, headers=headers, query=query,
 962             expected_status=expected_status)
 963         if res is False:
 964             return res
 965         xml_string, urlh = res
 966         return self._parse_xml(
 967             xml_string, video_id, transform_source=transform_source,
 968             fatal=fatal), urlh
 969
 970     def _download_xml(
 971             self, url_or_request, video_id,
 972             note='Downloading XML', errnote='Unable to download XML',
 973             transform_source=None, fatal=True, encoding=None,
 974             data=None, headers={}, query={}, expected_status=None):
 975         """
 976         Return the xml as an xml.etree.ElementTree.Element.
 977
 978         See _download_webpage docstring for arguments specification.
 979         """
 980         res = self._download_xml_handle(
 981             url_or_request, video_id, note=note, errnote=errnote,
 982             transform_source=transform_source, fatal=fatal, encoding=encoding,
 983             data=data, headers=headers, query=query,
 984             expected_status=expected_status)
 985         return res if res is False else res[0]
 986
 987     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 988         if transform_source:
 989             xml_string = transform_source(xml_string)
 990         try:
 991             return compat_etree_fromstring(xml_string.encode('utf-8'))
 992         except xml.etree.ElementTree.ParseError as ve:
 993             errmsg = '%s: Failed to parse XML ' % video_id
 994             if fatal:
 995                 raise ExtractorError(errmsg, cause=ve)
 996             else:
 997                 self.report_warning(errmsg + str(ve))
 998
 999     def _download_json_handle(
1000             self, url_or_request, video_id, note='Downloading JSON metadata',
1001             errnote='Unable to download JSON metadata', transform_source=None,
1002             fatal=True, encoding=None, data=None, headers={}, query={},
1003             expected_status=None):
1004         """
1005         Return a tuple (JSON object, URL handle).
1006
1007         See _download_webpage docstring for arguments specification.
1008         """
1009         res = self._download_webpage_handle(
1010             url_or_request, video_id, note, errnote, fatal=fatal,
1011             encoding=encoding, data=data, headers=headers, query=query,
1012             expected_status=expected_status)
1013         if res is False:
1014             return res
1015         json_string, urlh = res
1016         return self._parse_json(
1017             json_string, video_id, transform_source=transform_source,
1018             fatal=fatal), urlh
1019
1020     def _download_json(
1021             self, url_or_request, video_id, note='Downloading JSON metadata',
1022             errnote='Unable to download JSON metadata', transform_source=None,
1023             fatal=True, encoding=None, data=None, headers={}, query={},
1024             expected_status=None):
1025         """
1026         Return the JSON object as a dict.
1027
1028         See _download_webpage docstring for arguments specification.
1029         """
1030         res = self._download_json_handle(
1031             url_or_request, video_id, note=note, errnote=errnote,
1032             transform_source=transform_source, fatal=fatal, encoding=encoding,
1033             data=data, headers=headers, query=query,
1034             expected_status=expected_status)
1035         return res if res is False else res[0]
1036
1037     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1038         if transform_source:
1039             json_string = transform_source(json_string)
1040         try:
1041             return json.loads(json_string, strict=False)
1042         except ValueError as ve:
1043             errmsg = '%s: Failed to parse JSON ' % video_id
1044             if fatal:
1045                 raise ExtractorError(errmsg, cause=ve)
1046             else:
1047                 self.report_warning(errmsg + str(ve))
1048
1049     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1050         return self._parse_json(
1051             data[data.find('{'):data.rfind('}') + 1],
1052             video_id, transform_source, fatal)
1053
1054     def _download_socket_json_handle(
1055             self, url_or_request, video_id, note='Polling socket',
1056             errnote='Unable to poll socket', transform_source=None,
1057             fatal=True, encoding=None, data=None, headers={}, query={},
1058             expected_status=None):
1059         """
1060         Return a tuple (JSON object, URL handle).
1061
1062         See _download_webpage docstring for arguments specification.
1063         """
1064         res = self._download_webpage_handle(
1065             url_or_request, video_id, note, errnote, fatal=fatal,
1066             encoding=encoding, data=data, headers=headers, query=query,
1067             expected_status=expected_status)
1068         if res is False:
1069             return res
1070         webpage, urlh = res
1071         return self._parse_socket_response_as_json(
1072             webpage, video_id, transform_source=transform_source,
1073             fatal=fatal), urlh
1074
1075     def _download_socket_json(
1076             self, url_or_request, video_id, note='Polling socket',
1077             errnote='Unable to poll socket', transform_source=None,
1078             fatal=True, encoding=None, data=None, headers={}, query={},
1079             expected_status=None):
1080         """
1081         Return the JSON object as a dict.
1082
1083         See _download_webpage docstring for arguments specification.
1084         """
1085         res = self._download_socket_json_handle(
1086             url_or_request, video_id, note=note, errnote=errnote,
1087             transform_source=transform_source, fatal=fatal, encoding=encoding,
1088             data=data, headers=headers, query=query,
1089             expected_status=expected_status)
1090         return res if res is False else res[0]
1091
1092     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1093         idstr = format_field(video_id, template='%s: ')
1094         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1095         if only_once:
1096             if f'WARNING: {msg}' in self._printed_messages:
1097                 return
1098             self._printed_messages.add(f'WARNING: {msg}')
1099         self._downloader.report_warning(msg, *args, **kwargs)
1100
1101     def to_screen(self, msg, *args, **kwargs):
1102         """Print msg to screen, prefixing it with '[ie_name]'"""
1103         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1104
1105     def write_debug(self, msg, *args, **kwargs):
1106         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1107
1108     def get_param(self, name, default=None, *args, **kwargs):
1109         if self._downloader:
1110             return self._downloader.params.get(name, default, *args, **kwargs)
1111         return default
1112
1113     def report_drm(self, video_id, partial=False):
1114         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1115
1116     def report_extraction(self, id_or_name):
1117         """Report information extraction."""
1118         self.to_screen('%s: Extracting information' % id_or_name)
1119
1120     def report_download_webpage(self, video_id):
1121         """Report webpage download."""
1122         self.to_screen('%s: Downloading webpage' % video_id)
1123
1124     def report_age_confirmation(self):
1125         """Report attempt to confirm age."""
1126         self.to_screen('Confirming age')
1127
1128     def report_login(self):
1129         """Report attempt to log in."""
1130         self.to_screen('Logging in')
1131
1132     def raise_login_required(
1133             self, msg='This video is only available for registered users',
1134             metadata_available=False, method=NO_DEFAULT):
1135         if metadata_available and (
1136                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1137             self.report_warning(msg)
1138             return
1139         msg += format_field(self._login_hint(method), template='. %s')
1140         raise ExtractorError(msg, expected=True)
1141
1142     def raise_geo_restricted(
1143             self, msg='This video is not available from your location due to geo restriction',
1144             countries=None, metadata_available=False):
1145         if metadata_available and (
1146                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1147             self.report_warning(msg)
1148         else:
1149             raise GeoRestrictedError(msg, countries=countries)
1150
1151     def raise_no_formats(self, msg, expected=False, video_id=None):
1152         if expected and (
1153                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1154             self.report_warning(msg, video_id)
1155         elif isinstance(msg, ExtractorError):
1156             raise msg
1157         else:
1158             raise ExtractorError(msg, expected=expected, video_id=video_id)
1159
1160     # Methods for following #608
1161     @staticmethod
1162     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1163         """Returns a URL that points to a page that should be processed"""
1164         if ie is not None:
1165             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1166         if video_id is not None:
1167             kwargs['id'] = video_id
1168         if video_title is not None:
1169             kwargs['title'] = video_title
1170         return {
1171             **kwargs,
1172             '_type': 'url_transparent' if url_transparent else 'url',
1173             'url': url,
1174         }
1175
1176     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1177         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1178                 for m in orderedSet(map(getter, matches) if getter else matches))
1179         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1180
1181     @staticmethod
1182     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1183         """Returns a playlist"""
1184         if playlist_id:
1185             kwargs['id'] = playlist_id
1186         if playlist_title:
1187             kwargs['title'] = playlist_title
1188         if playlist_description is not None:
1189             kwargs['description'] = playlist_description
1190         return {
1191             **kwargs,
1192             '_type': 'multi_video' if multi_video else 'playlist',
1193             'entries': entries,
1194         }
1195
1196     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1197         """
1198         Perform a regex search on the given string, using a single or a list of
1199         patterns returning the first matching group.
1200         In case of failure return a default value or raise a WARNING or a
1201         RegexNotFoundError, depending on fatal, specifying the field name.
1202         """
1203         if string is None:
1204             mobj = None
1205         elif isinstance(pattern, (str, re.Pattern)):
1206             mobj = re.search(pattern, string, flags)
1207         else:
1208             for p in pattern:
1209                 mobj = re.search(p, string, flags)
1210                 if mobj:
1211                     break
1212
1213         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1214
1215         if mobj:
1216             if group is None:
1217                 # return the first matching group
1218                 return next(g for g in mobj.groups() if g is not None)
1219             elif isinstance(group, (list, tuple)):
1220                 return tuple(mobj.group(g) for g in group)
1221             else:
1222                 return mobj.group(group)
1223         elif default is not NO_DEFAULT:
1224             return default
1225         elif fatal:
1226             raise RegexNotFoundError('Unable to extract %s' % _name)
1227         else:
1228             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1229             return None
1230
1231     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1232         """
1233         Like _search_regex, but strips HTML tags and unescapes entities.
1234         """
1235         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1236         if res:
1237             return clean_html(res).strip()
1238         else:
1239             return res
1240
1241     def _get_netrc_login_info(self, netrc_machine=None):
1242         username = None
1243         password = None
1244         netrc_machine = netrc_machine or self._NETRC_MACHINE
1245
1246         if self.get_param('usenetrc', False):
1247             try:
1248                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1249                 if os.path.isdir(netrc_file):
1250                     netrc_file = os.path.join(netrc_file, '.netrc')
1251                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1252                 if info is not None:
1253                     username = info[0]
1254                     password = info[2]
1255                 else:
1256                     raise netrc.NetrcParseError(
1257                         'No authenticators for %s' % netrc_machine)
1258             except (OSError, netrc.NetrcParseError) as err:
1259                 self.report_warning(
1260                     'parsing .netrc: %s' % error_to_compat_str(err))
1261
1262         return username, password
1263
1264     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1265         """
1266         Get the login info as (username, password)
1267         First look for the manually specified credentials using username_option
1268         and password_option as keys in params dictionary. If no such credentials
1269         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1270         value.
1271         If there's no info available, return (None, None)
1272         """
1273
1274         # Attempt to use provided username and password or .netrc data
1275         username = self.get_param(username_option)
1276         if username is not None:
1277             password = self.get_param(password_option)
1278         else:
1279             username, password = self._get_netrc_login_info(netrc_machine)
1280
1281         return username, password
1282
1283     def _get_tfa_info(self, note='two-factor verification code'):
1284         """
1285         Get the two-factor authentication info
1286         TODO - asking the user will be required for sms/phone verify
1287         currently just uses the command line option
1288         If there's no info available, return None
1289         """
1290
1291         tfa = self.get_param('twofactor')
1292         if tfa is not None:
1293             return tfa
1294
1295         return compat_getpass('Type %s and press [Return]: ' % note)
1296
1297     # Helper functions for extracting OpenGraph info
1298     @staticmethod
1299     def _og_regexes(prop):
1300         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1301         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1302                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1303         template = r'<meta[^>]+?%s[^>]+?%s'
1304         return [
1305             template % (property_re, content_re),
1306             template % (content_re, property_re),
1307         ]
1308
1309     @staticmethod
1310     def _meta_regex(prop):
1311         return r'''(?isx)<meta
1312                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1313                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1314
1315     def _og_search_property(self, prop, html, name=None, **kargs):
1316         prop = variadic(prop)
1317         if name is None:
1318             name = 'OpenGraph %s' % prop[0]
1319         og_regexes = []
1320         for p in prop:
1321             og_regexes.extend(self._og_regexes(p))
1322         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1323         if escaped is None:
1324             return None
1325         return unescapeHTML(escaped)
1326
1327     def _og_search_thumbnail(self, html, **kargs):
1328         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1329
1330     def _og_search_description(self, html, **kargs):
1331         return self._og_search_property('description', html, fatal=False, **kargs)
1332
1333     def _og_search_title(self, html, *, fatal=False, **kargs):
1334         return self._og_search_property('title', html, fatal=fatal, **kargs)
1335
1336     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1337         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1338         if secure:
1339             regexes = self._og_regexes('video:secure_url') + regexes
1340         return self._html_search_regex(regexes, html, name, **kargs)
1341
1342     def _og_search_url(self, html, **kargs):
1343         return self._og_search_property('url', html, **kargs)
1344
1345     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1346         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1347
1348     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1349         name = variadic(name)
1350         if display_name is None:
1351             display_name = name[0]
1352         return self._html_search_regex(
1353             [self._meta_regex(n) for n in name],
1354             html, display_name, fatal=fatal, group='content', **kwargs)
1355
1356     def _dc_search_uploader(self, html):
1357         return self._html_search_meta('dc.creator', html, 'uploader')
1358
1359     def _rta_search(self, html):
1360         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1361         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1362                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1363                      html):
1364             return 18
1365         return 0
1366
1367     def _media_rating_search(self, html):
1368         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1369         rating = self._html_search_meta('rating', html)
1370
1371         if not rating:
1372             return None
1373
1374         RATING_TABLE = {
1375             'safe for kids': 0,
1376             'general': 8,
1377             '14 years': 14,
1378             'mature': 17,
1379             'restricted': 19,
1380         }
1381         return RATING_TABLE.get(rating.lower())
1382
1383     def _family_friendly_search(self, html):
1384         # See http://schema.org/VideoObject
1385         family_friendly = self._html_search_meta(
1386             'isFamilyFriendly', html, default=None)
1387
1388         if not family_friendly:
1389             return None
1390
1391         RATING_TABLE = {
1392             '1': 0,
1393             'true': 0,
1394             '0': 18,
1395             'false': 18,
1396         }
1397         return RATING_TABLE.get(family_friendly.lower())
1398
1399     def _twitter_search_player(self, html):
1400         return self._html_search_meta('twitter:player', html,
1401                                       'twitter card player')
1402
1403     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1404         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1405         default = kwargs.get('default', NO_DEFAULT)
1406         # JSON-LD may be malformed and thus `fatal` should be respected.
1407         # At the same time `default` may be passed that assumes `fatal=False`
1408         # for _search_regex. Let's simulate the same behavior here as well.
1409         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1410         json_ld = []
1411         for mobj in json_ld_list:
1412             json_ld_item = self._parse_json(
1413                 mobj.group('json_ld'), video_id, fatal=fatal)
1414             if not json_ld_item:
1415                 continue
1416             if isinstance(json_ld_item, dict):
1417                 json_ld.append(json_ld_item)
1418             elif isinstance(json_ld_item, (list, tuple)):
1419                 json_ld.extend(json_ld_item)
1420         if json_ld:
1421             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1422         if json_ld:
1423             return json_ld
1424         if default is not NO_DEFAULT:
1425             return default
1426         elif fatal:
1427             raise RegexNotFoundError('Unable to extract JSON-LD')
1428         else:
1429             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1430             return {}
1431
1432     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1433         if isinstance(json_ld, compat_str):
1434             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1435         if not json_ld:
1436             return {}
1437         info = {}
1438         if not isinstance(json_ld, (list, tuple, dict)):
1439             return info
1440         if isinstance(json_ld, dict):
1441             json_ld = [json_ld]
1442
1443         INTERACTION_TYPE_MAP = {
1444             'CommentAction': 'comment',
1445             'AgreeAction': 'like',
1446             'DisagreeAction': 'dislike',
1447             'LikeAction': 'like',
1448             'DislikeAction': 'dislike',
1449             'ListenAction': 'view',
1450             'WatchAction': 'view',
1451             'ViewAction': 'view',
1452         }
1453
1454         def extract_interaction_type(e):
1455             interaction_type = e.get('interactionType')
1456             if isinstance(interaction_type, dict):
1457                 interaction_type = interaction_type.get('@type')
1458             return str_or_none(interaction_type)
1459
1460         def extract_interaction_statistic(e):
1461             interaction_statistic = e.get('interactionStatistic')
1462             if isinstance(interaction_statistic, dict):
1463                 interaction_statistic = [interaction_statistic]
1464             if not isinstance(interaction_statistic, list):
1465                 return
1466             for is_e in interaction_statistic:
1467                 if not isinstance(is_e, dict):
1468                     continue
1469                 if is_e.get('@type') != 'InteractionCounter':
1470                     continue
1471                 interaction_type = extract_interaction_type(is_e)
1472                 if not interaction_type:
1473                     continue
1474                 # For interaction count some sites provide string instead of
1475                 # an integer (as per spec) with non digit characters (e.g. ",")
1476                 # so extracting count with more relaxed str_to_int
1477                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1478                 if interaction_count is None:
1479                     continue
1480                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1481                 if not count_kind:
1482                     continue
1483                 count_key = '%s_count' % count_kind
1484                 if info.get(count_key) is not None:
1485                     continue
1486                 info[count_key] = interaction_count
1487
1488         def extract_chapter_information(e):
1489             chapters = [{
1490                 'title': part.get('name'),
1491                 'start_time': part.get('startOffset'),
1492                 'end_time': part.get('endOffset'),
1493             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1494             for idx, (last_c, current_c, next_c) in enumerate(zip(
1495                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1496                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1497                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1498                 if None in current_c.values():
1499                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1500                     return
1501             if chapters:
1502                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1503                 info['chapters'] = chapters
1504
1505         def extract_video_object(e):
1506             assert e['@type'] == 'VideoObject'
1507             author = e.get('author')
1508             info.update({
1509                 'url': url_or_none(e.get('contentUrl')),
1510                 'title': unescapeHTML(e.get('name')),
1511                 'description': unescapeHTML(e.get('description')),
1512                 'thumbnails': [{'url': url}
1513                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1514                                if url_or_none(url)],
1515                 'duration': parse_duration(e.get('duration')),
1516                 'timestamp': unified_timestamp(e.get('uploadDate')),
1517                 # author can be an instance of 'Organization' or 'Person' types.
1518                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1519                 # however some websites are using 'Text' type instead.
1520                 # 1. https://schema.org/VideoObject
1521                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1522                 'filesize': float_or_none(e.get('contentSize')),
1523                 'tbr': int_or_none(e.get('bitrate')),
1524                 'width': int_or_none(e.get('width')),
1525                 'height': int_or_none(e.get('height')),
1526                 'view_count': int_or_none(e.get('interactionCount')),
1527             })
1528             extract_interaction_statistic(e)
1529             extract_chapter_information(e)
1530
1531         def traverse_json_ld(json_ld, at_top_level=True):
1532             for e in json_ld:
1533                 if at_top_level and '@context' not in e:
1534                     continue
1535                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1536                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1537                     break
1538                 item_type = e.get('@type')
1539                 if expected_type is not None and expected_type != item_type:
1540                     continue
1541                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1542                 if rating is not None:
1543                     info['average_rating'] = rating
1544                 if item_type in ('TVEpisode', 'Episode'):
1545                     episode_name = unescapeHTML(e.get('name'))
1546                     info.update({
1547                         'episode': episode_name,
1548                         'episode_number': int_or_none(e.get('episodeNumber')),
1549                         'description': unescapeHTML(e.get('description')),
1550                     })
1551                     if not info.get('title') and episode_name:
1552                         info['title'] = episode_name
1553                     part_of_season = e.get('partOfSeason')
1554                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1555                         info.update({
1556                             'season': unescapeHTML(part_of_season.get('name')),
1557                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1558                         })
1559                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1560                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1561                         info['series'] = unescapeHTML(part_of_series.get('name'))
1562                 elif item_type == 'Movie':
1563                     info.update({
1564                         'title': unescapeHTML(e.get('name')),
1565                         'description': unescapeHTML(e.get('description')),
1566                         'duration': parse_duration(e.get('duration')),
1567                         'timestamp': unified_timestamp(e.get('dateCreated')),
1568                     })
1569                 elif item_type in ('Article', 'NewsArticle'):
1570                     info.update({
1571                         'timestamp': parse_iso8601(e.get('datePublished')),
1572                         'title': unescapeHTML(e.get('headline')),
1573                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1574                     })
1575                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1576                         extract_video_object(e['video'][0])
1577                 elif item_type == 'VideoObject':
1578                     extract_video_object(e)
1579                     if expected_type is None:
1580                         continue
1581                     else:
1582                         break
1583                 video = e.get('video')
1584                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1585                     extract_video_object(video)
1586                 if expected_type is None:
1587                     continue
1588                 else:
1589                     break
1590         traverse_json_ld(json_ld)
1591
1592         return filter_dict(info)
1593
1594     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1595         return self._parse_json(
1596             self._search_regex(
1597                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1598                 webpage, 'next.js data', fatal=fatal, **kw),
1599             video_id, transform_source=transform_source, fatal=fatal)
1600
1601     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1602         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1603         # not all website do this, but it can be changed
1604         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1605         rectx = re.escape(context_name)
1606         js, arg_keys, arg_vals = self._search_regex(
1607             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1608              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1609             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1610
1611         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1612
1613         for key, val in args.items():
1614             if val in ('undefined', 'void 0'):
1615                 args[key] = 'null'
1616
1617         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1618
1619     @staticmethod
1620     def _hidden_inputs(html):
1621         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1622         hidden_inputs = {}
1623         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1624             attrs = extract_attributes(input)
1625             if not input:
1626                 continue
1627             if attrs.get('type') not in ('hidden', 'submit'):
1628                 continue
1629             name = attrs.get('name') or attrs.get('id')
1630             value = attrs.get('value')
1631             if name and value is not None:
1632                 hidden_inputs[name] = value
1633         return hidden_inputs
1634
1635     def _form_hidden_inputs(self, form_id, html):
1636         form = self._search_regex(
1637             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1638             html, '%s form' % form_id, group='form')
1639         return self._hidden_inputs(form)
1640
    class FormatSort:
        # Syntax of a single sort item, matched by evaluate_params:
        #   reverse    optional '+' prefix
        #   field      name of the field (or alias) to sort by
        #   separator  ':' or '~' ('~' enables the "closest" flag)
        #   limit      text following the separator
        # An item consisting of spaces only matches with field=None and is ignored
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Sort items that evaluate_params appends after the user's and the
        # extractor's items. The entries with the 'forced' or 'priority'
        # setting are additionally placed before them
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): not referenced by the methods visible here; judging by
        # the name this is the sort order used by youtube-dl - confirm at the caller
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration, read through _get_field_setting (which also
        # stores the defaults of missing keys in here) and updated by
        # evaluate_params with 'reverse', 'closest', 'limit_text' and 'limit'.
        # Keys that the visible methods read:
        #   type        'field' (default), 'ordered', 'combined', 'multiple',
        #               'extractor', 'boolean' or 'alias'
        #   field       for 'alias' the field it stands for, for 'combined' the
        #               fields it expands to
        #   convert     how _resolve_field_value converts a limit
        #   order, order_free, regex
        #               ranking used by the 'order' conversion; order_free is
        #               preferred when 'prefer_free_formats' is set, and regex
        #               makes the entries regular expressions
        #   forced, priority
        #               place the field at the front of the sort order
        #   visible     whether print_verbose_info lists the field
        #   same_limit  a 'combined' field passes its whole limit to every
        #               field instead of splitting it on ':'
        #   deprecated  using the alias triggers a deprecation warning
        # NOTE(review): 'default', 'max', 'not_in_list' and 'function' are not
        # read by the methods visible here - presumably they are used when the
        # preference of a format is calculated; confirm
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # Fields that expand to several other fields
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1724
1725         def __init__(self, ie, field_preference):
1726             self._order = []
1727             self.ydl = ie._downloader
1728             self.evaluate_params(self.ydl.params, field_preference)
1729             if ie.get_param('verbose'):
1730                 self.print_verbose_info(self.ydl.write_debug)
1731
1732         def _get_field_setting(self, field, key):
1733             if field not in self.settings:
1734                 if key in ('forced', 'priority'):
1735                     return False
1736                 self.ydl.deprecation_warning(
1737                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1738                     'and may be removed in a future version')
1739                 self.settings[field] = {}
1740             propObj = self.settings[field]
1741             if key not in propObj:
1742                 type = propObj.get('type')
1743                 if key == 'field':
1744                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1745                 elif key == 'convert':
1746                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1747                 else:
1748                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1749                 propObj[key] = default
1750             return propObj[key]
1751
1752         def _resolve_field_value(self, field, value, convertNone=False):
1753             if value is None:
1754                 if not convertNone:
1755                     return None
1756             else:
1757                 value = value.lower()
1758             conversion = self._get_field_setting(field, 'convert')
1759             if conversion == 'ignore':
1760                 return None
1761             if conversion == 'string':
1762                 return value
1763             elif conversion == 'float_none':
1764                 return float_or_none(value)
1765             elif conversion == 'bytes':
1766                 return FileDownloader.parse_bytes(value)
1767             elif conversion == 'order':
1768                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1769                 use_regex = self._get_field_setting(field, 'regex')
1770                 list_length = len(order_list)
1771                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1772                 if use_regex and value is not None:
1773                     for i, regex in enumerate(order_list):
1774                         if regex and re.match(regex, value):
1775                             return list_length - i
1776                     return list_length - empty_pos  # not in list
1777                 else:  # not regex or  value = None
1778                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1779             else:
1780                 if value.isnumeric():
1781                     return float(value)
1782                 else:
1783                     self.settings[field]['convert'] = 'string'
1784                     return value
1785
1786         def evaluate_params(self, params, sort_extractor):
1787             self._use_free_order = params.get('prefer_free_formats', False)
1788             self._sort_user = params.get('format_sort', [])
1789             self._sort_extractor = sort_extractor
1790
1791             def add_item(field, reverse, closest, limit_text):
1792                 field = field.lower()
1793                 if field in self._order:
1794                     return
1795                 self._order.append(field)
1796                 limit = self._resolve_field_value(field, limit_text)
1797                 data = {
1798                     'reverse': reverse,
1799                     'closest': False if limit is None else closest,
1800                     'limit_text': limit_text,
1801                     'limit': limit}
1802                 if field in self.settings:
1803                     self.settings[field].update(data)
1804                 else:
1805                     self.settings[field] = data
1806
1807             sort_list = (
1808                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1809                 + (tuple() if params.get('format_sort_force', False)
1810                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1811                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1812
1813             for item in sort_list:
1814                 match = re.match(self.regex, item)
1815                 if match is None:
1816                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1817                 field = match.group('field')
1818                 if field is None:
1819                     continue
1820                 if self._get_field_setting(field, 'type') == 'alias':
1821                     alias, field = field, self._get_field_setting(field, 'field')
1822                     if self._get_field_setting(alias, 'deprecated'):
1823                         self.ydl.deprecation_warning(
1824                             f'Format sorting alias {alias} is deprecated '
1825                             f'and may be removed in a future version. Please use {field} instead')
1826                 reverse = match.group('reverse') is not None
1827                 closest = match.group('separator') == '~'
1828                 limit_text = match.group('limit')
1829
1830                 has_limit = limit_text is not None
1831                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1832                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1833
1834                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1835                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1836                 limit_count = len(limits)
1837                 for (i, f) in enumerate(fields):
1838                     add_item(f, reverse, closest,
1839                              limits[i] if i < limit_count
1840                              else limits[0] if has_limit and not has_multiple_limits
1841                              else None)
1842
1843         def print_verbose_info(self, write_debug):
1844             if self._sort_user:
1845                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1846             if self._sort_extractor:
1847                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1848             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1849                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1850                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1851                               self._get_field_setting(field, 'limit_text'),
1852                               self._get_field_setting(field, 'limit'))
1853                 if self._get_field_setting(field, 'limit_text') is not None else '')
1854                 for field in self._order if self._get_field_setting(field, 'visible')]))
1855
1856         def _calculate_field_preference_from_value(self, format, field, type, value):
1857             reverse = self._get_field_setting(field, 'reverse')
1858             closest = self._get_field_setting(field, 'closest')
1859             limit = self._get_field_setting(field, 'limit')
1860
1861             if type == 'extractor':
1862                 maximum = self._get_field_setting(field, 'max')
1863                 if value is None or (maximum is not None and value >= maximum):
1864                     value = -1
1865             elif type == 'boolean':
1866                 in_list = self._get_field_setting(field, 'in_list')
1867                 not_in_list = self._get_field_setting(field, 'not_in_list')
1868                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1869             elif type == 'ordered':
1870                 value = self._resolve_field_value(field, value, True)
1871
1872             # try to convert to number
1873             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1874             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1875             if is_num:
1876                 value = val_num
1877
1878             return ((-10, 0) if value is None
1879                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1880                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1881                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1882                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1883                     else (-1, value, 0))
1884
1885         def _calculate_field_preference(self, format, field):
1886             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1887             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1888             if type == 'multiple':
1889                 type = 'field'  # Only 'field' is allowed in multiple for now
1890                 actual_fields = self._get_field_setting(field, 'field')
1891
1892                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1893             else:
1894                 value = get_value(field)
1895             return self._calculate_field_preference_from_value(format, field, type, value)
1896
1897         def calculate_preference(self, format):
1898             # Determine missing protocol
1899             if not format.get('protocol'):
1900                 format['protocol'] = determine_protocol(format)
1901
1902             # Determine missing ext
1903             if not format.get('ext') and 'url' in format:
1904                 format['ext'] = determine_ext(format['url'])
1905             if format.get('vcodec') == 'none':
1906                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1907                 format['video_ext'] = 'none'
1908             else:
1909                 format['video_ext'] = format['ext']
1910                 format['audio_ext'] = 'none'
1911             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1912             #    format['preference'] = -1000
1913
1914             # Determine missing bitrates
1915             if format.get('tbr') is None:
1916                 if format.get('vbr') is not None and format.get('abr') is not None:
1917                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1918             else:
1919                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1920                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1921                 if format.get('acodec') != 'none' and format.get('abr') is None:
1922                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1923
1924             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1925
1926     def _sort_formats(self, formats, field_preference=[]):
1927         if not formats:
1928             return
1929         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1930
1931     def _check_formats(self, formats, video_id):
1932         if formats:
1933             formats[:] = filter(
1934                 lambda f: self._is_valid_url(
1935                     f['url'], video_id,
1936                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1937                 formats)
1938
1939     @staticmethod
1940     def _remove_duplicate_formats(formats):
1941         format_urls = set()
1942         unique_formats = []
1943         for f in formats:
1944             if f['url'] not in format_urls:
1945                 format_urls.add(f['url'])
1946                 unique_formats.append(f)
1947         formats[:] = unique_formats
1948
1949     def _is_valid_url(self, url, video_id, item='video', headers={}):
1950         url = self._proto_relative_url(url, scheme='http:')
1951         # For now assume non HTTP(S) URLs always valid
1952         if not (url.startswith('http://') or url.startswith('https://')):
1953             return True
1954         try:
1955             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1956             return True
1957         except ExtractorError as e:
1958             self.to_screen(
1959                 '%s: %s URL is invalid, skipping: %s'
1960                 % (video_id, item, error_to_compat_str(e.cause)))
1961             return False
1962
1963     def http_scheme(self):
1964         """ Either "http:" or "https:", depending on the user's preferences """
1965         return (
1966             'http:'
1967             if self.get_param('prefer_insecure', False)
1968             else 'https:')
1969
1970     def _proto_relative_url(self, url, scheme=None):
1971         if url is None:
1972             return url
1973         if url.startswith('//'):
1974             if scheme is None:
1975                 scheme = self.http_scheme()
1976             return scheme + url
1977         else:
1978             return url
1979
1980     def _sleep(self, timeout, video_id, msg_template=None):
1981         if msg_template is None:
1982             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1983         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1984         self.to_screen(msg)
1985         time.sleep(timeout)
1986
1987     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1988                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1989                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1990         res = self._download_xml_handle(
1991             manifest_url, video_id, 'Downloading f4m manifest',
1992             'Unable to download f4m manifest',
1993             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1994             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1995             transform_source=transform_source,
1996             fatal=fatal, data=data, headers=headers, query=query)
1997         if res is False:
1998             return []
1999
2000         manifest, urlh = res
2001         manifest_url = urlh.geturl()
2002
2003         return self._parse_f4m_formats(
2004             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2005             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2006
2007     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2008                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2009                            fatal=True, m3u8_id=None):
2010         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2011             return []
2012
2013         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2014         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2015         if akamai_pv is not None and ';' in akamai_pv.text:
2016             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2017             if playerVerificationChallenge.strip() != '':
2018                 return []
2019
2020         formats = []
2021         manifest_version = '1.0'
2022         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2023         if not media_nodes:
2024             manifest_version = '2.0'
2025             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2026         # Remove unsupported DRM protected media from final formats
2027         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2028         media_nodes = remove_encrypted_media(media_nodes)
2029         if not media_nodes:
2030             return formats
2031
2032         manifest_base_url = get_base_url(manifest)
2033
2034         bootstrap_info = xpath_element(
2035             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2036             'bootstrap info', default=None)
2037
2038         vcodec = None
2039         mime_type = xpath_text(
2040             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2041             'base URL', default=None)
2042         if mime_type and mime_type.startswith('audio/'):
2043             vcodec = 'none'
2044
2045         for i, media_el in enumerate(media_nodes):
2046             tbr = int_or_none(media_el.attrib.get('bitrate'))
2047             width = int_or_none(media_el.attrib.get('width'))
2048             height = int_or_none(media_el.attrib.get('height'))
2049             format_id = join_nonempty(f4m_id, tbr or i)
2050             # If <bootstrapInfo> is present, the specified f4m is a
2051             # stream-level manifest, and only set-level manifests may refer to
2052             # external resources.  See section 11.4 and section 4 of F4M spec
2053             if bootstrap_info is None:
2054                 media_url = None
2055                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2056                 if manifest_version == '2.0':
2057                     media_url = media_el.attrib.get('href')
2058                 if media_url is None:
2059                     media_url = media_el.attrib.get('url')
2060                 if not media_url:
2061                     continue
2062                 manifest_url = (
2063                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2064                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2065                 # If media_url is itself a f4m manifest do the recursive extraction
2066                 # since bitrates in parent manifest (this one) and media_url manifest
2067                 # may differ leading to inability to resolve the format by requested
2068                 # bitrate in f4m downloader
2069                 ext = determine_ext(manifest_url)
2070                 if ext == 'f4m':
2071                     f4m_formats = self._extract_f4m_formats(
2072                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2073                         transform_source=transform_source, fatal=fatal)
2074                     # Sometimes stream-level manifest contains single media entry that
2075                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2076                     # At the same time parent's media entry in set-level manifest may
2077                     # contain it. We will copy it from parent in such cases.
2078                     if len(f4m_formats) == 1:
2079                         f = f4m_formats[0]
2080                         f.update({
2081                             'tbr': f.get('tbr') or tbr,
2082                             'width': f.get('width') or width,
2083                             'height': f.get('height') or height,
2084                             'format_id': f.get('format_id') if not tbr else format_id,
2085                             'vcodec': vcodec,
2086                         })
2087                     formats.extend(f4m_formats)
2088                     continue
2089                 elif ext == 'm3u8':
2090                     formats.extend(self._extract_m3u8_formats(
2091                         manifest_url, video_id, 'mp4', preference=preference,
2092                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2093                     continue
2094             formats.append({
2095                 'format_id': format_id,
2096                 'url': manifest_url,
2097                 'manifest_url': manifest_url,
2098                 'ext': 'flv' if bootstrap_info is not None else None,
2099                 'protocol': 'f4m',
2100                 'tbr': tbr,
2101                 'width': width,
2102                 'height': height,
2103                 'vcodec': vcodec,
2104                 'preference': preference,
2105                 'quality': quality,
2106             })
2107         return formats
2108
2109     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2110         return {
2111             'format_id': join_nonempty(m3u8_id, 'meta'),
2112             'url': m3u8_url,
2113             'ext': ext,
2114             'protocol': 'm3u8',
2115             'preference': preference - 100 if preference else -100,
2116             'quality': quality,
2117             'resolution': 'multiple',
2118             'format_note': 'Quality selection URL',
2119         }
2120
2121     def _report_ignoring_subs(self, name):
2122         self.report_warning(bug_reports_message(
2123             f'Ignoring subtitle tracks found in the {name} manifest; '
2124             'if any subtitle tracks are missing,'
2125         ), only_once=True)
2126
2127     def _extract_m3u8_formats(self, *args, **kwargs):
2128         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2129         if subs:
2130             self._report_ignoring_subs('HLS')
2131         return fmts
2132
2133     def _extract_m3u8_formats_and_subtitles(
2134             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2135             preference=None, quality=None, m3u8_id=None, note=None,
2136             errnote=None, fatal=True, live=False, data=None, headers={},
2137             query={}):
2138
2139         res = self._download_webpage_handle(
2140             m3u8_url, video_id,
2141             note='Downloading m3u8 information' if note is None else note,
2142             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2143             fatal=fatal, data=data, headers=headers, query=query)
2144
2145         if res is False:
2146             return [], {}
2147
2148         m3u8_doc, urlh = res
2149         m3u8_url = urlh.geturl()
2150
2151         return self._parse_m3u8_formats_and_subtitles(
2152             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2153             preference=preference, quality=quality, m3u8_id=m3u8_id,
2154             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2155             headers=headers, query=query, video_id=video_id)
2156
2157     def _parse_m3u8_formats_and_subtitles(
2158             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2159             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2160             errnote=None, fatal=True, data=None, headers={}, query={},
2161             video_id=None):
2162         formats, subtitles = [], {}
2163
2164         has_drm = re.search('|'.join([
2165             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2166             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2167         ]), m3u8_doc)
2168
2169         def format_url(url):
2170             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2171
2172         if self.get_param('hls_split_discontinuity', False):
2173             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2174                 if not m3u8_doc:
2175                     if not manifest_url:
2176                         return []
2177                     m3u8_doc = self._download_webpage(
2178                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2179                         note=False, errnote='Failed to download m3u8 playlist information')
2180                     if m3u8_doc is False:
2181                         return []
2182                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2183
2184         else:
2185             def _extract_m3u8_playlist_indices(*args, **kwargs):
2186                 return [None]
2187
2188         # References:
2189         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2190         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2191         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2192
2193         # We should try extracting formats only from master playlists [1, 4.3.4],
2194         # i.e. playlists that describe available qualities. On the other hand
2195         # media playlists [1, 4.3.3] should be returned as is since they contain
2196         # just the media without qualities renditions.
2197         # Fortunately, master playlist can be easily distinguished from media
2198         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2199         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2200         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2201         # media playlist and MUST NOT appear in master playlist thus we can
2202         # clearly detect media playlist with this criterion.
2203
2204         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2205             formats = [{
2206                 'format_id': join_nonempty(m3u8_id, idx),
2207                 'format_index': idx,
2208                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2209                 'ext': ext,
2210                 'protocol': entry_protocol,
2211                 'preference': preference,
2212                 'quality': quality,
2213                 'has_drm': has_drm,
2214             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2215
2216             return formats, subtitles
2217
2218         groups = {}
2219         last_stream_inf = {}
2220
2221         def extract_media(x_media_line):
2222             media = parse_m3u8_attributes(x_media_line)
2223             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2224             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2225             if not (media_type and group_id and name):
2226                 return
2227             groups.setdefault(group_id, []).append(media)
2228             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2229             if media_type == 'SUBTITLES':
2230                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2231                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2232                 # However, lack of URI has been spotted in the wild.
2233                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2234                 if not media.get('URI'):
2235                     return
2236                 url = format_url(media['URI'])
2237                 sub_info = {
2238                     'url': url,
2239                     'ext': determine_ext(url),
2240                 }
2241                 if sub_info['ext'] == 'm3u8':
2242                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2243                     # files may contain is WebVTT:
2244                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2245                     sub_info['ext'] = 'vtt'
2246                     sub_info['protocol'] = 'm3u8_native'
2247                 lang = media.get('LANGUAGE') or 'und'
2248                 subtitles.setdefault(lang, []).append(sub_info)
2249             if media_type not in ('VIDEO', 'AUDIO'):
2250                 return
2251             media_url = media.get('URI')
2252             if media_url:
2253                 manifest_url = format_url(media_url)
2254                 formats.extend({
2255                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2256                     'format_note': name,
2257                     'format_index': idx,
2258                     'url': manifest_url,
2259                     'manifest_url': m3u8_url,
2260                     'language': media.get('LANGUAGE'),
2261                     'ext': ext,
2262                     'protocol': entry_protocol,
2263                     'preference': preference,
2264                     'quality': quality,
2265                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2266                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2267
2268         def build_stream_name():
2269             # Despite specification does not mention NAME attribute for
2270             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2271             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2272             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2273             stream_name = last_stream_inf.get('NAME')
2274             if stream_name:
2275                 return stream_name
2276             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2277             # from corresponding rendition group
2278             stream_group_id = last_stream_inf.get('VIDEO')
2279             if not stream_group_id:
2280                 return
2281             stream_group = groups.get(stream_group_id)
2282             if not stream_group:
2283                 return stream_group_id
2284             rendition = stream_group[0]
2285             return rendition.get('NAME') or stream_group_id
2286
2287         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2288         # chance to detect video only formats when EXT-X-STREAM-INF tags
2289         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2290         for line in m3u8_doc.splitlines():
2291             if line.startswith('#EXT-X-MEDIA:'):
2292                 extract_media(line)
2293
2294         for line in m3u8_doc.splitlines():
2295             if line.startswith('#EXT-X-STREAM-INF:'):
2296                 last_stream_inf = parse_m3u8_attributes(line)
2297             elif line.startswith('#') or not line.strip():
2298                 continue
2299             else:
2300                 tbr = float_or_none(
2301                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2302                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2303                 manifest_url = format_url(line.strip())
2304
2305                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2306                     format_id = [m3u8_id, None, idx]
2307                     # Bandwidth of live streams may differ over time thus making
2308                     # format_id unpredictable. So it's better to keep provided
2309                     # format_id intact.
2310                     if not live:
2311                         stream_name = build_stream_name()
2312                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2313                     f = {
2314                         'format_id': join_nonempty(*format_id),
2315                         'format_index': idx,
2316                         'url': manifest_url,
2317                         'manifest_url': m3u8_url,
2318                         'tbr': tbr,
2319                         'ext': ext,
2320                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2321                         'protocol': entry_protocol,
2322                         'preference': preference,
2323                         'quality': quality,
2324                     }
2325                     resolution = last_stream_inf.get('RESOLUTION')
2326                     if resolution:
2327                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2328                         if mobj:
2329                             f['width'] = int(mobj.group('width'))
2330                             f['height'] = int(mobj.group('height'))
2331                     # Unified Streaming Platform
2332                     mobj = re.search(
2333                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2334                     if mobj:
2335                         abr, vbr = mobj.groups()
2336                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2337                         f.update({
2338                             'vbr': vbr,
2339                             'abr': abr,
2340                         })
2341                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2342                     f.update(codecs)
2343                     audio_group_id = last_stream_inf.get('AUDIO')
2344                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2345                     # references a rendition group MUST have a CODECS attribute.
2346                     # However, this is not always respected, for example, [2]
2347                     # contains EXT-X-STREAM-INF tag which references AUDIO
2348                     # rendition group but does not have CODECS and despite
2349                     # referencing an audio group it represents a complete
2350                     # (with audio and video) format. So, for such cases we will
2351                     # ignore references to rendition groups and treat them
2352                     # as complete formats.
2353                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2354                         audio_group = groups.get(audio_group_id)
2355                         if audio_group and audio_group[0].get('URI'):
2356                             # TODO: update acodec for audio only formats with
2357                             # the same GROUP-ID
2358                             f['acodec'] = 'none'
2359                     if not f.get('ext'):
2360                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2361                     formats.append(f)
2362
2363                     # for DailyMotion
2364                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2365                     if progressive_uri:
2366                         http_f = f.copy()
2367                         del http_f['manifest_url']
2368                         http_f.update({
2369                             'format_id': f['format_id'].replace('hls-', 'http-'),
2370                             'protocol': 'http',
2371                             'url': progressive_uri,
2372                         })
2373                         formats.append(http_f)
2374
2375                 last_stream_inf = {}
2376         return formats, subtitles
2377
2378     def _extract_m3u8_vod_duration(
2379             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2380
2381         m3u8_vod = self._download_webpage(
2382             m3u8_vod_url, video_id,
2383             note='Downloading m3u8 VOD manifest' if note is None else note,
2384             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2385             fatal=False, data=data, headers=headers, query=query)
2386
2387         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2388
2389     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2390         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2391             return None
2392
2393         return int(sum(
2394             float(line[len('#EXTINF:'):].split(',')[0])
2395             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2396
2397     @staticmethod
2398     def _xpath_ns(path, namespace=None):
2399         if not namespace:
2400             return path
2401         out = []
2402         for c in path.split('/'):
2403             if not c or c == '.':
2404                 out.append(c)
2405             else:
2406                 out.append('{%s}%s' % (namespace, c))
2407         return '/'.join(out)
2408
2409     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2410         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2411         if res is False:
2412             assert not fatal
2413             return [], {}
2414
2415         smil, urlh = res
2416         smil_url = urlh.geturl()
2417
2418         namespace = self._parse_smil_namespace(smil)
2419
2420         fmts = self._parse_smil_formats(
2421             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2422         subs = self._parse_smil_subtitles(
2423             smil, namespace=namespace)
2424
2425         return fmts, subs
2426
2427     def _extract_smil_formats(self, *args, **kwargs):
2428         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2429         if subs:
2430             self._report_ignoring_subs('SMIL')
2431         return fmts
2432
2433     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2434         res = self._download_smil(smil_url, video_id, fatal=fatal)
2435         if res is False:
2436             return {}
2437
2438         smil, urlh = res
2439         smil_url = urlh.geturl()
2440
2441         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2442
2443     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2444         return self._download_xml_handle(
2445             smil_url, video_id, 'Downloading SMIL file',
2446             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2447
2448     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2449         namespace = self._parse_smil_namespace(smil)
2450
2451         formats = self._parse_smil_formats(
2452             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2453         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2454
2455         video_id = os.path.splitext(url_basename(smil_url))[0]
2456         title = None
2457         description = None
2458         upload_date = None
2459         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2460             name = meta.attrib.get('name')
2461             content = meta.attrib.get('content')
2462             if not name or not content:
2463                 continue
2464             if not title and name == 'title':
2465                 title = content
2466             elif not description and name in ('description', 'abstract'):
2467                 description = content
2468             elif not upload_date and name == 'date':
2469                 upload_date = unified_strdate(content)
2470
2471         thumbnails = [{
2472             'id': image.get('type'),
2473             'url': image.get('src'),
2474             'width': int_or_none(image.get('width')),
2475             'height': int_or_none(image.get('height')),
2476         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2477
2478         return {
2479             'id': video_id,
2480             'title': title or video_id,
2481             'description': description,
2482             'upload_date': upload_date,
2483             'thumbnails': thumbnails,
2484             'formats': formats,
2485             'subtitles': subtitles,
2486         }
2487
2488     def _parse_smil_namespace(self, smil):
2489         return self._search_regex(
2490             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2491
2492     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2493         base = smil_url
2494         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2495             b = meta.get('base') or meta.get('httpBase')
2496             if b:
2497                 base = b
2498                 break
2499
2500         formats = []
2501         rtmp_count = 0
2502         http_count = 0
2503         m3u8_count = 0
2504         imgs_count = 0
2505
2506         srcs = set()
2507         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2508         for medium in media:
2509             src = medium.get('src')
2510             if not src or src in srcs:
2511                 continue
2512             srcs.add(src)
2513
2514             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2515             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2516             width = int_or_none(medium.get('width'))
2517             height = int_or_none(medium.get('height'))
2518             proto = medium.get('proto')
2519             ext = medium.get('ext')
2520             src_ext = determine_ext(src)
2521             streamer = medium.get('streamer') or base
2522
2523             if proto == 'rtmp' or streamer.startswith('rtmp'):
2524                 rtmp_count += 1
2525                 formats.append({
2526                     'url': streamer,
2527                     'play_path': src,
2528                     'ext': 'flv',
2529                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2530                     'tbr': bitrate,
2531                     'filesize': filesize,
2532                     'width': width,
2533                     'height': height,
2534                 })
2535                 if transform_rtmp_url:
2536                     streamer, src = transform_rtmp_url(streamer, src)
2537                     formats[-1].update({
2538                         'url': streamer,
2539                         'play_path': src,
2540                     })
2541                 continue
2542
2543             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2544             src_url = src_url.strip()
2545
2546             if proto == 'm3u8' or src_ext == 'm3u8':
2547                 m3u8_formats = self._extract_m3u8_formats(
2548                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2549                 if len(m3u8_formats) == 1:
2550                     m3u8_count += 1
2551                     m3u8_formats[0].update({
2552                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2553                         'tbr': bitrate,
2554                         'width': width,
2555                         'height': height,
2556                     })
2557                 formats.extend(m3u8_formats)
2558             elif src_ext == 'f4m':
2559                 f4m_url = src_url
2560                 if not f4m_params:
2561                     f4m_params = {
2562                         'hdcore': '3.2.0',
2563                         'plugin': 'flowplayer-3.2.0.1',
2564                     }
2565                 f4m_url += '&' if '?' in f4m_url else '?'
2566                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2567                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2568             elif src_ext == 'mpd':
2569                 formats.extend(self._extract_mpd_formats(
2570                     src_url, video_id, mpd_id='dash', fatal=False))
2571             elif re.search(r'\.ism/[Mm]anifest', src_url):
2572                 formats.extend(self._extract_ism_formats(
2573                     src_url, video_id, ism_id='mss', fatal=False))
2574             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2575                 http_count += 1
2576                 formats.append({
2577                     'url': src_url,
2578                     'ext': ext or src_ext or 'flv',
2579                     'format_id': 'http-%d' % (bitrate or http_count),
2580                     'tbr': bitrate,
2581                     'filesize': filesize,
2582                     'width': width,
2583                     'height': height,
2584                 })
2585
2586         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2587             src = medium.get('src')
2588             if not src or src in srcs:
2589                 continue
2590             srcs.add(src)
2591
2592             imgs_count += 1
2593             formats.append({
2594                 'format_id': 'imagestream-%d' % (imgs_count),
2595                 'url': src,
2596                 'ext': mimetype2ext(medium.get('type')),
2597                 'acodec': 'none',
2598                 'vcodec': 'none',
2599                 'width': int_or_none(medium.get('width')),
2600                 'height': int_or_none(medium.get('height')),
2601                 'format_note': 'SMIL storyboards',
2602             })
2603
2604         return formats
2605
2606     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2607         urls = []
2608         subtitles = {}
2609         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2610             src = textstream.get('src')
2611             if not src or src in urls:
2612                 continue
2613             urls.append(src)
2614             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2615             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2616             subtitles.setdefault(lang, []).append({
2617                 'url': src,
2618                 'ext': ext,
2619             })
2620         return subtitles
2621
2622     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2623         res = self._download_xml_handle(
2624             xspf_url, playlist_id, 'Downloading xpsf playlist',
2625             'Unable to download xspf manifest', fatal=fatal)
2626         if res is False:
2627             return []
2628
2629         xspf, urlh = res
2630         xspf_url = urlh.geturl()
2631
2632         return self._parse_xspf(
2633             xspf, playlist_id, xspf_url=xspf_url,
2634             xspf_base_url=base_url(xspf_url))
2635
2636     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2637         NS_MAP = {
2638             'xspf': 'http://xspf.org/ns/0/',
2639             's1': 'http://static.streamone.nl/player/ns/0',
2640         }
2641
2642         entries = []
2643         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2644             title = xpath_text(
2645                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2646             description = xpath_text(
2647                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2648             thumbnail = xpath_text(
2649                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2650             duration = float_or_none(
2651                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2652
2653             formats = []
2654             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2655                 format_url = urljoin(xspf_base_url, location.text)
2656                 if not format_url:
2657                     continue
2658                 formats.append({
2659                     'url': format_url,
2660                     'manifest_url': xspf_url,
2661                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2662                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2663                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2664                 })
2665             self._sort_formats(formats)
2666
2667             entries.append({
2668                 'id': playlist_id,
2669                 'title': title,
2670                 'description': description,
2671                 'thumbnail': thumbnail,
2672                 'duration': duration,
2673                 'formats': formats,
2674             })
2675         return entries
2676
2677     def _extract_mpd_formats(self, *args, **kwargs):
2678         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2679         if subs:
2680             self._report_ignoring_subs('DASH')
2681         return fmts
2682
2683     def _extract_mpd_formats_and_subtitles(
2684             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2685             fatal=True, data=None, headers={}, query={}):
2686         res = self._download_xml_handle(
2687             mpd_url, video_id,
2688             note='Downloading MPD manifest' if note is None else note,
2689             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2690             fatal=fatal, data=data, headers=headers, query=query)
2691         if res is False:
2692             return [], {}
2693         mpd_doc, urlh = res
2694         if mpd_doc is None:
2695             return [], {}
2696
2697         # We could have been redirected to a new url when we retrieved our mpd file.
2698         mpd_url = urlh.geturl()
2699         mpd_base_url = base_url(mpd_url)
2700
2701         return self._parse_mpd_formats_and_subtitles(
2702             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2703
2704     def _parse_mpd_formats(self, *args, **kwargs):
2705         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2706         if subs:
2707             self._report_ignoring_subs('DASH')
2708         return fmts
2709
2710     def _parse_mpd_formats_and_subtitles(
2711             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2712         """
2713         Parse formats from MPD manifest.
2714         References:
2715          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2716             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2717          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2718         """
2719         if not self.get_param('dynamic_mpd', True):
2720             if mpd_doc.get('type') == 'dynamic':
2721                 return [], {}
2722
2723         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2724
2725         def _add_ns(path):
2726             return self._xpath_ns(path, namespace)
2727
2728         def is_drm_protected(element):
2729             return element.find(_add_ns('ContentProtection')) is not None
2730
2731         def extract_multisegment_info(element, ms_parent_info):
2732             ms_info = ms_parent_info.copy()
2733
2734             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2735             # common attributes and elements.  We will only extract relevant
2736             # for us.
2737             def extract_common(source):
2738                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2739                 if segment_timeline is not None:
2740                     s_e = segment_timeline.findall(_add_ns('S'))
2741                     if s_e:
2742                         ms_info['total_number'] = 0
2743                         ms_info['s'] = []
2744                         for s in s_e:
2745                             r = int(s.get('r', 0))
2746                             ms_info['total_number'] += 1 + r
2747                             ms_info['s'].append({
2748                                 't': int(s.get('t', 0)),
2749                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2750                                 'd': int(s.attrib['d']),
2751                                 'r': r,
2752                             })
2753                 start_number = source.get('startNumber')
2754                 if start_number:
2755                     ms_info['start_number'] = int(start_number)
2756                 timescale = source.get('timescale')
2757                 if timescale:
2758                     ms_info['timescale'] = int(timescale)
2759                 segment_duration = source.get('duration')
2760                 if segment_duration:
2761                     ms_info['segment_duration'] = float(segment_duration)
2762
2763             def extract_Initialization(source):
2764                 initialization = source.find(_add_ns('Initialization'))
2765                 if initialization is not None:
2766                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2767
2768             segment_list = element.find(_add_ns('SegmentList'))
2769             if segment_list is not None:
2770                 extract_common(segment_list)
2771                 extract_Initialization(segment_list)
2772                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2773                 if segment_urls_e:
2774                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2775             else:
2776                 segment_template = element.find(_add_ns('SegmentTemplate'))
2777                 if segment_template is not None:
2778                     extract_common(segment_template)
2779                     media = segment_template.get('media')
2780                     if media:
2781                         ms_info['media'] = media
2782                     initialization = segment_template.get('initialization')
2783                     if initialization:
2784                         ms_info['initialization'] = initialization
2785                     else:
2786                         extract_Initialization(segment_template)
2787             return ms_info
2788
2789         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2790         formats, subtitles = [], {}
2791         stream_numbers = collections.defaultdict(int)
2792         for period in mpd_doc.findall(_add_ns('Period')):
2793             period_duration = parse_duration(period.get('duration')) or mpd_duration
2794             period_ms_info = extract_multisegment_info(period, {
2795                 'start_number': 1,
2796                 'timescale': 1,
2797             })
2798             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2799                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2800                 for representation in adaptation_set.findall(_add_ns('Representation')):
2801                     representation_attrib = adaptation_set.attrib.copy()
2802                     representation_attrib.update(representation.attrib)
2803                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2804                     mime_type = representation_attrib['mimeType']
2805                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2806
2807                     codec_str = representation_attrib.get('codecs', '')
2808                     # Some kind of binary subtitle found in some youtube livestreams
2809                     if mime_type == 'application/x-rawcc':
2810                         codecs = {'scodec': codec_str}
2811                     else:
2812                         codecs = parse_codecs(codec_str)
2813                     if content_type not in ('video', 'audio', 'text'):
2814                         if mime_type == 'image/jpeg':
2815                             content_type = mime_type
2816                         elif codecs.get('vcodec', 'none') != 'none':
2817                             content_type = 'video'
2818                         elif codecs.get('acodec', 'none') != 'none':
2819                             content_type = 'audio'
2820                         elif codecs.get('scodec', 'none') != 'none':
2821                             content_type = 'text'
2822                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2823                             content_type = 'text'
2824                         else:
2825                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2826                             continue
2827
2828                     base_url = ''
2829                     for element in (representation, adaptation_set, period, mpd_doc):
2830                         base_url_e = element.find(_add_ns('BaseURL'))
2831                         if base_url_e is not None:
2832                             base_url = base_url_e.text + base_url
2833                             if re.match(r'^https?://', base_url):
2834                                 break
2835                     if mpd_base_url and base_url.startswith('/'):
2836                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2837                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2838                         if not mpd_base_url.endswith('/'):
2839                             mpd_base_url += '/'
2840                         base_url = mpd_base_url + base_url
2841                     representation_id = representation_attrib.get('id')
2842                     lang = representation_attrib.get('lang')
2843                     url_el = representation.find(_add_ns('BaseURL'))
2844                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2845                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2846                     if representation_id is not None:
2847                         format_id = representation_id
2848                     else:
2849                         format_id = content_type
2850                     if mpd_id:
2851                         format_id = mpd_id + '-' + format_id
2852                     if content_type in ('video', 'audio'):
2853                         f = {
2854                             'format_id': format_id,
2855                             'manifest_url': mpd_url,
2856                             'ext': mimetype2ext(mime_type),
2857                             'width': int_or_none(representation_attrib.get('width')),
2858                             'height': int_or_none(representation_attrib.get('height')),
2859                             'tbr': float_or_none(bandwidth, 1000),
2860                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2861                             'fps': int_or_none(representation_attrib.get('frameRate')),
2862                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2863                             'format_note': 'DASH %s' % content_type,
2864                             'filesize': filesize,
2865                             'container': mimetype2ext(mime_type) + '_dash',
2866                             **codecs
2867                         }
2868                     elif content_type == 'text':
2869                         f = {
2870                             'ext': mimetype2ext(mime_type),
2871                             'manifest_url': mpd_url,
2872                             'filesize': filesize,
2873                         }
2874                     elif content_type == 'image/jpeg':
2875                         # See test case in VikiIE
2876                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2877                         f = {
2878                             'format_id': format_id,
2879                             'ext': 'mhtml',
2880                             'manifest_url': mpd_url,
2881                             'format_note': 'DASH storyboards (jpeg)',
2882                             'acodec': 'none',
2883                             'vcodec': 'none',
2884                         }
2885                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2886                         f['has_drm'] = True
2887                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2888
2889                     def prepare_template(template_name, identifiers):
2890                         tmpl = representation_ms_info[template_name]
2891                         # First of, % characters outside $...$ templates
2892                         # must be escaped by doubling for proper processing
2893                         # by % operator string formatting used further (see
2894                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2895                         t = ''
2896                         in_template = False
2897                         for c in tmpl:
2898                             t += c
2899                             if c == '$':
2900                                 in_template = not in_template
2901                             elif c == '%' and not in_template:
2902                                 t += c
2903                         # Next, $...$ templates are translated to their
2904                         # %(...) counterparts to be used with % operator
2905                         if representation_id is not None:
2906                             t = t.replace('$RepresentationID$', representation_id)
2907                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2908                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2909                         t.replace('$$', '$')
2910                         return t
2911
2912                     # @initialization is a regular template like @media one
2913                     # so it should be handled just the same way (see
2914                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2915                     if 'initialization' in representation_ms_info:
2916                         initialization_template = prepare_template(
2917                             'initialization',
2918                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2919                             # $Time$ shall not be included for @initialization thus
2920                             # only $Bandwidth$ remains
2921                             ('Bandwidth', ))
2922                         representation_ms_info['initialization_url'] = initialization_template % {
2923                             'Bandwidth': bandwidth,
2924                         }
2925
2926                     def location_key(location):
2927                         return 'url' if re.match(r'^https?://', location) else 'path'
2928
2929                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2930
2931                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2932                         media_location_key = location_key(media_template)
2933
2934                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2935                         # can't be used at the same time
2936                         if '%(Number' in media_template and 's' not in representation_ms_info:
2937                             segment_duration = None
2938                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2939                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2940                                 representation_ms_info['total_number'] = int(math.ceil(
2941                                     float_or_none(period_duration, segment_duration, default=0)))
2942                             representation_ms_info['fragments'] = [{
2943                                 media_location_key: media_template % {
2944                                     'Number': segment_number,
2945                                     'Bandwidth': bandwidth,
2946                                 },
2947                                 'duration': segment_duration,
2948                             } for segment_number in range(
2949                                 representation_ms_info['start_number'],
2950                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2951                         else:
2952                             # $Number*$ or $Time$ in media template with S list available
2953                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2954                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2955                             representation_ms_info['fragments'] = []
2956                             segment_time = 0
2957                             segment_d = None
2958                             segment_number = representation_ms_info['start_number']
2959
2960                             def add_segment_url():
2961                                 segment_url = media_template % {
2962                                     'Time': segment_time,
2963                                     'Bandwidth': bandwidth,
2964                                     'Number': segment_number,
2965                                 }
2966                                 representation_ms_info['fragments'].append({
2967                                     media_location_key: segment_url,
2968                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2969                                 })
2970
2971                             for num, s in enumerate(representation_ms_info['s']):
2972                                 segment_time = s.get('t') or segment_time
2973                                 segment_d = s['d']
2974                                 add_segment_url()
2975                                 segment_number += 1
2976                                 for r in range(s.get('r', 0)):
2977                                     segment_time += segment_d
2978                                     add_segment_url()
2979                                     segment_number += 1
2980                                 segment_time += segment_d
2981                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2982                         # No media template
2983                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2984                         # or any YouTube dashsegments video
2985                         fragments = []
2986                         segment_index = 0
2987                         timescale = representation_ms_info['timescale']
2988                         for s in representation_ms_info['s']:
2989                             duration = float_or_none(s['d'], timescale)
2990                             for r in range(s.get('r', 0) + 1):
2991                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2992                                 fragments.append({
2993                                     location_key(segment_uri): segment_uri,
2994                                     'duration': duration,
2995                                 })
2996                                 segment_index += 1
2997                         representation_ms_info['fragments'] = fragments
2998                     elif 'segment_urls' in representation_ms_info:
2999                         # Segment URLs with no SegmentTimeline
3000                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3001                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3002                         fragments = []
3003                         segment_duration = float_or_none(
3004                             representation_ms_info['segment_duration'],
3005                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3006                         for segment_url in representation_ms_info['segment_urls']:
3007                             fragment = {
3008                                 location_key(segment_url): segment_url,
3009                             }
3010                             if segment_duration:
3011                                 fragment['duration'] = segment_duration
3012                             fragments.append(fragment)
3013                         representation_ms_info['fragments'] = fragments
3014                     # If there is a fragments key available then we correctly recognized fragmented media.
3015                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3016                     # assumption is not necessarily correct since we may simply have no support for
3017                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3018                     if 'fragments' in representation_ms_info:
3019                         f.update({
3020                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3021                             'url': mpd_url or base_url,
3022                             'fragment_base_url': base_url,
3023                             'fragments': [],
3024                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3025                         })
3026                         if 'initialization_url' in representation_ms_info:
3027                             initialization_url = representation_ms_info['initialization_url']
3028                             if not f.get('url'):
3029                                 f['url'] = initialization_url
3030                             f['fragments'].append({location_key(initialization_url): initialization_url})
3031                         f['fragments'].extend(representation_ms_info['fragments'])
3032                         if not period_duration:
3033                             period_duration = try_get(
3034                                 representation_ms_info,
3035                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3036                     else:
3037                         # Assuming direct URL to unfragmented media.
3038                         f['url'] = base_url
3039                     if content_type in ('video', 'audio', 'image/jpeg'):
3040                         f['manifest_stream_number'] = stream_numbers[f['url']]
3041                         stream_numbers[f['url']] += 1
3042                         formats.append(f)
3043                     elif content_type == 'text':
3044                         subtitles.setdefault(lang or 'und', []).append(f)
3045
3046         return formats, subtitles
3047
3048     def _extract_ism_formats(self, *args, **kwargs):
3049         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3050         if subs:
3051             self._report_ignoring_subs('ISM')
3052         return fmts
3053
3054     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3055         res = self._download_xml_handle(
3056             ism_url, video_id,
3057             note='Downloading ISM manifest' if note is None else note,
3058             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3059             fatal=fatal, data=data, headers=headers, query=query)
3060         if res is False:
3061             return [], {}
3062         ism_doc, urlh = res
3063         if ism_doc is None:
3064             return [], {}
3065
3066         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3067
3068     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3069         """
3070         Parse formats from ISM manifest.
3071         References:
3072          1. [MS-SSTR]: Smooth Streaming Protocol,
3073             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3074         """
3075         if ism_doc.get('IsLive') == 'TRUE':
3076             return [], {}
3077
3078         duration = int(ism_doc.attrib['Duration'])
3079         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3080
3081         formats = []
3082         subtitles = {}
3083         for stream in ism_doc.findall('StreamIndex'):
3084             stream_type = stream.get('Type')
3085             if stream_type not in ('video', 'audio', 'text'):
3086                 continue
3087             url_pattern = stream.attrib['Url']
3088             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3089             stream_name = stream.get('Name')
3090             stream_language = stream.get('Language', 'und')
3091             for track in stream.findall('QualityLevel'):
3092                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3093                 # TODO: add support for WVC1 and WMAP
3094                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3095                     self.report_warning('%s is not a supported codec' % fourcc)
3096                     continue
3097                 tbr = int(track.attrib['Bitrate']) // 1000
3098                 # [1] does not mention Width and Height attributes. However,
3099                 # they're often present while MaxWidth and MaxHeight are
3100                 # missing, so should be used as fallbacks
3101                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3102                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3103                 sampling_rate = int_or_none(track.get('SamplingRate'))
3104
3105                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3106                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3107
3108                 fragments = []
3109                 fragment_ctx = {
3110                     'time': 0,
3111                 }
3112                 stream_fragments = stream.findall('c')
3113                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3114                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3115                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3116                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3117                     if not fragment_ctx['duration']:
3118                         try:
3119                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3120                         except IndexError:
3121                             next_fragment_time = duration
3122                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3123                     for _ in range(fragment_repeat):
3124                         fragments.append({
3125                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3126                             'duration': fragment_ctx['duration'] / stream_timescale,
3127                         })
3128                         fragment_ctx['time'] += fragment_ctx['duration']
3129
3130                 if stream_type == 'text':
3131                     subtitles.setdefault(stream_language, []).append({
3132                         'ext': 'ismt',
3133                         'protocol': 'ism',
3134                         'url': ism_url,
3135                         'manifest_url': ism_url,
3136                         'fragments': fragments,
3137                         '_download_params': {
3138                             'stream_type': stream_type,
3139                             'duration': duration,
3140                             'timescale': stream_timescale,
3141                             'fourcc': fourcc,
3142                             'language': stream_language,
3143                             'codec_private_data': track.get('CodecPrivateData'),
3144                         }
3145                     })
3146                 elif stream_type in ('video', 'audio'):
3147                     formats.append({
3148                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3149                         'url': ism_url,
3150                         'manifest_url': ism_url,
3151                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3152                         'width': width,
3153                         'height': height,
3154                         'tbr': tbr,
3155                         'asr': sampling_rate,
3156                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3157                         'acodec': 'none' if stream_type == 'video' else fourcc,
3158                         'protocol': 'ism',
3159                         'fragments': fragments,
3160                         'has_drm': ism_doc.find('Protection') is not None,
3161                         '_download_params': {
3162                             'stream_type': stream_type,
3163                             'duration': duration,
3164                             'timescale': stream_timescale,
3165                             'width': width or 0,
3166                             'height': height or 0,
3167                             'fourcc': fourcc,
3168                             'language': stream_language,
3169                             'codec_private_data': track.get('CodecPrivateData'),
3170                             'sampling_rate': sampling_rate,
3171                             'channels': int_or_none(track.get('Channels', 2)),
3172                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3173                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3174                         },
3175                     })
3176         return formats, subtitles
3177
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from HTML5-style <video>/<audio> tags in webpage.

        Besides plain video/audio tags, their amp-, dl8- and dl8-live-
        prefixed variants are recognized. For each tag, formats are gathered
        from its own src attribute and from nested <source> tags, and
        subtitles from nested <track> tags. Relative URLs are resolved
        against base_url. Sources detected as m3u8 or mpd are expanded with
        _extract_m3u8_formats()/_extract_mpd_formats() (non-fatal), to which
        video_id, m3u8_id, m3u8_entry_protocol, mpd_id, preference and
        quality are passed as applicable.

        Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'
        keys; tags that yield neither formats nor subtitles are omitted.
        Every format gets a Referer http header set to base_url.
        """
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        # Split a MIME type string such as 'video/mp4; codecs="..."' into
        # codec information plus 'ext'; yields {} when nothing can be parsed
        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        # Returns (is_plain_url, formats): manifests (m3u8/mpd) are expanded
        # into their formats, anything else becomes a single direct-URL format
        def _media_formats(src, cur_media_type, type_info={}):
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no content, hence the empty 4th element
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # Source given directly on the media tag itself
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fill in missing dimensions from the label/title text
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Take the bitrate from the first label that has one
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # NB: the direct-URL format is applied last, so its
                        # 'url' and 'vcodec' values take precedence over
                        # whatever was derived from the type attribute
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated like subtitles
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3300
3301     def _extract_akamai_formats(self, *args, **kwargs):
3302         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3303         if subs:
3304             self._report_ignoring_subs('akamai')
3305         return fmts
3306
3307     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3308         signed = 'hdnea=' in manifest_url
3309         if not signed:
3310             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3311             manifest_url = re.sub(
3312                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3313                 '', manifest_url).strip('?')
3314
3315         formats = []
3316         subtitles = {}
3317
3318         hdcore_sign = 'hdcore=3.7.0'
3319         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3320         hds_host = hosts.get('hds')
3321         if hds_host:
3322             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3323         if 'hdcore=' not in f4m_url:
3324             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3325         f4m_formats = self._extract_f4m_formats(
3326             f4m_url, video_id, f4m_id='hds', fatal=False)
3327         for entry in f4m_formats:
3328             entry.update({'extra_param_to_segment_url': hdcore_sign})
3329         formats.extend(f4m_formats)
3330
3331         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3332         hls_host = hosts.get('hls')
3333         if hls_host:
3334             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3335         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3336             m3u8_url, video_id, 'mp4', 'm3u8_native',
3337             m3u8_id='hls', fatal=False)
3338         formats.extend(m3u8_formats)
3339         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3340
3341         http_host = hosts.get('http')
3342         if http_host and m3u8_formats and not signed:
3343             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3344             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3345             qualities_length = len(qualities)
3346             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3347                 i = 0
3348                 for f in m3u8_formats:
3349                     if f['vcodec'] != 'none':
3350                         for protocol in ('http', 'https'):
3351                             http_f = f.copy()
3352                             del http_f['manifest_url']
3353                             http_url = re.sub(
3354                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3355                             http_f.update({
3356                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3357                                 'url': http_url,
3358                                 'protocol': protocol,
3359                             })
3360                             formats.append(http_f)
3361                         i += 1
3362
3363         return formats, subtitles
3364
3365     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3366         query = compat_urlparse.urlparse(url).query
3367         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3368         mobj = re.search(
3369             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3370         url_base = mobj.group('url')
3371         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3372         formats = []
3373
3374         def manifest_url(manifest):
3375             m_url = f'{http_base_url}/{manifest}'
3376             if query:
3377                 m_url += '?%s' % query
3378             return m_url
3379
3380         if 'm3u8' not in skip_protocols:
3381             formats.extend(self._extract_m3u8_formats(
3382                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3383                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3384         if 'f4m' not in skip_protocols:
3385             formats.extend(self._extract_f4m_formats(
3386                 manifest_url('manifest.f4m'),
3387                 video_id, f4m_id='hds', fatal=False))
3388         if 'dash' not in skip_protocols:
3389             formats.extend(self._extract_mpd_formats(
3390                 manifest_url('manifest.mpd'),
3391                 video_id, mpd_id='dash', fatal=False))
3392         if re.search(r'(?:/smil:|\.smil)', url_base):
3393             if 'smil' not in skip_protocols:
3394                 rtmp_formats = self._extract_smil_formats(
3395                     manifest_url('jwplayer.smil'),
3396                     video_id, fatal=False)
3397                 for rtmp_format in rtmp_formats:
3398                     rtsp_format = rtmp_format.copy()
3399                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3400                     del rtsp_format['play_path']
3401                     del rtsp_format['ext']
3402                     rtsp_format.update({
3403                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3404                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3405                         'protocol': 'rtsp',
3406                     })
3407                     formats.extend([rtmp_format, rtsp_format])
3408         else:
3409             for protocol in ('rtmp', 'rtsp'):
3410                 if protocol not in skip_protocols:
3411                     formats.append({
3412                         'url': f'{protocol}:{url_base}',
3413                         'format_id': protocol,
3414                         'protocol': protocol,
3415                     })
3416         return formats
3417
3418     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3419         mobj = re.search(
3420             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3421             webpage)
3422         if mobj:
3423             try:
3424                 jwplayer_data = self._parse_json(mobj.group('options'),
3425                                                  video_id=video_id,
3426                                                  transform_source=transform_source)
3427             except ExtractorError:
3428                 pass
3429             else:
3430                 if isinstance(jwplayer_data, dict):
3431                     return jwplayer_data
3432
3433     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3434         jwplayer_data = self._find_jwplayer_data(
3435             webpage, video_id, transform_source=js_to_json)
3436         return self._parse_jwplayer_data(
3437             jwplayer_data, video_id, *args, **kwargs)
3438
3439     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3440                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3441         # JWPlayer backward compatibility: flattened playlists
3442         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3443         if 'playlist' not in jwplayer_data:
3444             jwplayer_data = {'playlist': [jwplayer_data]}
3445
3446         entries = []
3447
3448         # JWPlayer backward compatibility: single playlist item
3449         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3450         if not isinstance(jwplayer_data['playlist'], list):
3451             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3452
3453         for video_data in jwplayer_data['playlist']:
3454             # JWPlayer backward compatibility: flattened sources
3455             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3456             if 'sources' not in video_data:
3457                 video_data['sources'] = [video_data]
3458
3459             this_video_id = video_id or video_data['mediaid']
3460
3461             formats = self._parse_jwplayer_formats(
3462                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3463                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3464
3465             subtitles = {}
3466             tracks = video_data.get('tracks')
3467             if tracks and isinstance(tracks, list):
3468                 for track in tracks:
3469                     if not isinstance(track, dict):
3470                         continue
3471                     track_kind = track.get('kind')
3472                     if not track_kind or not isinstance(track_kind, compat_str):
3473                         continue
3474                     if track_kind.lower() not in ('captions', 'subtitles'):
3475                         continue
3476                     track_url = urljoin(base_url, track.get('file'))
3477                     if not track_url:
3478                         continue
3479                     subtitles.setdefault(track.get('label') or 'en', []).append({
3480                         'url': self._proto_relative_url(track_url)
3481                     })
3482
3483             entry = {
3484                 'id': this_video_id,
3485                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3486                 'description': clean_html(video_data.get('description')),
3487                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3488                 'timestamp': int_or_none(video_data.get('pubdate')),
3489                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3490                 'subtitles': subtitles,
3491             }
3492             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3493             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3494                 entry.update({
3495                     '_type': 'url_transparent',
3496                     'url': formats[0]['url'],
3497                 })
3498             else:
3499                 self._sort_formats(formats)
3500                 entry['formats'] = formats
3501             entries.append(entry)
3502         if len(entries) == 1:
3503             return entries[0]
3504         else:
3505             return self.playlist_result(entries)
3506
3507     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3508                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3509         urls = []
3510         formats = []
3511         for source in jwplayer_sources_data:
3512             if not isinstance(source, dict):
3513                 continue
3514             source_url = urljoin(
3515                 base_url, self._proto_relative_url(source.get('file')))
3516             if not source_url or source_url in urls:
3517                 continue
3518             urls.append(source_url)
3519             source_type = source.get('type') or ''
3520             ext = mimetype2ext(source_type) or determine_ext(source_url)
3521             if source_type == 'hls' or ext == 'm3u8':
3522                 formats.extend(self._extract_m3u8_formats(
3523                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3524                     m3u8_id=m3u8_id, fatal=False))
3525             elif source_type == 'dash' or ext == 'mpd':
3526                 formats.extend(self._extract_mpd_formats(
3527                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3528             elif ext == 'smil':
3529                 formats.extend(self._extract_smil_formats(
3530                     source_url, video_id, fatal=False))
3531             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3532             elif source_type.startswith('audio') or ext in (
3533                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3534                 formats.append({
3535                     'url': source_url,
3536                     'vcodec': 'none',
3537                     'ext': ext,
3538                 })
3539             else:
3540                 height = int_or_none(source.get('height'))
3541                 if height is None:
3542                     # Often no height is provided but there is a label in
3543                     # format like "1080p", "720p SD", or 1080.
3544                     height = int_or_none(self._search_regex(
3545                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3546                         'height', default=None))
3547                 a_format = {
3548                     'url': source_url,
3549                     'width': int_or_none(source.get('width')),
3550                     'height': height,
3551                     'tbr': int_or_none(source.get('bitrate')),
3552                     'ext': ext,
3553                 }
3554                 if source_url.startswith('rtmp'):
3555                     a_format['ext'] = 'flv'
3556                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3557                     # of jwplayer.flash.swf
3558                     rtmp_url_parts = re.split(
3559                         r'((?:mp4|mp3|flv):)', source_url, 1)
3560                     if len(rtmp_url_parts) == 3:
3561                         rtmp_url, prefix, play_path = rtmp_url_parts
3562                         a_format.update({
3563                             'url': rtmp_url,
3564                             'play_path': prefix + play_path,
3565                         })
3566                     if rtmp_params:
3567                         a_format.update(rtmp_params)
3568                 formats.append(a_format)
3569         return formats
3570
3571     def _live_title(self, name):
3572         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3573         return name
3574
3575     def _int(self, v, name, fatal=False, **kwargs):
3576         res = int_or_none(v, **kwargs)
3577         if res is None:
3578             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3579             if fatal:
3580                 raise ExtractorError(msg)
3581             else:
3582                 self.report_warning(msg)
3583         return res
3584
3585     def _float(self, v, name, fatal=False, **kwargs):
3586         res = float_or_none(v, **kwargs)
3587         if res is None:
3588             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3589             if fatal:
3590                 raise ExtractorError(msg)
3591             else:
3592                 self.report_warning(msg)
3593         return res
3594
3595     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3596                     path='/', secure=False, discard=False, rest={}, **kwargs):
3597         cookie = compat_cookiejar_Cookie(
3598             0, name, value, port, port is not None, domain, True,
3599             domain.startswith('.'), path, True, secure, expire_time,
3600             discard, None, None, rest)
3601         self._downloader.cookiejar.set_cookie(cookie)
3602
3603     def _get_cookies(self, url):
3604         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3605         req = sanitized_Request(url)
3606         self._downloader.cookiejar.add_cookie_header(req)
3607         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3608
3609     def _apply_first_set_cookie_header(self, url_handle, cookie):
3610         """
3611         Apply first Set-Cookie header instead of the last. Experimental.
3612
3613         Some sites (e.g. [1-3]) may serve two cookies under the same name
3614         in Set-Cookie header and expect the first (old) one to be set rather
3615         than second (new). However, as of RFC6265 the newer one cookie
3616         should be set into cookie store what actually happens.
3617         We will workaround this issue by resetting the cookie to
3618         the first one manually.
3619         1. https://new.vk.com/
3620         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3621         3. https://learning.oreilly.com/
3622         """
3623         for header, cookies in url_handle.headers.items():
3624             if header.lower() != 'set-cookie':
3625                 continue
3626             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3627             cookie_value = re.search(
3628                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3629             if cookie_value:
3630                 value, domain = cookie_value.groups()
3631                 self._set_cookie(domain, cookie, value)
3632                 break
3633
3634     @classmethod
3635     def get_testcases(cls, include_onlymatching=False):
3636         t = getattr(cls, '_TEST', None)
3637         if t:
3638             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3639             tests = [t]
3640         else:
3641             tests = getattr(cls, '_TESTS', [])
3642         for t in tests:
3643             if not include_onlymatching and t.get('only_matching', False):
3644                 continue
3645             t['name'] = cls.ie_key()
3646             yield t
3647
3648     @classproperty
3649     def age_limit(cls):
3650         """Get age limit from the testcases"""
3651         return max(traverse_obj(
3652             tuple(cls.get_testcases(include_onlymatching=False)),
3653             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3654
3655     @classmethod
3656     def is_suitable(cls, age_limit):
3657         """Test whether the extractor is generally suitable for the given age limit"""
3658         return not age_restricted(cls.age_limit, age_limit)
3659
3660     @classmethod
3661     def description(cls, *, markdown=True, search_examples=None):
3662         """Description of the extractor"""
3663         desc = ''
3664         if cls._NETRC_MACHINE:
3665             if markdown:
3666                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3667             else:
3668                 desc += f' [{cls._NETRC_MACHINE}]'
3669         if cls.IE_DESC is False:
3670             desc += ' [HIDDEN]'
3671         elif cls.IE_DESC:
3672             desc += f' {cls.IE_DESC}'
3673         if cls.SEARCH_KEY:
3674             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3675             if search_examples:
3676                 _COUNTS = ('', '5', '10', 'all')
3677                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3678         if not cls.working():
3679             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3680
3681         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3682         return f'{name}:{desc}' if desc else name
3683
3684     def extract_subtitles(self, *args, **kwargs):
3685         if (self.get_param('writesubtitles', False)
3686                 or self.get_param('listsubtitles')):
3687             return self._get_subtitles(*args, **kwargs)
3688         return {}
3689
3690     def _get_subtitles(self, *args, **kwargs):
3691         raise NotImplementedError('This method must be implemented by subclasses')
3692
3693     def extract_comments(self, *args, **kwargs):
3694         if not self.get_param('getcomments'):
3695             return None
3696         generator = self._get_comments(*args, **kwargs)
3697
3698         def extractor():
3699             comments = []
3700             interrupted = True
3701             try:
3702                 while True:
3703                     comments.append(next(generator))
3704             except StopIteration:
3705                 interrupted = False
3706             except KeyboardInterrupt:
3707                 self.to_screen('Interrupted by user')
3708             except Exception as e:
3709                 if self.get_param('ignoreerrors') is not True:
3710                     raise
3711                 self._downloader.report_error(e)
3712             comment_count = len(comments)
3713             self.to_screen(f'Extracted {comment_count} comments')
3714             return {
3715                 'comments': comments,
3716                 'comment_count': None if interrupted else comment_count
3717             }
3718         return extractor
3719
3720     def _get_comments(self, *args, **kwargs):
3721         raise NotImplementedError('This method must be implemented by subclasses')
3722
3723     @staticmethod
3724     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3725         """ Merge subtitle items for one language. Items with duplicated URLs/data
3726         will be dropped. """
3727         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3728         ret = list(subtitle_list1)
3729         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3730         return ret
3731
3732     @classmethod
3733     def _merge_subtitles(cls, *dicts, target=None):
3734         """ Merge subtitle dictionaries, language by language. """
3735         if target is None:
3736             target = {}
3737         for d in dicts:
3738             for lang, subs in d.items():
3739                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3740         return target
3741
3742     def extract_automatic_captions(self, *args, **kwargs):
3743         if (self.get_param('writeautomaticsub', False)
3744                 or self.get_param('listsubtitles')):
3745             return self._get_automatic_captions(*args, **kwargs)
3746         return {}
3747
3748     def _get_automatic_captions(self, *args, **kwargs):
3749         raise NotImplementedError('This method must be implemented by subclasses')
3750
3751     @functools.cached_property
3752     def _cookies_passed(self):
3753         """Whether cookies have been passed to YoutubeDL"""
3754         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3755
3756     def mark_watched(self, *args, **kwargs):
3757         if not self.get_param('mark_watched', False):
3758             return
3759         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3760             self._mark_watched(*args, **kwargs)
3761
3762     def _mark_watched(self, *args, **kwargs):
3763         raise NotImplementedError('This method must be implemented by subclasses')
3764
3765     def geo_verification_headers(self):
3766         headers = {}
3767         geo_verification_proxy = self.get_param('geo_verification_proxy')
3768         if geo_verification_proxy:
3769             headers['Ytdl-request-proxy'] = geo_verification_proxy
3770         return headers
3771
3772     def _generic_id(self, url):
3773         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3774
3775     def _generic_title(self, url):
3776         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3777
3778     @staticmethod
3779     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3780         all_known = all(map(
3781             lambda x: x is not None,
3782             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3783         return (
3784             'private' if is_private
3785             else 'premium_only' if needs_premium
3786             else 'subscriber_only' if needs_subscription
3787             else 'needs_auth' if needs_auth
3788             else 'unlisted' if is_unlisted
3789             else 'public' if all_known
3790             else None)
3791
3792     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3793         '''
3794         @returns            A list of values for the extractor argument given by "key"
3795                             or "default" if no such key is present
3796         @param default      The default value to return when the key is not present (default: [])
3797         @param casesense    When false, the values are converted to lower case
3798         '''
3799         val = traverse_obj(
3800             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3801         if val is None:
3802             return [] if default is NO_DEFAULT else default
3803         return list(val) if casesense else [x.lower() for x in val]
3804
3805     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3806         if not playlist_id or not video_id:
3807             return not video_id
3808
3809         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3810         if no_playlist is not None:
3811             return not no_playlist
3812
3813         video_id = '' if video_id is True else f' {video_id}'
3814         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3815         if self.get_param('noplaylist'):
3816             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3817             return False
3818         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3819         return True
3820
3821
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.

    They accept queries of the form "<_SEARCH_KEY><prefix>:<query>", where
    <prefix> is either empty (only the first result), a positive integer
    (that many results) or "all" (as many as _MAX_RESULTS allows).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching search queries for this extractor's _SEARCH_KEY"""
        query_template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return query_template % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the prefix off the query and fetch the number of results it asks for"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError(f'invalid download number {requested} for query "{query}"')
        if requested > self._MAX_RESULTS:
            # Clamp to what the extractor can deliver, but tell the user about it
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice needs None, not infinity, to mean "no upper bound"
        stop = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, stop), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        """Public read-only access to _SEARCH_KEY"""
        return cls._SEARCH_KEY