1 import base64
2 import collections
3 import hashlib
4 import itertools
5 import json
6 import math
7 import netrc
8 import os
9 import random
10 import sys
11 import time
12 import xml.etree.ElementTree
13
14 from ..compat import (
15 compat_cookiejar_Cookie,
16 compat_cookies_SimpleCookie,
17 compat_etree_fromstring,
18 compat_expanduser,
19 compat_getpass,
20 compat_http_client,
21 compat_os_name,
22 compat_str,
23 compat_urllib_error,
24 compat_urllib_parse_unquote,
25 compat_urllib_parse_urlencode,
26 compat_urllib_request,
27 compat_urlparse,
28 re,
29 )
30 from ..downloader import FileDownloader
31 from ..downloader.f4m import get_base_url, remove_encrypted_media
32 from ..utils import (
33 JSON_LD_RE,
34 NO_DEFAULT,
35 ExtractorError,
36 GeoRestrictedError,
37 GeoUtils,
38 RegexNotFoundError,
39 UnsupportedError,
40 age_restricted,
41 base_url,
42 bug_reports_message,
43 classproperty,
44 clean_html,
45 determine_ext,
46 determine_protocol,
47 dict_get,
48 encode_data_uri,
49 error_to_compat_str,
50 extract_attributes,
51 filter_dict,
52 fix_xml_ampersands,
53 float_or_none,
54 format_field,
55 int_or_none,
56 join_nonempty,
57 js_to_json,
58 mimetype2ext,
59 network_exceptions,
60 orderedSet,
61 parse_bitrate,
62 parse_codecs,
63 parse_duration,
64 parse_iso8601,
65 parse_m3u8_attributes,
66 parse_resolution,
67 sanitize_filename,
68 sanitized_Request,
69 str_or_none,
70 str_to_int,
71 strip_or_none,
72 traverse_obj,
73 try_get,
74 unescapeHTML,
75 unified_strdate,
76 unified_timestamp,
77 update_Request,
78 update_url_query,
79 url_basename,
80 url_or_none,
81 urljoin,
82 variadic,
83 xpath_element,
84 xpath_text,
85 xpath_with_ns,
86 )
87
88
89 class InfoExtractor:
90 """Information Extractor class.
91
92 Information extractors are the classes that, given a URL, extract
93 information about the video (or videos) the URL refers to. This
94 information includes the real video URL, the video title, author and
95 others. The information is stored in a dictionary which is then
96 passed to the YoutubeDL. The YoutubeDL processes this
97 information possibly downloading the video to the file system, among
98 other possible outcomes.
99
100 The type field determines the type of the result.
101 By far the most common value (and the default if _type is missing) is
102 "video", which indicates a single video.
103
104 For a video, the dictionaries must include the following fields:
105
106 id: Video identifier.
107 title: Video title, unescaped. Set to an empty string if video has
108 no title, as opposed to "None", which signifies that the
109 extractor failed to obtain a title
110
111 Additionally, it must contain either a formats entry or a url one:
112
113 formats: A list of dictionaries for each format available, ordered
114 from worst to best quality.
115
116 Potential fields:
117 * url The mandatory URL representing the media:
118 for plain file media - HTTP URL of this file,
119 for RTMP - RTMP URL,
120 for HLS - URL of the M3U8 media playlist,
121 for HDS - URL of the F4M manifest,
122 for DASH
123 - HTTP URL to plain file media (in case of
124 unfragmented media)
125 - URL of the MPD manifest or base URL
126 representing the media if MPD manifest
127 is parsed from a string (in case of
128 fragmented media)
129 for MSS - URL of the ISM manifest.
130 * manifest_url
131 The URL of the manifest file in case of
132 fragmented media:
133 for HLS - URL of the M3U8 master playlist,
134 for HDS - URL of the F4M manifest,
135 for DASH - URL of the MPD manifest,
136 for MSS - URL of the ISM manifest.
137 * manifest_stream_number (For internal use only)
138 The index of the stream in the manifest file
139 * ext Will be calculated from URL if missing
140 * format A human-readable description of the format
141 ("mp4 container with h264/opus").
142 Calculated from the format_id, width, height,
143 and format_note fields if missing.
144 * format_id A short description of the format
145 ("mp4_h264_opus" or "19").
146 Technically optional, but strongly recommended.
147 * format_note Additional info about the format
148 ("3D" or "DASH video")
149 * width Width of the video, if known
150 * height Height of the video, if known
151 * resolution Textual description of width and height
152 * dynamic_range The dynamic range of the video. One of:
153 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
154 * tbr Average bitrate of audio and video in KBit/s
155 * abr Average audio bitrate in KBit/s
156 * acodec Name of the audio codec in use
157 * asr Audio sampling rate in Hertz
158 * vbr Average video bitrate in KBit/s
159 * fps Frame rate
160 * vcodec Name of the video codec in use
161 * container Name of the container format
162 * filesize The number of bytes, if known in advance
163 * filesize_approx An estimate for the number of bytes
164 * player_url SWF Player URL (used for rtmpdump).
165 * protocol The protocol that will be used for the actual
166 download, lower-case. One of "http", "https" or
167 one of the protocols defined in downloader.PROTOCOL_MAP
168 * fragment_base_url
169 Base URL for fragments. Each fragment's path
170 value (if present) will be relative to
171 this URL.
172 * fragments A list of fragments of a fragmented media.
173 Each fragment entry must contain either a url
174 or a path. If a url is present, it should be
175 used by the client directly. Otherwise both path and
176 fragment_base_url must be present. Here is
177 the list of all potential fields:
178 * "url" - fragment's URL
179 * "path" - fragment's path relative to
180 fragment_base_url
181 * "duration" (optional, int or float)
182 * "filesize" (optional, int)
183 * is_from_start Is a live format that can be downloaded
184 from the start. Boolean
185 * preference Order number of this format. If this field is
186 present and not None, the formats get sorted
187 by this field, regardless of all other values.
188 -1 for default (order by other properties),
189 -2 or smaller for less than default.
190 < -1000 to hide the format (if there is
191 another one which is strictly better)
192 * language Language code, e.g. "de" or "en-US".
193 * language_preference Is this in the language mentioned in
194 the URL?
195 10 if it's what the URL is about,
196 -1 for default (don't know),
197 -10 otherwise, other values reserved for now.
198 * quality Order number of the video quality of this
199 format, irrespective of the file format.
200 -1 for default (order by other properties),
201 -2 or smaller for less than default.
202 * source_preference Order number for this video source
203 (quality takes higher priority)
204 -1 for default (order by other properties),
205 -2 or smaller for less than default.
206 * http_headers A dictionary of additional HTTP headers
207 to add to the request.
208 * stretched_ratio If given and not 1, indicates that the
209 video's pixels are not square.
210 width : height ratio as float.
211 * no_resume The server does not support resuming the
212 (HTTP or RTMP) download. Boolean.
213 * has_drm The format has DRM and cannot be downloaded. Boolean
214 * downloader_options A dictionary of downloader options
215 (For internal use only)
216 * http_chunk_size Chunk size for HTTP downloads
217 * ffmpeg_args Extra arguments for ffmpeg downloader
218 RTMP formats can also have the additional fields: page_url,
219 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
220 rtmp_protocol, rtmp_real_time
221
222 url: Final video URL.
223 ext: Video filename extension.
224 format: The video format, defaults to ext (used for --get-format)
225 player_url: SWF Player URL (used for rtmpdump).
226
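                 As a purely illustrative sketch (values are hypothetical and not
                 taken from any real site), a formats list offering one progressive
                 MP4 and one HLS rendition could look like:

                     'formats': [{
                         'format_id': 'http-480p',
                         'url': 'https://example.com/video-480.mp4',
                         'ext': 'mp4',
                         'width': 854,
                         'height': 480,
                         'vcodec': 'avc1.4d401e',
                         'acodec': 'mp4a.40.2',
                         'filesize': 12345678,
                     }, {
                         'format_id': 'hls-1080p',
                         'url': 'https://example.com/master-1080.m3u8',
                         'ext': 'mp4',
                         'protocol': 'm3u8_native',
                         'width': 1920,
                         'height': 1080,
                     }]
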
227 The following fields are optional:
228
229 direct: True if a direct video file was given (must only be set by GenericIE)
230 alt_title: A secondary title of the video.
231 display_id: An alternative identifier for the video, not necessarily
232 unique, but available before title. Typically, id is
233 something like "4234987", title "Dancing naked mole rats",
234 and display_id "dancing-naked-mole-rats"
235 thumbnails: A list of dictionaries, with the following entries:
236 * "id" (optional, string) - Thumbnail format ID
237 * "url"
238 * "preference" (optional, int) - quality of the image
239 * "width" (optional, int)
240 * "height" (optional, int)
241 * "resolution" (optional, string "{width}x{height}",
242 deprecated)
243 * "filesize" (optional, int)
244 * "http_headers" (dict) - HTTP headers for the request
245 thumbnail: Full URL to a video thumbnail image.
246 description: Full video description.
247 uploader: Full name of the video uploader.
248 license: License name the video is licensed under.
249 creator: The creator of the video.
250 timestamp: UNIX timestamp of the moment the video was uploaded
251 upload_date: Video upload date in UTC (YYYYMMDD).
252 If not explicitly set, calculated from timestamp
253 release_timestamp: UNIX timestamp of the moment the video was released.
254 If it is not clear whether to use timestamp or this, use the former
255 release_date: The date (YYYYMMDD) when the video was released in UTC.
256 If not explicitly set, calculated from release_timestamp
257 modified_timestamp: UNIX timestamp of the moment the video was last modified.
258 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
259 If not explicitly set, calculated from modified_timestamp
260 uploader_id: Nickname or id of the video uploader.
261 uploader_url: Full URL to a personal webpage of the video uploader.
262 channel: Full name of the channel the video is uploaded on.
263 Note that channel fields may or may not repeat uploader
264 fields. This depends on a particular extractor.
265 channel_id: Id of the channel.
266 channel_url: Full URL to a channel webpage.
267 channel_follower_count: Number of followers of the channel.
268 location: Physical location where the video was filmed.
269 subtitles: The available subtitles as a dictionary in the format
270 {tag: subformats}. "tag" is usually a language code, and
271 "subformats" is a list sorted from lower to higher
272 preference, each element is a dictionary with the "ext"
273 entry and one of:
274 * "data": The subtitles file contents
275 * "url": A URL pointing to the subtitles file
276 It can optionally also have:
277 * "name": Name or description of the subtitles
278 * "http_headers": A dictionary of additional HTTP headers
279 to add to the request.
280 "ext" will be calculated from URL if missing
281 automatic_captions: Like 'subtitles'; contains automatically generated
282 captions instead of normal subtitles
283 duration: Length of the video in seconds, as an integer or float.
284 view_count: How many users have watched the video on the platform.
285 like_count: Number of positive ratings of the video
286 dislike_count: Number of negative ratings of the video
287 repost_count: Number of reposts of the video
288 average_rating: Average rating given by users, the scale used depends on the webpage
289 comment_count: Number of comments on the video
290 comments: A list of comments, each with one or more of the following
291 properties (all but one of text or html optional):
292 * "author" - human-readable name of the comment author
293 * "author_id" - user ID of the comment author
294 * "author_thumbnail" - The thumbnail of the comment author
295 * "id" - Comment ID
296 * "html" - Comment as HTML
297 * "text" - Plain text of the comment
298 * "timestamp" - UNIX timestamp of comment
299 * "parent" - ID of the comment this one is replying to.
300 Set to "root" to indicate that this is a
301 comment to the original video.
302 * "like_count" - Number of positive ratings of the comment
303 * "dislike_count" - Number of negative ratings of the comment
304 * "is_favorited" - Whether the comment is marked as
305 favorite by the video uploader
306 * "author_is_uploader" - Whether the comment is made by
307 the video uploader
308 age_limit: Age restriction for the video, as an integer (years)
309 webpage_url: The URL to the video webpage; if given to yt-dlp it
310 should allow getting the same result again. (It will be set
311 by YoutubeDL if it's missing)
312 categories: A list of categories that the video falls in, for example
313 ["Sports", "Berlin"]
314 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
315 cast: A list of the video cast
316 is_live: True, False, or None (=unknown). Whether this video is a
317 live stream that goes on instead of a fixed-length video.
318 was_live: True, False, or None (=unknown). Whether this video was
319 originally a live stream.
320 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
321 If absent, automatically set from is_live, was_live
322 start_time: Time in seconds where the reproduction should start, as
323 specified in the URL.
324 end_time: Time in seconds where the reproduction should end, as
325 specified in the URL.
326 chapters: A list of dictionaries, with the following entries:
327 * "start_time" - The start time of the chapter in seconds
328 * "end_time" - The end time of the chapter in seconds
329 * "title" (optional, string)
330 playable_in_embed: Whether this video is allowed to play in embedded
331 players on other sites. Can be True (=always allowed),
332 False (=never allowed), None (=unknown), or a string
333 specifying the criteria for embedability (e.g. 'whitelist')
334 availability: Under what condition the video is available. One of
335 'private', 'premium_only', 'subscriber_only', 'needs_auth',
336 'unlisted' or 'public'. Use 'InfoExtractor._availability'
337 to set it
338 __post_extractor: A function to be called just before the metadata is
339 written to either disk, logger or console. The function
340 must return a dict which will be added to the info_dict.
341 This is useful for additional information that is
342 time-consuming to extract. Note that the fields thus
343 extracted will not be available to output template and
344 match_filter. So, only "comments" and "comment_count" are
345 currently allowed to be extracted via this method.
346
347 The following fields should only be used when the video belongs to some logical
348 chapter or section:
349
350 chapter: Name or title of the chapter the video belongs to.
351 chapter_number: Number of the chapter the video belongs to, as an integer.
352 chapter_id: Id of the chapter the video belongs to, as a unicode string.
353
354 The following fields should only be used when the video is an episode of some
355 series, programme or podcast:
356
357 series: Title of the series or programme the video episode belongs to.
358 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
359 season: Title of the season the video episode belongs to.
360 season_number: Number of the season the video episode belongs to, as an integer.
361 season_id: Id of the season the video episode belongs to, as a unicode string.
362 episode: Title of the video episode. Unlike mandatory video title field,
363 this field should denote the exact title of the video episode
364 without any kind of decoration.
365 episode_number: Number of the video episode within a season, as an integer.
366 episode_id: Id of the video episode, as a unicode string.
367
368 The following fields should only be used when the media is a track or a part of
369 a music album:
370
371 track: Title of the track.
372 track_number: Number of the track within an album or a disc, as an integer.
373 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
374 as a unicode string.
375 artist: Artist(s) of the track.
376 genre: Genre(s) of the track.
377 album: Title of the album the track belongs to.
378 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
379 album_artist: List of all artists that appeared on the album (e.g.
380 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
381 and compilations).
382 disc_number: Number of the disc or other physical medium the track belongs to,
383 as an integer.
384 release_year: Year (YYYY) when the album was released.
385 composer: Composer of the piece
386
387 Unless mentioned otherwise, the fields should be Unicode strings.
388
389 Unless mentioned otherwise, None is equivalent to absence of information.
390
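 Putting the pieces together, a minimal video result returned by
 _real_extract might look like this (all values hypothetical; the formats
 list is as sketched above):

     {
         'id': '4234987',
         'title': 'Dancing naked mole rats',
         'display_id': 'dancing-naked-mole-rats',
         'description': 'A short documentary about mole rats.',
         'duration': 123.4,
         'timestamp': 1577836800,
         'thumbnails': [{'url': 'https://example.com/thumb.jpg'}],
         'subtitles': {'en': [{'url': 'https://example.com/en.vtt', 'ext': 'vtt'}]},
         'formats': [...],
     }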
391
392 _type "playlist" indicates multiple videos.
393 There must be a key "entries", which is a list, an iterable, or a PagedList
394 object, each element of which is a valid dictionary by this specification.
395
396 Additionally, playlists can have "id", "title", and any other relevant
397 attributes with the same semantics as videos (see above).
398
399 It can also have the following optional fields:
400
401 playlist_count: The total number of videos in a playlist. If not given,
402 YoutubeDL tries to calculate it from "entries"
403
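 A sketch of a hypothetical playlist result (IDs, titles and the "Example"
 ie_key are illustrative only):

     {
         '_type': 'playlist',
         'id': 'PL1234',
         'title': 'Example playlist',
         'entries': [
             self.url_result('https://example.com/watch?v=abc', ie='Example'),
             self.url_result('https://example.com/watch?v=def', ie='Example'),
         ],
     }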
404
405 _type "multi_video" indicates that there are multiple videos that
406 form a single show, for example multiple acts of an opera or TV episode.
407 It must have an entries key like a playlist and contain all the keys
408 required for a video at the same time.
409
410
411 _type "url" indicates that the video must be extracted from another
412 location, possibly by a different extractor. Its only required key is:
413 "url" - the next URL to extract.
414 The key "ie_key" can be set to the class name (minus the trailing "IE",
415 e.g. "Youtube") if the extractor class is known in advance.
416 Additionally, the dictionary may have any properties of the resolved entity
417 known in advance, for example "title" if the title of the referred video is
418 known ahead of time.
419
420
421 _type "url_transparent" entities have the same specification as "url", but
422 indicate that the given additional information is more precise than the one
423 associated with the resolved URL.
424 This is useful when a site employs a video service that hosts the video and
425 its technical metadata, but that video service does not embed a useful
426 title, description etc.
427
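 For instance, an extractor for a page that embeds its videos on a separate
 hosting service might return something like this (all names hypothetical):

     {
         '_type': 'url_transparent',
         'url': 'https://hostingservice.example/embed/abc123',
         'ie_key': 'HostingService',
         'title': 'Accurate title taken from the embedding page',
         'description': 'More precise description from the embedding page',
     }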
428
429 Subclasses of this should define a _VALID_URL regexp and re-define the
430 _real_extract() and (optionally) _real_initialize() methods.
431 Probably, they should also be added to the list of extractors.
432
433 Subclasses may also override suitable() if necessary, but ensure the function
434 signature is preserved and that this function imports everything it needs
435 (except other extractors), so that lazy_extractors works correctly.
436
437 To support username + password (or netrc) login, the extractor must define a
438 _NETRC_MACHINE and re-define _perform_login(username, password) and
439 (optionally) _initialize_pre_login() methods. The _perform_login method will
440 be called between _initialize_pre_login and _real_initialize if credentials
441 are passed by the user. In cases where it is necessary to have the login
442 process as part of the extraction rather than initialization, _perform_login
443 can be left undefined.
444
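 A minimal sketch of login support in a subclass (site name, URL and form
 fields are illustrative; urlencode_postdata is from ..utils):

     class SomeSiteIE(InfoExtractor):
         _NETRC_MACHINE = 'somesite'

         def _perform_login(self, username, password):
             self._download_webpage(
                 'https://somesite.example/login', None, 'Logging in',
                 data=urlencode_postdata({'user': username, 'pass': password}))
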
445 _GEO_BYPASS attribute may be set to False in order to disable
446 geo restriction bypass mechanisms for a particular extractor.
447 It does not, however, disable explicit geo restriction bypass based on
448 the country code provided with geo_bypass_country.
449
450 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
451 countries for this extractor. One of these countries will be used by
452 the geo restriction bypass mechanism right away in order to bypass
453 geo restriction, provided the mechanism is not disabled.
454
455 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
456 IP blocks in CIDR notation for this extractor. One of these IP blocks
457 will be used by geo restriction bypass mechanism similarly
458 to _GEO_COUNTRIES.
459
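 For example, a hypothetical extractor for a broadcaster whose content is
 only viewable from Germany might declare:

     _GEO_COUNTRIES = ['DE']
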
460 The _WORKING attribute should be set to False for broken IEs
461 in order to warn the users and skip the tests.
462 """
463
464 _ready = False
465 _downloader = None
466 _x_forwarded_for_ip = None
467 _GEO_BYPASS = True
468 _GEO_COUNTRIES = None
469 _GEO_IP_BLOCKS = None
470 _WORKING = True
471 _NETRC_MACHINE = None
472 IE_DESC = None
473 SEARCH_KEY = None
474
475 def _login_hint(self, method=NO_DEFAULT, netrc=None):
476 password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
477 return {
478 None: '',
479 'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
480 'password': f'Use {password_hint}',
481 'cookies': (
482 'Use --cookies-from-browser or --cookies for the authentication. '
483 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
484 }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
485
486 def __init__(self, downloader=None):
487 """Constructor. Receives an optional downloader (a YoutubeDL instance).
488 If a downloader is not passed during initialization,
489 it must be set using "set_downloader()" before "extract()" is called"""
490 self._ready = False
491 self._x_forwarded_for_ip = None
492 self._printed_messages = set()
493 self.set_downloader(downloader)
494
495 @classmethod
496 def _match_valid_url(cls, url):
497 # This does not use has/getattr intentionally - we want to know whether
498 # we have cached the regexp for *this* class, whereas getattr would also
499 # match the superclass
500 if '_VALID_URL_RE' not in cls.__dict__:
501 if '_VALID_URL' not in cls.__dict__:
502 cls._VALID_URL = cls._make_valid_url()
503 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
504 return cls._VALID_URL_RE.match(url)
505
506 @classmethod
507 def suitable(cls, url):
508 """Receives a URL and returns True if suitable for this IE."""
509 # This function must import everything it needs (except other extractors),
510 # so that lazy_extractors works correctly
511 return cls._match_valid_url(url) is not None
512
513 @classmethod
514 def _match_id(cls, url):
515 return cls._match_valid_url(url).group('id')
516
517 @classmethod
518 def get_temp_id(cls, url):
519 try:
520 return cls._match_id(url)
521 except (IndexError, AttributeError):
522 return None
523
524 @classmethod
525 def working(cls):
526 """Getter method for _WORKING."""
527 return cls._WORKING
528
529 @classmethod
530 def supports_login(cls):
531 return bool(cls._NETRC_MACHINE)
532
533 def initialize(self):
534 """Initializes an instance (authentication, etc)."""
535 self._printed_messages = set()
536 self._initialize_geo_bypass({
537 'countries': self._GEO_COUNTRIES,
538 'ip_blocks': self._GEO_IP_BLOCKS,
539 })
540 if not self._ready:
541 self._initialize_pre_login()
542 if self.supports_login():
543 username, password = self._get_login_info()
544 if username:
545 self._perform_login(username, password)
546 elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
547 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
548 self._real_initialize()
549 self._ready = True
550
551 def _initialize_geo_bypass(self, geo_bypass_context):
552 """
553 Initialize geo restriction bypass mechanism.
554
555 This method is used to initialize the geo bypass mechanism based on faking
556 the X-Forwarded-For HTTP header. A random country from the provided country
557 list is selected and a random IP belonging to this country is generated. This
558 IP will be passed as the X-Forwarded-For HTTP header in all subsequent
559 HTTP requests.
560
561 This method will be used for the initial geo bypass mechanism initialization
562 during instance initialization with _GEO_COUNTRIES and
563 _GEO_IP_BLOCKS.
564
565 You may also call it manually from an extractor's code if geo bypass
566 information is not available beforehand (e.g. it is obtained during
567 extraction) or for some other reason. In this case you should pass
568 this information in the geo bypass context passed as the first argument.
569 It may contain the following fields:
570
571 countries: List of geo unrestricted countries (similar
572 to _GEO_COUNTRIES)
573 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
574 (similar to _GEO_IP_BLOCKS)
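
 For example, an extractor that only learns the allowed countries during
 extraction might call (country codes purely illustrative):

     self._initialize_geo_bypass({
         'countries': ['DE', 'AT', 'CH'],
     })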
575
576 """
577 if not self._x_forwarded_for_ip:
578
579 # Geo bypass mechanism is explicitly disabled by user
580 if not self.get_param('geo_bypass', True):
581 return
582
583 if not geo_bypass_context:
584 geo_bypass_context = {}
585
586 # Backward compatibility: previously _initialize_geo_bypass
587 # expected a list of countries, some 3rd party code may still use
588 # it this way
589 if isinstance(geo_bypass_context, (list, tuple)):
590 geo_bypass_context = {
591 'countries': geo_bypass_context,
592 }
593
594 # The whole point of the geo bypass mechanism is to fake the IP
595 # sent as the X-Forwarded-For HTTP header based on some IP block or
596 # country code.
597
598 # Path 1: bypassing based on IP block in CIDR notation
599
600 # Explicit IP block specified by user, use it right away
601 # regardless of whether extractor is geo bypassable or not
602 ip_block = self.get_param('geo_bypass_ip_block', None)
603
604 # Otherwise use random IP block from geo bypass context but only
605 # if extractor is known as geo bypassable
606 if not ip_block:
607 ip_blocks = geo_bypass_context.get('ip_blocks')
608 if self._GEO_BYPASS and ip_blocks:
609 ip_block = random.choice(ip_blocks)
610
611 if ip_block:
612 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
613 self._downloader.write_debug(
614 'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
615 return
616
617 # Path 2: bypassing based on country code
618
619 # Explicit country code specified by user, use it right away
620 # regardless of whether extractor is geo bypassable or not
621 country = self.get_param('geo_bypass_country', None)
622
623 # Otherwise use random country code from geo bypass context but
624 # only if extractor is known as geo bypassable
625 if not country:
626 countries = geo_bypass_context.get('countries')
627 if self._GEO_BYPASS and countries:
628 country = random.choice(countries)
629
630 if country:
631 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
632 self._downloader.write_debug(
633 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
634
635 def extract(self, url):
636 """Extracts URL information and returns it in list of dicts."""
637 try:
638 for _ in range(2):
639 try:
640 self.initialize()
641 self.write_debug('Extracting URL: %s' % url)
642 ie_result = self._real_extract(url)
643 if ie_result is None:
644 return None
645 if self._x_forwarded_for_ip:
646 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
647 subtitles = ie_result.get('subtitles')
648 if (subtitles and 'live_chat' in subtitles
649 and 'no-live-chat' in self.get_param('compat_opts', [])):
650 del subtitles['live_chat']
651 return ie_result
652 except GeoRestrictedError as e:
653 if self.__maybe_fake_ip_and_retry(e.countries):
654 continue
655 raise
656 except UnsupportedError:
657 raise
658 except ExtractorError as e:
659 kwargs = {
660 'video_id': e.video_id or self.get_temp_id(url),
661 'ie': self.IE_NAME,
662 'tb': e.traceback or sys.exc_info()[2],
663 'expected': e.expected,
664 'cause': e.cause
665 }
666 if hasattr(e, 'countries'):
667 kwargs['countries'] = e.countries
668 raise type(e)(e.orig_msg, **kwargs)
669 except compat_http_client.IncompleteRead as e:
670 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
671 except (KeyError, StopIteration) as e:
672 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
673
674 def __maybe_fake_ip_and_retry(self, countries):
675 if (not self.get_param('geo_bypass_country', None)
676 and self._GEO_BYPASS
677 and self.get_param('geo_bypass', True)
678 and not self._x_forwarded_for_ip
679 and countries):
680 country_code = random.choice(countries)
681 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
682 if self._x_forwarded_for_ip:
683 self.report_warning(
684 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
685 % (self._x_forwarded_for_ip, country_code.upper()))
686 return True
687 return False
688
689 def set_downloader(self, downloader):
690 """Sets a YoutubeDL instance as the downloader for this IE."""
691 self._downloader = downloader
692
693 def _initialize_pre_login(self):
694 """ Intialization before login. Redefine in subclasses."""
695 pass
696
697 def _perform_login(self, username, password):
698 """ Login with username and password. Redefine in subclasses."""
699 pass
700
701 def _real_initialize(self):
702 """Real initialization process. Redefine in subclasses."""
703 pass
704
705 def _real_extract(self, url):
706 """Real extraction process. Redefine in subclasses."""
707 raise NotImplementedError('This method must be implemented by subclasses')
708
709 @classmethod
710 def ie_key(cls):
711 """A string for getting the InfoExtractor with get_info_extractor"""
712 return cls.__name__[:-2]
713
714 @classproperty
715 def IE_NAME(cls):
716 return cls.__name__[:-2]
717
718 @staticmethod
719 def __can_accept_status_code(err, expected_status):
720 assert isinstance(err, compat_urllib_error.HTTPError)
721 if expected_status is None:
722 return False
723 elif callable(expected_status):
724 return expected_status(err.code) is True
725 else:
726 return err.code in variadic(expected_status)
727
728 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
729 """
730 Return the response handle.
731
732 See _download_webpage docstring for arguments specification.
733 """
734 if not self._downloader._first_webpage_request:
735 sleep_interval = self.get_param('sleep_interval_requests') or 0
736 if sleep_interval > 0:
737 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
738 time.sleep(sleep_interval)
739 else:
740 self._downloader._first_webpage_request = False
741
742 if note is None:
743 self.report_download_webpage(video_id)
744 elif note is not False:
745 if video_id is None:
746 self.to_screen(str(note))
747 else:
748 self.to_screen(f'{video_id}: {note}')
749
750 # Some sites check the X-Forwarded-For HTTP header in order to figure out
751 # the origin of the client behind a proxy. This allows bypassing geo
752 # restriction by faking this header's value to an IP that belongs to some
753 # geo unrestricted country. We will do so once we encounter any
754 # geo restriction error.
755 if self._x_forwarded_for_ip:
756 if 'X-Forwarded-For' not in headers:
757 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
758
759 if isinstance(url_or_request, compat_urllib_request.Request):
760 url_or_request = update_Request(
761 url_or_request, data=data, headers=headers, query=query)
762 else:
763 if query:
764 url_or_request = update_url_query(url_or_request, query)
765 if data is not None or headers:
766 url_or_request = sanitized_Request(url_or_request, data, headers)
767 try:
768 return self._downloader.urlopen(url_or_request)
769 except network_exceptions as err:
770 if isinstance(err, compat_urllib_error.HTTPError):
771 if self.__can_accept_status_code(err, expected_status):
772 # Retain reference to error to prevent file object from
773 # being closed before it can be read. Works around the
774 # effects of <https://bugs.python.org/issue15002>
775 # introduced in Python 3.4.1.
776 err.fp._error = err
777 return err.fp
778
779 if errnote is False:
780 return False
781 if errnote is None:
782 errnote = 'Unable to download webpage'
783
784 errmsg = f'{errnote}: {error_to_compat_str(err)}'
785 if fatal:
786 raise ExtractorError(errmsg, cause=err)
787 else:
788 self.report_warning(errmsg)
789 return False
790
791 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
792 """
793 Return a tuple (page content as string, URL handle).
794
795 See _download_webpage docstring for arguments specification.
796 """
797 # Strip hashes from the URL (#1038)
798 if isinstance(url_or_request, (compat_str, str)):
799 url_or_request = url_or_request.partition('#')[0]
800
801 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
802 if urlh is False:
803 assert not fatal
804 return False
805 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
806 return (content, urlh)
807
808 @staticmethod
809 def _guess_encoding_from_content(content_type, webpage_bytes):
810 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
811 if m:
812 encoding = m.group(1)
813 else:
814 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
815 webpage_bytes[:1024])
816 if m:
817 encoding = m.group(1).decode('ascii')
818 elif webpage_bytes.startswith(b'\xff\xfe'):
819 encoding = 'utf-16'
820 else:
821 encoding = 'utf-8'
822
823 return encoding
824
825 def __check_blocked(self, content):
826 first_block = content[:512]
827 if ('<title>Access to this site is blocked</title>' in content
828 and 'Websense' in first_block):
829 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
830 blocked_iframe = self._html_search_regex(
831 r'<iframe src="([^"]+)"', content,
832 'Websense information URL', default=None)
833 if blocked_iframe:
834 msg += ' Visit %s for more details' % blocked_iframe
835 raise ExtractorError(msg, expected=True)
836 if '<title>The URL you requested has been blocked</title>' in first_block:
837 msg = (
838 'Access to this webpage has been blocked by Indian censorship. '
839 'Use a VPN or proxy server (with --proxy) to route around it.')
840 block_msg = self._html_search_regex(
841 r'</h1><p>(.*?)</p>',
842 content, 'block message', default=None)
843 if block_msg:
844 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
845 raise ExtractorError(msg, expected=True)
846 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
847 and 'blocklist.rkn.gov.ru' in content):
848 raise ExtractorError(
849 'Access to this webpage has been blocked by decision of the Russian government. '
850 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
851 expected=True)
852
853 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
854 content_type = urlh.headers.get('Content-Type', '')
855 webpage_bytes = urlh.read()
856 if prefix is not None:
857 webpage_bytes = prefix + webpage_bytes
858 if not encoding:
859 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
860 if self.get_param('dump_intermediate_pages', False):
861 self.to_screen('Dumping request to ' + urlh.geturl())
862 dump = base64.b64encode(webpage_bytes).decode('ascii')
863 self._downloader.to_screen(dump)
864 if self.get_param('write_pages', False):
865 basen = f'{video_id}_{urlh.geturl()}'
866 trim_length = self.get_param('trim_file_name') or 240
867 if len(basen) > trim_length:
868 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
869 basen = basen[:trim_length - len(h)] + h
870 raw_filename = basen + '.dump'
871 filename = sanitize_filename(raw_filename, restricted=True)
872 self.to_screen('Saving request to ' + filename)
873 # Working around MAX_PATH limitation on Windows (see
874 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
875 if compat_os_name == 'nt':
876 absfilepath = os.path.abspath(filename)
877 if len(absfilepath) > 259:
878 filename = '\\\\?\\' + absfilepath
879 with open(filename, 'wb') as outf:
880 outf.write(webpage_bytes)
881
882 try:
883 content = webpage_bytes.decode(encoding, 'replace')
884 except LookupError:
885 content = webpage_bytes.decode('utf-8', 'replace')
886
887 self.__check_blocked(content)
888
889 return content
890
891 def _download_webpage(
892 self, url_or_request, video_id, note=None, errnote=None,
893 fatal=True, tries=1, timeout=5, encoding=None, data=None,
894 headers={}, query={}, expected_status=None):
895 """
896 Return the data of the page as a string.
897
898 Arguments:
899 url_or_request -- plain text URL as a string or
900 a compat_urllib_request.Request object
901 video_id -- Video/playlist/item identifier (string)
902
903 Keyword arguments:
904 note -- note printed before downloading (string)
905 errnote -- note printed in case of an error (string)
906 fatal -- flag denoting whether error should be considered fatal,
907 i.e. whether it should cause ExtractorError to be raised,
908 otherwise a warning will be reported and extraction continued
909 tries -- number of tries
910 timeout -- sleep interval between tries
911 encoding -- encoding for a page content decoding, guessed automatically
912 when not explicitly specified
913 data -- POST data (bytes)
914 headers -- HTTP headers (dict)
915 query -- URL query (dict)
916 expected_status -- allows accepting failed HTTP requests (non-2xx
917 status code) by explicitly specifying a set of accepted status
918 codes. Can be any of the following entities:
919 - an integer type specifying an exact failed status code to
920 accept
921 - a list or a tuple of integer types specifying a list of
922 failed status codes to accept
923 - a callable accepting an actual failed status code and
924 returning True if it should be accepted
925 Note that this argument does not affect success status codes (2xx)
926 which are always accepted.
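
 For example (hypothetical URL), to treat a 404 response as a valid page
 instead of a fatal error:

     webpage = self._download_webpage(
         'https://example.com/maybe-missing', video_id,
         expected_status=404)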
927 """
928
929 success = False
930 try_count = 0
931 while success is False:
932 try:
933 res = self._download_webpage_handle(
934 url_or_request, video_id, note, errnote, fatal,
935 encoding=encoding, data=data, headers=headers, query=query,
936 expected_status=expected_status)
937 success = True
938 except compat_http_client.IncompleteRead as e:
939 try_count += 1
940 if try_count >= tries:
941 raise e
942 self._sleep(timeout, video_id)
943 if res is False:
944 return res
945 else:
946 content, _ = res
947 return content
948
949 def _download_xml_handle(
950 self, url_or_request, video_id, note='Downloading XML',
951 errnote='Unable to download XML', transform_source=None,
952 fatal=True, encoding=None, data=None, headers={}, query={},
953 expected_status=None):
954 """
955 Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
956
957 See _download_webpage docstring for arguments specification.
958 """
959 res = self._download_webpage_handle(
960 url_or_request, video_id, note, errnote, fatal=fatal,
961 encoding=encoding, data=data, headers=headers, query=query,
962 expected_status=expected_status)
963 if res is False:
964 return res
965 xml_string, urlh = res
966 return self._parse_xml(
967 xml_string, video_id, transform_source=transform_source,
968 fatal=fatal), urlh
969
970 def _download_xml(
971 self, url_or_request, video_id,
972 note='Downloading XML', errnote='Unable to download XML',
973 transform_source=None, fatal=True, encoding=None,
974 data=None, headers={}, query={}, expected_status=None):
975 """
976 Return the xml as an xml.etree.ElementTree.Element.
977
978 See _download_webpage docstring for arguments specification.
979 """
980 res = self._download_xml_handle(
981 url_or_request, video_id, note=note, errnote=errnote,
982 transform_source=transform_source, fatal=fatal, encoding=encoding,
983 data=data, headers=headers, query=query,
984 expected_status=expected_status)
985 return res if res is False else res[0]
986
987 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
988 if transform_source:
989 xml_string = transform_source(xml_string)
990 try:
991 return compat_etree_fromstring(xml_string.encode('utf-8'))
992 except xml.etree.ElementTree.ParseError as ve:
993 errmsg = '%s: Failed to parse XML ' % video_id
994 if fatal:
995 raise ExtractorError(errmsg, cause=ve)
996 else:
997 self.report_warning(errmsg + str(ve))
998
999 def _download_json_handle(
1000 self, url_or_request, video_id, note='Downloading JSON metadata',
1001 errnote='Unable to download JSON metadata', transform_source=None,
1002 fatal=True, encoding=None, data=None, headers={}, query={},
1003 expected_status=None):
1004 """
1005 Return a tuple (JSON object, URL handle).
1006
1007 See _download_webpage docstring for arguments specification.
1008 """
1009 res = self._download_webpage_handle(
1010 url_or_request, video_id, note, errnote, fatal=fatal,
1011 encoding=encoding, data=data, headers=headers, query=query,
1012 expected_status=expected_status)
1013 if res is False:
1014 return res
1015 json_string, urlh = res
1016 return self._parse_json(
1017 json_string, video_id, transform_source=transform_source,
1018 fatal=fatal), urlh
1019
1020 def _download_json(
1021 self, url_or_request, video_id, note='Downloading JSON metadata',
1022 errnote='Unable to download JSON metadata', transform_source=None,
1023 fatal=True, encoding=None, data=None, headers={}, query={},
1024 expected_status=None):
1025 """
1026 Return the JSON object as a dict.
1027
1028 See _download_webpage docstring for arguments specification.
1029 """
1030 res = self._download_json_handle(
1031 url_or_request, video_id, note=note, errnote=errnote,
1032 transform_source=transform_source, fatal=fatal, encoding=encoding,
1033 data=data, headers=headers, query=query,
1034 expected_status=expected_status)
1035 return res if res is False else res[0]
1036
1037 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1038 if transform_source:
1039 json_string = transform_source(json_string)
1040 try:
1041 return json.loads(json_string, strict=False)
1042 except ValueError as ve:
1043 errmsg = '%s: Failed to parse JSON ' % video_id
1044 if fatal:
1045 raise ExtractorError(errmsg, cause=ve)
1046 else:
1047 self.report_warning(errmsg + str(ve))
1048
1049 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1050 return self._parse_json(
1051 data[data.find('{'):data.rfind('}') + 1],
1052 video_id, transform_source, fatal)
1053
1054 def _download_socket_json_handle(
1055 self, url_or_request, video_id, note='Polling socket',
1056 errnote='Unable to poll socket', transform_source=None,
1057 fatal=True, encoding=None, data=None, headers={}, query={},
1058 expected_status=None):
1059 """
1060 Return a tuple (JSON object, URL handle).
1061
1062 See _download_webpage docstring for arguments specification.
1063 """
1064 res = self._download_webpage_handle(
1065 url_or_request, video_id, note, errnote, fatal=fatal,
1066 encoding=encoding, data=data, headers=headers, query=query,
1067 expected_status=expected_status)
1068 if res is False:
1069 return res
1070 webpage, urlh = res
1071 return self._parse_socket_response_as_json(
1072 webpage, video_id, transform_source=transform_source,
1073 fatal=fatal), urlh
1074
1075 def _download_socket_json(
1076 self, url_or_request, video_id, note='Polling socket',
1077 errnote='Unable to poll socket', transform_source=None,
1078 fatal=True, encoding=None, data=None, headers={}, query={},
1079 expected_status=None):
1080 """
1081 Return the JSON object as a dict.
1082
1083 See _download_webpage docstring for arguments specification.
1084 """
1085 res = self._download_socket_json_handle(
1086 url_or_request, video_id, note=note, errnote=errnote,
1087 transform_source=transform_source, fatal=fatal, encoding=encoding,
1088 data=data, headers=headers, query=query,
1089 expected_status=expected_status)
1090 return res if res is False else res[0]
1091
1092 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1093 idstr = format_field(video_id, template='%s: ')
1094 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1095 if only_once:
1096 if f'WARNING: {msg}' in self._printed_messages:
1097 return
1098 self._printed_messages.add(f'WARNING: {msg}')
1099 self._downloader.report_warning(msg, *args, **kwargs)
1100
1101 def to_screen(self, msg, *args, **kwargs):
1102 """Print msg to screen, prefixing it with '[ie_name]'"""
1103 self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1104
1105 def write_debug(self, msg, *args, **kwargs):
1106 self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1107
1108 def get_param(self, name, default=None, *args, **kwargs):
1109 if self._downloader:
1110 return self._downloader.params.get(name, default, *args, **kwargs)
1111 return default
1112
1113 def report_drm(self, video_id, partial=False):
1114 self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1115
1116 def report_extraction(self, id_or_name):
1117 """Report information extraction."""
1118 self.to_screen('%s: Extracting information' % id_or_name)
1119
1120 def report_download_webpage(self, video_id):
1121 """Report webpage download."""
1122 self.to_screen('%s: Downloading webpage' % video_id)
1123
1124 def report_age_confirmation(self):
1125 """Report attempt to confirm age."""
1126 self.to_screen('Confirming age')
1127
1128 def report_login(self):
1129 """Report attempt to log in."""
1130 self.to_screen('Logging in')
1131
1132 def raise_login_required(
1133 self, msg='This video is only available for registered users',
1134 metadata_available=False, method=NO_DEFAULT):
1135 if metadata_available and (
1136 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1137 self.report_warning(msg)
1138 return
1139 msg += format_field(self._login_hint(method), template='. %s')
1140 raise ExtractorError(msg, expected=True)
1141
1142 def raise_geo_restricted(
1143 self, msg='This video is not available from your location due to geo restriction',
1144 countries=None, metadata_available=False):
1145 if metadata_available and (
1146 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1147 self.report_warning(msg)
1148 else:
1149 raise GeoRestrictedError(msg, countries=countries)
1150
1151 def raise_no_formats(self, msg, expected=False, video_id=None):
1152 if expected and (
1153 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1154 self.report_warning(msg, video_id)
1155 elif isinstance(msg, ExtractorError):
1156 raise msg
1157 else:
1158 raise ExtractorError(msg, expected=expected, video_id=video_id)
1159
1160 # Methods for following #608
1161 @staticmethod
1162 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1163 """Returns a URL that points to a page that should be processed"""
1164 if ie is not None:
1165 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1166 if video_id is not None:
1167 kwargs['id'] = video_id
1168 if video_title is not None:
1169 kwargs['title'] = video_title
1170 return {
1171 **kwargs,
1172 '_type': 'url_transparent' if url_transparent else 'url',
1173 'url': url,
1174 }
1175
1176 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1177 urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1178 for m in orderedSet(map(getter, matches) if getter else matches))
1179 return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1180
1181 @staticmethod
1182 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1183 """Returns a playlist"""
1184 if playlist_id:
1185 kwargs['id'] = playlist_id
1186 if playlist_title:
1187 kwargs['title'] = playlist_title
1188 if playlist_description is not None:
1189 kwargs['description'] = playlist_description
1190 return {
1191 **kwargs,
1192 '_type': 'multi_video' if multi_video else 'playlist',
1193 'entries': entries,
1194 }
1195
1196 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1197 """
1198 Perform a regex search on the given string, using a single pattern or a
1199 list of patterns, and return the first matching group.
1200 In case of failure, return a default value, or issue a WARNING or raise a
1201 RegexNotFoundError, depending on fatal, specifying the field name.
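
 Example (pattern and field name are illustrative):

     video_id = self._search_regex(
         r'data-video-id="([0-9]+)"', webpage, 'video id', default=None)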
1202 """
1203 if string is None:
1204 mobj = None
1205 elif isinstance(pattern, (str, re.Pattern)):
1206 mobj = re.search(pattern, string, flags)
1207 else:
1208 for p in pattern:
1209 mobj = re.search(p, string, flags)
1210 if mobj:
1211 break
1212
1213 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1214
1215 if mobj:
1216 if group is None:
1217 # return the first matching group
1218 return next(g for g in mobj.groups() if g is not None)
1219 elif isinstance(group, (list, tuple)):
1220 return tuple(mobj.group(g) for g in group)
1221 else:
1222 return mobj.group(group)
1223 elif default is not NO_DEFAULT:
1224 return default
1225 elif fatal:
1226 raise RegexNotFoundError('Unable to extract %s' % _name)
1227 else:
1228 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1229 return None
1230
1231 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1232 """
1233 Like _search_regex, but strips HTML tags and unescapes entities.
1234 """
1235 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1236 if res:
1237 return clean_html(res).strip()
1238 else:
1239 return res
1240
1241 def _get_netrc_login_info(self, netrc_machine=None):
1242 username = None
1243 password = None
1244 netrc_machine = netrc_machine or self._NETRC_MACHINE
1245
1246 if self.get_param('usenetrc', False):
1247 try:
1248 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1249 if os.path.isdir(netrc_file):
1250 netrc_file = os.path.join(netrc_file, '.netrc')
1251 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1252 if info is not None:
1253 username = info[0]
1254 password = info[2]
1255 else:
1256 raise netrc.NetrcParseError(
1257 'No authenticators for %s' % netrc_machine)
1258 except (OSError, netrc.NetrcParseError) as err:
1259 self.report_warning(
1260 'parsing .netrc: %s' % error_to_compat_str(err))
1261
1262 return username, password
1263
1264 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1265 """
1266 Get the login info as (username, password)
1267 First look for the manually specified credentials using username_option
1268 and password_option as keys in the params dictionary. If no such credentials
1269 are available, look in the netrc file using the netrc_machine or _NETRC_MACHINE
1270 value.
1271 If there's no info available, return (None, None)
1272 """
1273
1274 # Attempt to use provided username and password or .netrc data
1275 username = self.get_param(username_option)
1276 if username is not None:
1277 password = self.get_param(password_option)
1278 else:
1279 username, password = self._get_netrc_login_info(netrc_machine)
1280
1281 return username, password
1282
1283 def _get_tfa_info(self, note='two-factor verification code'):
1284 """
1285 Get the two-factor authentication info
1286 TODO - asking the user will be required for sms/phone verify
1287 currently just uses the command line option
1288 If there's no info available, return None
1289 """
1290
1291 tfa = self.get_param('twofactor')
1292 if tfa is not None:
1293 return tfa
1294
1295 return compat_getpass('Type %s and press [Return]: ' % note)
1296
1297 # Helper functions for extracting OpenGraph info
1298 @staticmethod
1299 def _og_regexes(prop):
1300 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1301 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1302 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1303 template = r'<meta[^>]+?%s[^>]+?%s'
1304 return [
1305 template % (property_re, content_re),
1306 template % (content_re, property_re),
1307 ]
1308
1309 @staticmethod
1310 def _meta_regex(prop):
1311 return r'''(?isx)<meta
1312 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1313 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1314
1315 def _og_search_property(self, prop, html, name=None, **kargs):
1316 prop = variadic(prop)
1317 if name is None:
1318 name = 'OpenGraph %s' % prop[0]
1319 og_regexes = []
1320 for p in prop:
1321 og_regexes.extend(self._og_regexes(p))
1322 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1323 if escaped is None:
1324 return None
1325 return unescapeHTML(escaped)
1326
1327 def _og_search_thumbnail(self, html, **kargs):
1328 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1329
1330 def _og_search_description(self, html, **kargs):
1331 return self._og_search_property('description', html, fatal=False, **kargs)
1332
1333 def _og_search_title(self, html, *, fatal=False, **kargs):
1334 return self._og_search_property('title', html, fatal=fatal, **kargs)
1335
1336 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1337 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1338 if secure:
1339 regexes = self._og_regexes('video:secure_url') + regexes
1340 return self._html_search_regex(regexes, html, name, **kargs)
1341
1342 def _og_search_url(self, html, **kargs):
1343 return self._og_search_property('url', html, **kargs)
1344
1345 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1346 return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1347
1348 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1349 name = variadic(name)
1350 if display_name is None:
1351 display_name = name[0]
1352 return self._html_search_regex(
1353 [self._meta_regex(n) for n in name],
1354 html, display_name, fatal=fatal, group='content', **kwargs)
1355
1356 def _dc_search_uploader(self, html):
1357 return self._html_search_meta('dc.creator', html, 'uploader')
1358
1359 def _rta_search(self, html):
1360 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1361 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1362 r' content="RTA-5042-1996-1400-1577-RTA"',
1363 html):
1364 return 18
1365 return 0
1366
1367 def _media_rating_search(self, html):
1368 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1369 rating = self._html_search_meta('rating', html)
1370
1371 if not rating:
1372 return None
1373
1374 RATING_TABLE = {
1375 'safe for kids': 0,
1376 'general': 8,
1377 '14 years': 14,
1378 'mature': 17,
1379 'restricted': 19,
1380 }
1381 return RATING_TABLE.get(rating.lower())
1382
1383 def _family_friendly_search(self, html):
1384 # See http://schema.org/VideoObject
1385 family_friendly = self._html_search_meta(
1386 'isFamilyFriendly', html, default=None)
1387
1388 if not family_friendly:
1389 return None
1390
1391 RATING_TABLE = {
1392 '1': 0,
1393 'true': 0,
1394 '0': 18,
1395 'false': 18,
1396 }
1397 return RATING_TABLE.get(family_friendly.lower())
1398
1399 def _twitter_search_player(self, html):
1400 return self._html_search_meta('twitter:player', html,
1401 'twitter card player')
1402
1403 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1404 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1405 default = kwargs.get('default', NO_DEFAULT)
1406 # JSON-LD may be malformed and thus `fatal` should be respected.
1407 # At the same time `default` may be passed that assumes `fatal=False`
1408 # for _search_regex. Let's simulate the same behavior here as well.
1409 fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1410 json_ld = []
1411 for mobj in json_ld_list:
1412 json_ld_item = self._parse_json(
1413 mobj.group('json_ld'), video_id, fatal=fatal)
1414 if not json_ld_item:
1415 continue
1416 if isinstance(json_ld_item, dict):
1417 json_ld.append(json_ld_item)
1418 elif isinstance(json_ld_item, (list, tuple)):
1419 json_ld.extend(json_ld_item)
1420 if json_ld:
1421 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1422 if json_ld:
1423 return json_ld
1424 if default is not NO_DEFAULT:
1425 return default
1426 elif fatal:
1427 raise RegexNotFoundError('Unable to extract JSON-LD')
1428 else:
1429 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1430 return {}
1431
1432 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1433 if isinstance(json_ld, compat_str):
1434 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1435 if not json_ld:
1436 return {}
1437 info = {}
1438 if not isinstance(json_ld, (list, tuple, dict)):
1439 return info
1440 if isinstance(json_ld, dict):
1441 json_ld = [json_ld]
1442
1443 INTERACTION_TYPE_MAP = {
1444 'CommentAction': 'comment',
1445 'AgreeAction': 'like',
1446 'DisagreeAction': 'dislike',
1447 'LikeAction': 'like',
1448 'DislikeAction': 'dislike',
1449 'ListenAction': 'view',
1450 'WatchAction': 'view',
1451 'ViewAction': 'view',
1452 }
1453
1454 def extract_interaction_type(e):
1455 interaction_type = e.get('interactionType')
1456 if isinstance(interaction_type, dict):
1457 interaction_type = interaction_type.get('@type')
1458 return str_or_none(interaction_type)
1459
1460 def extract_interaction_statistic(e):
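# Illustrative 'interactionStatistic' entry handled here (the values are
# made-up examples):
#   {"@type": "InteractionCounter",
#    "interactionType": "https://schema.org/WatchAction",
#    "userInteractionCount": "12,345"}   ->  info['view_count'] = 12345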
1461 interaction_statistic = e.get('interactionStatistic')
1462 if isinstance(interaction_statistic, dict):
1463 interaction_statistic = [interaction_statistic]
1464 if not isinstance(interaction_statistic, list):
1465 return
1466 for is_e in interaction_statistic:
1467 if not isinstance(is_e, dict):
1468 continue
1469 if is_e.get('@type') != 'InteractionCounter':
1470 continue
1471 interaction_type = extract_interaction_type(is_e)
1472 if not interaction_type:
1473 continue
1474 # For the interaction count some sites provide a string instead of
1475 # an integer (as required by the spec), with non-digit characters
1476 # (e.g. ","), so extract the count with the more lenient str_to_int
1477 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1478 if interaction_count is None:
1479 continue
1480 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1481 if not count_kind:
1482 continue
1483 count_key = '%s_count' % count_kind
1484 if info.get(count_key) is not None:
1485 continue
1486 info[count_key] = interaction_count
1487
1488 def extract_chapter_information(e):
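# Illustrative 'hasPart' value handled here (titles and offsets are made-up):
#   "hasPart": [{"@type": "Clip", "name": "Intro", "startOffset": 0, "endOffset": 30},
#               {"@type": "Clip", "name": "Main", "startOffset": 30, "endOffset": 300}]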
1489 chapters = [{
1490 'title': part.get('name'),
1491 'start_time': part.get('startOffset'),
1492 'end_time': part.get('endOffset'),
1493 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1494 for idx, (last_c, current_c, next_c) in enumerate(zip(
1495 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1496 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1497 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1498 if None in current_c.values():
1499 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1500 return
1501 if chapters:
1502 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1503 info['chapters'] = chapters
1504
1505 def extract_video_object(e):
1506 assert e['@type'] == 'VideoObject'
1507 author = e.get('author')
1508 info.update({
1509 'url': url_or_none(e.get('contentUrl')),
1510 'title': unescapeHTML(e.get('name')),
1511 'description': unescapeHTML(e.get('description')),
1512 'thumbnails': [{'url': url_or_none(url)}
1513 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1514 'duration': parse_duration(e.get('duration')),
1515 'timestamp': unified_timestamp(e.get('uploadDate')),
1516 # author can be an instance of the 'Organization' or 'Person' types;
1517 # both can have a 'name' property (inherited from the 'Thing' type). [1]
1518 # However, some websites use a plain 'Text' type instead.
1519 # 1. https://schema.org/VideoObject
1520 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1521 'filesize': float_or_none(e.get('contentSize')),
1522 'tbr': int_or_none(e.get('bitrate')),
1523 'width': int_or_none(e.get('width')),
1524 'height': int_or_none(e.get('height')),
1525 'view_count': int_or_none(e.get('interactionCount')),
1526 })
1527 extract_interaction_statistic(e)
1528 extract_chapter_information(e)
1529
1530 def traverse_json_ld(json_ld, at_top_level=True):
1531 for e in json_ld:
1532 if at_top_level and '@context' not in e:
1533 continue
1534 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1535 traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1536 break
1537 item_type = e.get('@type')
1538 if expected_type is not None and expected_type != item_type:
1539 continue
1540 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1541 if rating is not None:
1542 info['average_rating'] = rating
1543 if item_type in ('TVEpisode', 'Episode'):
1544 episode_name = unescapeHTML(e.get('name'))
1545 info.update({
1546 'episode': episode_name,
1547 'episode_number': int_or_none(e.get('episodeNumber')),
1548 'description': unescapeHTML(e.get('description')),
1549 })
1550 if not info.get('title') and episode_name:
1551 info['title'] = episode_name
1552 part_of_season = e.get('partOfSeason')
1553 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1554 info.update({
1555 'season': unescapeHTML(part_of_season.get('name')),
1556 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1557 })
1558 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1559 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1560 info['series'] = unescapeHTML(part_of_series.get('name'))
1561 elif item_type == 'Movie':
1562 info.update({
1563 'title': unescapeHTML(e.get('name')),
1564 'description': unescapeHTML(e.get('description')),
1565 'duration': parse_duration(e.get('duration')),
1566 'timestamp': unified_timestamp(e.get('dateCreated')),
1567 })
1568 elif item_type in ('Article', 'NewsArticle'):
1569 info.update({
1570 'timestamp': parse_iso8601(e.get('datePublished')),
1571 'title': unescapeHTML(e.get('headline')),
1572 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1573 })
1574 if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1575 extract_video_object(e['video'][0])
1576 elif item_type == 'VideoObject':
1577 extract_video_object(e)
1578 if expected_type is None:
1579 continue
1580 else:
1581 break
1582 video = e.get('video')
1583 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1584 extract_video_object(video)
1585 if expected_type is None:
1586 continue
1587 else:
1588 break
1589 traverse_json_ld(json_ld)
1590
1591 return filter_dict(info)
1592
1593 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
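# Illustrative markup this method targets; the JSON shown is the usual
# Next.js layout, but the exact keys vary per site:
#   <script id="__NEXT_DATA__" type="application/json">
#     {"props": {"pageProps": {...}}, "page": "/watch/[id]"}
#   </script>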
1594 return self._parse_json(
1595 self._search_regex(
1596 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1597 webpage, 'next.js data', fatal=fatal, **kw),
1598 video_id, transform_source=transform_source, fatal=fatal)
1599
1600 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1601 ''' Parses Nuxt.js metadata. This works as long as the function whose result is assigned to __NUXT__ is a pure function. '''
1602 # not all websites do this, but it can be changed
1603 # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
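# Illustrative markup matched by the regexes below (identifiers and values
# are made-up examples):
#   <script>window.__NUXT__=(function(a,b){return {data:[{video:{...}}]}}(null,"x"));</script>
# The captured argument names are then substituted with the supplied values
# while converting the object literal via js_to_json.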
1604 rectx = re.escape(context_name)
1605 js, arg_keys, arg_vals = self._search_regex(
1606 (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1607 r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1608 webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1609
1610 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1611
1612 for key, val in args.items():
1613 if val in ('undefined', 'void 0'):
1614 args[key] = 'null'
1615
1616 return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1617
1618 @staticmethod
1619 def _hidden_inputs(html):
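# Illustrative input handled here (name and value are made-up):
#   <input type="hidden" name="csrf_token" value="abc123">  ->  {'csrf_token': 'abc123'}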
1620 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1621 hidden_inputs = {}
1622 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1623 attrs = extract_attributes(input)
1624 if not attrs:
1625 continue
1626 if attrs.get('type') not in ('hidden', 'submit'):
1627 continue
1628 name = attrs.get('name') or attrs.get('id')
1629 value = attrs.get('value')
1630 if name and value is not None:
1631 hidden_inputs[name] = value
1632 return hidden_inputs
1633
1634 def _form_hidden_inputs(self, form_id, html):
1635 form = self._search_regex(
1636 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1637 html, '%s form' % form_id, group='form')
1638 return self._hidden_inputs(form)
1639
1640 class FormatSort:
1641 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1642
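# Illustrative sort keys matched by the regex above (user-style examples;
# see --format-sort): 'res:1080' roughly prefers resolutions up to 1080p,
# '+size' prefers the smallest size, and 'br~3000' prefers the bitrate
# closest to 3000.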
1643 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1644 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1645 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
1646 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1647 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1648 'fps', 'fs_approx', 'source', 'id')
1649
1650 settings = {
1651 'vcodec': {'type': 'ordered', 'regex': True,
1652 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1653 'acodec': {'type': 'ordered', 'regex': True,
1654 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1655 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1656 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1657 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1658 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1659 'vext': {'type': 'ordered', 'field': 'video_ext',
1660 'order': ('mp4', 'webm', 'flv', '', 'none'),
1661 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1662 'aext': {'type': 'ordered', 'field': 'audio_ext',
1663 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1664 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1665 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1666 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1667 'field': ('vcodec', 'acodec'),
1668 'function': lambda it: int(any(v != 'none' for v in it))},
1669 'ie_pref': {'priority': True, 'type': 'extractor'},
1670 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1671 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1672 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1673 'quality': {'convert': 'float', 'default': -1},
1674 'filesize': {'convert': 'bytes'},
1675 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1676 'id': {'convert': 'string', 'field': 'format_id'},
1677 'height': {'convert': 'float_none'},
1678 'width': {'convert': 'float_none'},
1679 'fps': {'convert': 'float_none'},
1680 'tbr': {'convert': 'float_none'},
1681 'vbr': {'convert': 'float_none'},
1682 'abr': {'convert': 'float_none'},
1683 'asr': {'convert': 'float_none'},
1684 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1685
1686 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1687 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1688 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1689 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1690 'res': {'type': 'multiple', 'field': ('height', 'width'),
1691 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1692
1693 # For compatibility with youtube-dl
1694 'format_id': {'type': 'alias', 'field': 'id'},
1695 'preference': {'type': 'alias', 'field': 'ie_pref'},
1696 'language_preference': {'type': 'alias', 'field': 'lang'},
1697 'source_preference': {'type': 'alias', 'field': 'source'},
1698 'protocol': {'type': 'alias', 'field': 'proto'},
1699 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1700
1701 # Deprecated
1702 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1703 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1704 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1705 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1706 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1707 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1708 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1709 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1710 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1711 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1712 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1713 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1714 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1715 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1716 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1717 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1718 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1719 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1720 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1721 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1722 }
1723
1724 def __init__(self, ie, field_preference):
1725 self._order = []
1726 self.ydl = ie._downloader
1727 self.evaluate_params(self.ydl.params, field_preference)
1728 if ie.get_param('verbose'):
1729 self.print_verbose_info(self.ydl.write_debug)
1730
1731 def _get_field_setting(self, field, key):
1732 if field not in self.settings:
1733 if key in ('forced', 'priority'):
1734 return False
1735 self.ydl.deprecation_warning(
1736 f'Using arbitrary fields ({field}) for format sorting is deprecated '
1737 'and may be removed in a future version')
1738 self.settings[field] = {}
1739 propObj = self.settings[field]
1740 if key not in propObj:
1741 type = propObj.get('type')
1742 if key == 'field':
1743 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1744 elif key == 'convert':
1745 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1746 else:
1747 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1748 propObj[key] = default
1749 return propObj[key]
1750
1751 def _resolve_field_value(self, field, value, convertNone=False):
1752 if value is None:
1753 if not convertNone:
1754 return None
1755 else:
1756 value = value.lower()
1757 conversion = self._get_field_setting(field, 'convert')
1758 if conversion == 'ignore':
1759 return None
1760 if conversion == 'string':
1761 return value
1762 elif conversion == 'float_none':
1763 return float_or_none(value)
1764 elif conversion == 'bytes':
1765 return FileDownloader.parse_bytes(value)
1766 elif conversion == 'order':
1767 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1768 use_regex = self._get_field_setting(field, 'regex')
1769 list_length = len(order_list)
1770 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1771 if use_regex and value is not None:
1772 for i, regex in enumerate(order_list):
1773 if regex and re.match(regex, value):
1774 return list_length - i
1775 return list_length - empty_pos # not in list
1776 else: # not regex or value = None
1777 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1778 else:
1779 if value.isnumeric():
1780 return float(value)
1781 else:
1782 self.settings[field]['convert'] = 'string'
1783 return value
1784
1785 def evaluate_params(self, params, sort_extractor):
1786 self._use_free_order = params.get('prefer_free_formats', False)
1787 self._sort_user = params.get('format_sort', [])
1788 self._sort_extractor = sort_extractor
1789
1790 def add_item(field, reverse, closest, limit_text):
1791 field = field.lower()
1792 if field in self._order:
1793 return
1794 self._order.append(field)
1795 limit = self._resolve_field_value(field, limit_text)
1796 data = {
1797 'reverse': reverse,
1798 'closest': False if limit is None else closest,
1799 'limit_text': limit_text,
1800 'limit': limit}
1801 if field in self.settings:
1802 self.settings[field].update(data)
1803 else:
1804 self.settings[field] = data
1805
1806 sort_list = (
1807 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1808 + (tuple() if params.get('format_sort_force', False)
1809 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1810 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1811
1812 for item in sort_list:
1813 match = re.match(self.regex, item)
1814 if match is None:
1815 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1816 field = match.group('field')
1817 if field is None:
1818 continue
1819 if self._get_field_setting(field, 'type') == 'alias':
1820 alias, field = field, self._get_field_setting(field, 'field')
1821 if self._get_field_setting(alias, 'deprecated'):
1822 self.ydl.deprecation_warning(
1823 f'Format sorting alias {alias} is deprecated '
1824 f'and may be removed in a future version. Please use {field} instead')
1825 reverse = match.group('reverse') is not None
1826 closest = match.group('separator') == '~'
1827 limit_text = match.group('limit')
1828
1829 has_limit = limit_text is not None
1830 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1831 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1832
1833 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1834 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1835 limit_count = len(limits)
1836 for (i, f) in enumerate(fields):
1837 add_item(f, reverse, closest,
1838 limits[i] if i < limit_count
1839 else limits[0] if has_limit and not has_multiple_limits
1840 else None)
1841
1842 def print_verbose_info(self, write_debug):
1843 if self._sort_user:
1844 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1845 if self._sort_extractor:
1846 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1847 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1848 '+' if self._get_field_setting(field, 'reverse') else '', field,
1849 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1850 self._get_field_setting(field, 'limit_text'),
1851 self._get_field_setting(field, 'limit'))
1852 if self._get_field_setting(field, 'limit_text') is not None else '')
1853 for field in self._order if self._get_field_setting(field, 'visible')]))
1854
1855 def _calculate_field_preference_from_value(self, format, field, type, value):
1856 reverse = self._get_field_setting(field, 'reverse')
1857 closest = self._get_field_setting(field, 'closest')
1858 limit = self._get_field_setting(field, 'limit')
1859
1860 if type == 'extractor':
1861 maximum = self._get_field_setting(field, 'max')
1862 if value is None or (maximum is not None and value >= maximum):
1863 value = -1
1864 elif type == 'boolean':
1865 in_list = self._get_field_setting(field, 'in_list')
1866 not_in_list = self._get_field_setting(field, 'not_in_list')
1867 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1868 elif type == 'ordered':
1869 value = self._resolve_field_value(field, value, True)
1870
1871 # try to convert to number
1872 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1873 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1874 if is_num:
1875 value = val_num
1876
1877 return ((-10, 0) if value is None
1878 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1879 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1880 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1881 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1882 else (-1, value, 0))
1883
1884 def _calculate_field_preference(self, format, field):
1885 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1886 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1887 if type == 'multiple':
1888 type = 'field' # Only 'field' is allowed in multiple for now
1889 actual_fields = self._get_field_setting(field, 'field')
1890
1891 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1892 else:
1893 value = get_value(field)
1894 return self._calculate_field_preference_from_value(format, field, type, value)
1895
1896 def calculate_preference(self, format):
1897 # Determine missing protocol
1898 if not format.get('protocol'):
1899 format['protocol'] = determine_protocol(format)
1900
1901 # Determine missing ext
1902 if not format.get('ext') and 'url' in format:
1903 format['ext'] = determine_ext(format['url'])
1904 if format.get('vcodec') == 'none':
1905 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1906 format['video_ext'] = 'none'
1907 else:
1908 format['video_ext'] = format['ext']
1909 format['audio_ext'] = 'none'
1910 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1911 # format['preference'] = -1000
1912
1913 # Determine missing bitrates
1914 if format.get('tbr') is None:
1915 if format.get('vbr') is not None and format.get('abr') is not None:
1916 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1917 else:
1918 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1919 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1920 if format.get('acodec') != 'none' and format.get('abr') is None:
1921 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1922
1923 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1924
1925 def _sort_formats(self, formats, field_preference=[]):
1926 if not formats:
1927 return
1928 formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1929
1930 def _check_formats(self, formats, video_id):
1931 if formats:
1932 formats[:] = filter(
1933 lambda f: self._is_valid_url(
1934 f['url'], video_id,
1935 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1936 formats)
1937
1938 @staticmethod
1939 def _remove_duplicate_formats(formats):
1940 format_urls = set()
1941 unique_formats = []
1942 for f in formats:
1943 if f['url'] not in format_urls:
1944 format_urls.add(f['url'])
1945 unique_formats.append(f)
1946 formats[:] = unique_formats
1947
1948 def _is_valid_url(self, url, video_id, item='video', headers={}):
1949 url = self._proto_relative_url(url, scheme='http:')
1950 # For now assume non HTTP(S) URLs always valid
1951 if not (url.startswith('http://') or url.startswith('https://')):
1952 return True
1953 try:
1954 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1955 return True
1956 except ExtractorError as e:
1957 self.to_screen(
1958 '%s: %s URL is invalid, skipping: %s'
1959 % (video_id, item, error_to_compat_str(e.cause)))
1960 return False
1961
1962 def http_scheme(self):
1963 """ Either "http:" or "https:", depending on the user's preferences """
1964 return (
1965 'http:'
1966 if self.get_param('prefer_insecure', False)
1967 else 'https:')
1968
1969 def _proto_relative_url(self, url, scheme=None):
1970 if url is None:
1971 return url
1972 if url.startswith('//'):
1973 if scheme is None:
1974 scheme = self.http_scheme()
1975 return scheme + url
1976 else:
1977 return url
1978
1979 def _sleep(self, timeout, video_id, msg_template=None):
1980 if msg_template is None:
1981 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1982 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1983 self.to_screen(msg)
1984 time.sleep(timeout)
1985
1986 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1987 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1988 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1989 res = self._download_xml_handle(
1990 manifest_url, video_id, 'Downloading f4m manifest',
1991 'Unable to download f4m manifest',
1992 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1993 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1994 transform_source=transform_source,
1995 fatal=fatal, data=data, headers=headers, query=query)
1996 if res is False:
1997 return []
1998
1999 manifest, urlh = res
2000 manifest_url = urlh.geturl()
2001
2002 return self._parse_f4m_formats(
2003 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2004 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2005
2006 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2007 transform_source=lambda s: fix_xml_ampersands(s).strip(),
2008 fatal=True, m3u8_id=None):
2009 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2010 return []
2011
2012 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2013 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2014 if akamai_pv is not None and ';' in akamai_pv.text:
2015 playerVerificationChallenge = akamai_pv.text.split(';')[0]
2016 if playerVerificationChallenge.strip() != '':
2017 return []
2018
2019 formats = []
2020 manifest_version = '1.0'
2021 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2022 if not media_nodes:
2023 manifest_version = '2.0'
2024 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2025 # Remove unsupported DRM-protected media renditions from the final
2026 # formats (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2027 media_nodes = remove_encrypted_media(media_nodes)
2028 if not media_nodes:
2029 return formats
2030
2031 manifest_base_url = get_base_url(manifest)
2032
2033 bootstrap_info = xpath_element(
2034 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2035 'bootstrap info', default=None)
2036
2037 vcodec = None
2038 mime_type = xpath_text(
2039 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2040 'base URL', default=None)
2041 if mime_type and mime_type.startswith('audio/'):
2042 vcodec = 'none'
2043
2044 for i, media_el in enumerate(media_nodes):
2045 tbr = int_or_none(media_el.attrib.get('bitrate'))
2046 width = int_or_none(media_el.attrib.get('width'))
2047 height = int_or_none(media_el.attrib.get('height'))
2048 format_id = join_nonempty(f4m_id, tbr or i)
2049 # If <bootstrapInfo> is present, the specified f4m is a
2050 # stream-level manifest, and only set-level manifests may refer to
2051 # external resources. See section 11.4 and section 4 of F4M spec
2052 if bootstrap_info is None:
2053 media_url = None
2054 # @href is introduced in 2.0, see section 11.6 of F4M spec
2055 if manifest_version == '2.0':
2056 media_url = media_el.attrib.get('href')
2057 if media_url is None:
2058 media_url = media_el.attrib.get('url')
2059 if not media_url:
2060 continue
2061 manifest_url = (
2062 media_url if media_url.startswith('http://') or media_url.startswith('https://')
2063 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2064 # If media_url is itself an f4m manifest, do the recursive extraction,
2065 # since bitrates in the parent manifest (this one) and the media_url
2066 # manifest may differ, making it impossible to resolve the format by the
2067 # requested bitrate in the f4m downloader
2068 ext = determine_ext(manifest_url)
2069 if ext == 'f4m':
2070 f4m_formats = self._extract_f4m_formats(
2071 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2072 transform_source=transform_source, fatal=fatal)
2073 # Sometimes a stream-level manifest contains a single media entry that
2074 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2075 # At the same time, the parent's media entry in the set-level manifest may
2076 # contain it. We copy it from the parent in such cases.
2077 if len(f4m_formats) == 1:
2078 f = f4m_formats[0]
2079 f.update({
2080 'tbr': f.get('tbr') or tbr,
2081 'width': f.get('width') or width,
2082 'height': f.get('height') or height,
2083 'format_id': f.get('format_id') if not tbr else format_id,
2084 'vcodec': vcodec,
2085 })
2086 formats.extend(f4m_formats)
2087 continue
2088 elif ext == 'm3u8':
2089 formats.extend(self._extract_m3u8_formats(
2090 manifest_url, video_id, 'mp4', preference=preference,
2091 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2092 continue
2093 formats.append({
2094 'format_id': format_id,
2095 'url': manifest_url,
2096 'manifest_url': manifest_url,
2097 'ext': 'flv' if bootstrap_info is not None else None,
2098 'protocol': 'f4m',
2099 'tbr': tbr,
2100 'width': width,
2101 'height': height,
2102 'vcodec': vcodec,
2103 'preference': preference,
2104 'quality': quality,
2105 })
2106 return formats
2107
2108 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2109 return {
2110 'format_id': join_nonempty(m3u8_id, 'meta'),
2111 'url': m3u8_url,
2112 'ext': ext,
2113 'protocol': 'm3u8',
2114 'preference': preference - 100 if preference else -100,
2115 'quality': quality,
2116 'resolution': 'multiple',
2117 'format_note': 'Quality selection URL',
2118 }
2119
2120 def _report_ignoring_subs(self, name):
2121 self.report_warning(bug_reports_message(
2122 f'Ignoring subtitle tracks found in the {name} manifest; '
2123 'if any subtitle tracks are missing,'
2124 ), only_once=True)
2125
2126 def _extract_m3u8_formats(self, *args, **kwargs):
2127 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2128 if subs:
2129 self._report_ignoring_subs('HLS')
2130 return fmts
2131
2132 def _extract_m3u8_formats_and_subtitles(
2133 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2134 preference=None, quality=None, m3u8_id=None, note=None,
2135 errnote=None, fatal=True, live=False, data=None, headers={},
2136 query={}):
2137
2138 res = self._download_webpage_handle(
2139 m3u8_url, video_id,
2140 note='Downloading m3u8 information' if note is None else note,
2141 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2142 fatal=fatal, data=data, headers=headers, query=query)
2143
2144 if res is False:
2145 return [], {}
2146
2147 m3u8_doc, urlh = res
2148 m3u8_url = urlh.geturl()
2149
2150 return self._parse_m3u8_formats_and_subtitles(
2151 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2152 preference=preference, quality=quality, m3u8_id=m3u8_id,
2153 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2154 headers=headers, query=query, video_id=video_id)
2155
2156 def _parse_m3u8_formats_and_subtitles(
2157 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2158 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2159 errnote=None, fatal=True, data=None, headers={}, query={},
2160 video_id=None):
2161 formats, subtitles = [], {}
2162
2163 has_drm = re.search('|'.join([
2164 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2165 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2166 ]), m3u8_doc)
2167
2168 def format_url(url):
2169 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2170
2171 if self.get_param('hls_split_discontinuity', False):
2172 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2173 if not m3u8_doc:
2174 if not manifest_url:
2175 return []
2176 m3u8_doc = self._download_webpage(
2177 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2178 note=False, errnote='Failed to download m3u8 playlist information')
2179 if m3u8_doc is False:
2180 return []
2181 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2182
2183 else:
2184 def _extract_m3u8_playlist_indices(*args, **kwargs):
2185 return [None]
2186
2187 # References:
2188 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2189 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2190 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2191
2192 # We should try extracting formats only from master playlists [1, 4.3.4],
2193 # i.e. playlists that describe the available qualities. On the other hand,
2194 # media playlists [1, 4.3.3] should be returned as is since they contain
2195 # just the media, without quality renditions.
2196 # Fortunately, a master playlist can easily be distinguished from a media
2197 # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2198 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2199 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2200 # media playlist and MUST NOT appear in a master playlist, so we can
2201 # reliably detect a media playlist with this criterion.
2202
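# Illustrative distinction: a media playlist contains lines such as
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts
# whereas a master playlist instead lists #EXT-X-STREAM-INF / #EXT-X-MEDIA entries.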
2203 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2204 formats = [{
2205 'format_id': join_nonempty(m3u8_id, idx),
2206 'format_index': idx,
2207 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2208 'ext': ext,
2209 'protocol': entry_protocol,
2210 'preference': preference,
2211 'quality': quality,
2212 'has_drm': has_drm,
2213 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2214
2215 return formats, subtitles
2216
2217 groups = {}
2218 last_stream_inf = {}
2219
2220 def extract_media(x_media_line):
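# Illustrative EXT-X-MEDIA line handled here (attribute values are made-up):
#   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud1",NAME="English",LANGUAGE="en",URI="audio_en.m3u8"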
2221 media = parse_m3u8_attributes(x_media_line)
2222 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2223 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2224 if not (media_type and group_id and name):
2225 return
2226 groups.setdefault(group_id, []).append(media)
2227 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2228 if media_type == 'SUBTITLES':
2229 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2230 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2231 # However, lack of URI has been spotted in the wild.
2232 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2233 if not media.get('URI'):
2234 return
2235 url = format_url(media['URI'])
2236 sub_info = {
2237 'url': url,
2238 'ext': determine_ext(url),
2239 }
2240 if sub_info['ext'] == 'm3u8':
2241 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2242 # files may contain is WebVTT:
2243 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2244 sub_info['ext'] = 'vtt'
2245 sub_info['protocol'] = 'm3u8_native'
2246 lang = media.get('LANGUAGE') or 'und'
2247 subtitles.setdefault(lang, []).append(sub_info)
2248 if media_type not in ('VIDEO', 'AUDIO'):
2249 return
2250 media_url = media.get('URI')
2251 if media_url:
2252 manifest_url = format_url(media_url)
2253 formats.extend({
2254 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2255 'format_note': name,
2256 'format_index': idx,
2257 'url': manifest_url,
2258 'manifest_url': m3u8_url,
2259 'language': media.get('LANGUAGE'),
2260 'ext': ext,
2261 'protocol': entry_protocol,
2262 'preference': preference,
2263 'quality': quality,
2264 'vcodec': 'none' if media_type == 'AUDIO' else None,
2265 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2266
2267 def build_stream_name():
2268 # Although the specification does not mention a NAME attribute for
2269 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2270 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2271 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2272 stream_name = last_stream_inf.get('NAME')
2273 if stream_name:
2274 return stream_name
2275 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2276 # from the corresponding rendition group
2277 stream_group_id = last_stream_inf.get('VIDEO')
2278 if not stream_group_id:
2279 return
2280 stream_group = groups.get(stream_group_id)
2281 if not stream_group:
2282 return stream_group_id
2283 rendition = stream_group[0]
2284 return rendition.get('NAME') or stream_group_id
2285
2286 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
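# Illustrative variant-stream entry parsed in the second loop below
# (attribute values and the URI are made-up):
#   #EXT-X-STREAM-INF:BANDWIDTH=2560000,RESOLUTION=1280x720,CODECS="avc1.64001f,mp4a.40.2",AUDIO="aud1"
#   https://example.com/video_720p.m3u8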
2287 # chance to detect video only formats when EXT-X-STREAM-INF tags
2288 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2289 for line in m3u8_doc.splitlines():
2290 if line.startswith('#EXT-X-MEDIA:'):
2291 extract_media(line)
2292
2293 for line in m3u8_doc.splitlines():
2294 if line.startswith('#EXT-X-STREAM-INF:'):
2295 last_stream_inf = parse_m3u8_attributes(line)
2296 elif line.startswith('#') or not line.strip():
2297 continue
2298 else:
2299 tbr = float_or_none(
2300 last_stream_inf.get('AVERAGE-BANDWIDTH')
2301 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2302 manifest_url = format_url(line.strip())
2303
2304 for idx in _extract_m3u8_playlist_indices(manifest_url):
2305 format_id = [m3u8_id, None, idx]
2306 # The bandwidth of live streams may vary over time, making the
2307 # format_id unpredictable, so it is better to keep the provided
2308 # format_id intact.
2309 if not live:
2310 stream_name = build_stream_name()
2311 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2312 f = {
2313 'format_id': join_nonempty(*format_id),
2314 'format_index': idx,
2315 'url': manifest_url,
2316 'manifest_url': m3u8_url,
2317 'tbr': tbr,
2318 'ext': ext,
2319 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2320 'protocol': entry_protocol,
2321 'preference': preference,
2322 'quality': quality,
2323 }
2324 resolution = last_stream_inf.get('RESOLUTION')
2325 if resolution:
2326 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2327 if mobj:
2328 f['width'] = int(mobj.group('width'))
2329 f['height'] = int(mobj.group('height'))
2330 # Unified Streaming Platform
2331 mobj = re.search(
2332 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2333 if mobj:
2334 abr, vbr = mobj.groups()
2335 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2336 f.update({
2337 'vbr': vbr,
2338 'abr': abr,
2339 })
2340 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2341 f.update(codecs)
2342 audio_group_id = last_stream_inf.get('AUDIO')
2343 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2344 # references a rendition group MUST have a CODECS attribute.
2345 # However, this is not always respected, for example, [2]
2346 # contains EXT-X-STREAM-INF tag which references AUDIO
2347 # rendition group but does not have CODECS and despite
2348 # referencing an audio group it represents a complete
2349 # (with audio and video) format. So, for such cases we will
2350 # ignore references to rendition groups and treat them
2351 # as complete formats.
2352 if audio_group_id and codecs and f.get('vcodec') != 'none':
2353 audio_group = groups.get(audio_group_id)
2354 if audio_group and audio_group[0].get('URI'):
2355 # TODO: update acodec for audio only formats with
2356 # the same GROUP-ID
2357 f['acodec'] = 'none'
2358 if not f.get('ext'):
2359 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2360 formats.append(f)
2361
2362 # for DailyMotion
2363 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2364 if progressive_uri:
2365 http_f = f.copy()
2366 del http_f['manifest_url']
2367 http_f.update({
2368 'format_id': f['format_id'].replace('hls-', 'http-'),
2369 'protocol': 'http',
2370 'url': progressive_uri,
2371 })
2372 formats.append(http_f)
2373
2374 last_stream_inf = {}
2375 return formats, subtitles
2376
2377 def _extract_m3u8_vod_duration(
2378 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2379
2380 m3u8_vod = self._download_webpage(
2381 m3u8_vod_url, video_id,
2382 note='Downloading m3u8 VOD manifest' if note is None else note,
2383 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2384 fatal=False, data=data, headers=headers, query=query)
2385
2386 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2387
2388 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2389 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2390 return None
2391
2392 return int(sum(
2393 float(line[len('#EXTINF:'):].split(',')[0])
2394 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2395
2396 @staticmethod
2397 def _xpath_ns(path, namespace=None):
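# e.g. (illustrative) _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
#   -> './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'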
2398 if not namespace:
2399 return path
2400 out = []
2401 for c in path.split('/'):
2402 if not c or c == '.':
2403 out.append(c)
2404 else:
2405 out.append('{%s}%s' % (namespace, c))
2406 return '/'.join(out)
2407
2408 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2409 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2410 if res is False:
2411 assert not fatal
2412 return [], {}
2413
2414 smil, urlh = res
2415 smil_url = urlh.geturl()
2416
2417 namespace = self._parse_smil_namespace(smil)
2418
2419 fmts = self._parse_smil_formats(
2420 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2421 subs = self._parse_smil_subtitles(
2422 smil, namespace=namespace)
2423
2424 return fmts, subs
2425
2426 def _extract_smil_formats(self, *args, **kwargs):
2427 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2428 if subs:
2429 self._report_ignoring_subs('SMIL')
2430 return fmts
2431
2432 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2433 res = self._download_smil(smil_url, video_id, fatal=fatal)
2434 if res is False:
2435 return {}
2436
2437 smil, urlh = res
2438 smil_url = urlh.geturl()
2439
2440 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2441
2442 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2443 return self._download_xml_handle(
2444 smil_url, video_id, 'Downloading SMIL file',
2445 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2446
2447 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2448 namespace = self._parse_smil_namespace(smil)
2449
2450 formats = self._parse_smil_formats(
2451 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2452 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2453
2454 video_id = os.path.splitext(url_basename(smil_url))[0]
2455 title = None
2456 description = None
2457 upload_date = None
2458 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2459 name = meta.attrib.get('name')
2460 content = meta.attrib.get('content')
2461 if not name or not content:
2462 continue
2463 if not title and name == 'title':
2464 title = content
2465 elif not description and name in ('description', 'abstract'):
2466 description = content
2467 elif not upload_date and name == 'date':
2468 upload_date = unified_strdate(content)
2469
2470 thumbnails = [{
2471 'id': image.get('type'),
2472 'url': image.get('src'),
2473 'width': int_or_none(image.get('width')),
2474 'height': int_or_none(image.get('height')),
2475 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2476
2477 return {
2478 'id': video_id,
2479 'title': title or video_id,
2480 'description': description,
2481 'upload_date': upload_date,
2482 'thumbnails': thumbnails,
2483 'formats': formats,
2484 'subtitles': subtitles,
2485 }
2486
2487 def _parse_smil_namespace(self, smil):
2488 return self._search_regex(
2489 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2490
2491 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2492 base = smil_url
2493 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2494 b = meta.get('base') or meta.get('httpBase')
2495 if b:
2496 base = b
2497 break
2498
2499 formats = []
2500 rtmp_count = 0
2501 http_count = 0
2502 m3u8_count = 0
2503 imgs_count = 0
2504
2505 srcs = set()
2506 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2507 for medium in media:
2508 src = medium.get('src')
2509 if not src or src in srcs:
2510 continue
2511 srcs.add(src)
2512
2513 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2514 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2515 width = int_or_none(medium.get('width'))
2516 height = int_or_none(medium.get('height'))
2517 proto = medium.get('proto')
2518 ext = medium.get('ext')
2519 src_ext = determine_ext(src)
2520 streamer = medium.get('streamer') or base
2521
2522 if proto == 'rtmp' or streamer.startswith('rtmp'):
2523 rtmp_count += 1
2524 formats.append({
2525 'url': streamer,
2526 'play_path': src,
2527 'ext': 'flv',
2528 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2529 'tbr': bitrate,
2530 'filesize': filesize,
2531 'width': width,
2532 'height': height,
2533 })
2534 if transform_rtmp_url:
2535 streamer, src = transform_rtmp_url(streamer, src)
2536 formats[-1].update({
2537 'url': streamer,
2538 'play_path': src,
2539 })
2540 continue
2541
2542 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2543 src_url = src_url.strip()
2544
2545 if proto == 'm3u8' or src_ext == 'm3u8':
2546 m3u8_formats = self._extract_m3u8_formats(
2547 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2548 if len(m3u8_formats) == 1:
2549 m3u8_count += 1
2550 m3u8_formats[0].update({
2551 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2552 'tbr': bitrate,
2553 'width': width,
2554 'height': height,
2555 })
2556 formats.extend(m3u8_formats)
2557 elif src_ext == 'f4m':
2558 f4m_url = src_url
2559 if not f4m_params:
2560 f4m_params = {
2561 'hdcore': '3.2.0',
2562 'plugin': 'flowplayer-3.2.0.1',
2563 }
2564 f4m_url += '&' if '?' in f4m_url else '?'
2565 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2566 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2567 elif src_ext == 'mpd':
2568 formats.extend(self._extract_mpd_formats(
2569 src_url, video_id, mpd_id='dash', fatal=False))
2570 elif re.search(r'\.ism/[Mm]anifest', src_url):
2571 formats.extend(self._extract_ism_formats(
2572 src_url, video_id, ism_id='mss', fatal=False))
2573 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2574 http_count += 1
2575 formats.append({
2576 'url': src_url,
2577 'ext': ext or src_ext or 'flv',
2578 'format_id': 'http-%d' % (bitrate or http_count),
2579 'tbr': bitrate,
2580 'filesize': filesize,
2581 'width': width,
2582 'height': height,
2583 })
2584
2585 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2586 src = medium.get('src')
2587 if not src or src in srcs:
2588 continue
2589 srcs.add(src)
2590
2591 imgs_count += 1
2592 formats.append({
2593 'format_id': 'imagestream-%d' % (imgs_count),
2594 'url': src,
2595 'ext': mimetype2ext(medium.get('type')),
2596 'acodec': 'none',
2597 'vcodec': 'none',
2598 'width': int_or_none(medium.get('width')),
2599 'height': int_or_none(medium.get('height')),
2600 'format_note': 'SMIL storyboards',
2601 })
2602
2603 return formats
2604
2605 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2606 urls = []
2607 subtitles = {}
2608 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2609 src = textstream.get('src')
2610 if not src or src in urls:
2611 continue
2612 urls.append(src)
2613 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2614 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2615 subtitles.setdefault(lang, []).append({
2616 'url': src,
2617 'ext': ext,
2618 })
2619 return subtitles
2620
2621 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2622 res = self._download_xml_handle(
2623 xspf_url, playlist_id, 'Downloading xpsf playlist',
2624 'Unable to download xspf manifest', fatal=fatal)
2625 if res is False:
2626 return []
2627
2628 xspf, urlh = res
2629 xspf_url = urlh.geturl()
2630
2631 return self._parse_xspf(
2632 xspf, playlist_id, xspf_url=xspf_url,
2633 xspf_base_url=base_url(xspf_url))
2634
2635 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2636 NS_MAP = {
2637 'xspf': 'http://xspf.org/ns/0/',
2638 's1': 'http://static.streamone.nl/player/ns/0',
2639 }
2640
2641 entries = []
2642 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2643 title = xpath_text(
2644 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2645 description = xpath_text(
2646 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2647 thumbnail = xpath_text(
2648 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2649 duration = float_or_none(
2650 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2651
2652 formats = []
2653 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2654 format_url = urljoin(xspf_base_url, location.text)
2655 if not format_url:
2656 continue
2657 formats.append({
2658 'url': format_url,
2659 'manifest_url': xspf_url,
2660 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2661 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2662 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2663 })
2664 self._sort_formats(formats)
2665
2666 entries.append({
2667 'id': playlist_id,
2668 'title': title,
2669 'description': description,
2670 'thumbnail': thumbnail,
2671 'duration': duration,
2672 'formats': formats,
2673 })
2674 return entries
2675
2676 def _extract_mpd_formats(self, *args, **kwargs):
2677 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2678 if subs:
2679 self._report_ignoring_subs('DASH')
2680 return fmts
2681
2682 def _extract_mpd_formats_and_subtitles(
2683 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2684 fatal=True, data=None, headers={}, query={}):
2685 res = self._download_xml_handle(
2686 mpd_url, video_id,
2687 note='Downloading MPD manifest' if note is None else note,
2688 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2689 fatal=fatal, data=data, headers=headers, query=query)
2690 if res is False:
2691 return [], {}
2692 mpd_doc, urlh = res
2693 if mpd_doc is None:
2694 return [], {}
2695
2696 # We could have been redirected to a new URL when we retrieved our MPD file.
2697 mpd_url = urlh.geturl()
2698 mpd_base_url = base_url(mpd_url)
2699
2700 return self._parse_mpd_formats_and_subtitles(
2701 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2702
2703 def _parse_mpd_formats(self, *args, **kwargs):
2704 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2705 if subs:
2706 self._report_ignoring_subs('DASH')
2707 return fmts
2708
2709 def _parse_mpd_formats_and_subtitles(
2710 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2711 """
2712 Parse formats from MPD manifest.
2713 References:
2714 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2715 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2716 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2717 """
2718 if not self.get_param('dynamic_mpd', True):
2719 if mpd_doc.get('type') == 'dynamic':
2720 return [], {}
2721
2722 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2723
2724 def _add_ns(path):
2725 return self._xpath_ns(path, namespace)
2726
2727 def is_drm_protected(element):
2728 return element.find(_add_ns('ContentProtection')) is not None
2729
2730 def extract_multisegment_info(element, ms_parent_info):
2731 ms_info = ms_parent_info.copy()
2732
2733 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2734 # common attributes and elements. We will only extract the ones
2735 # relevant for us.
2736 def extract_common(source):
2737 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2738 if segment_timeline is not None:
2739 s_e = segment_timeline.findall(_add_ns('S'))
2740 if s_e:
2741 ms_info['total_number'] = 0
2742 ms_info['s'] = []
2743 for s in s_e:
2744 r = int(s.get('r', 0))
2745 ms_info['total_number'] += 1 + r
2746 ms_info['s'].append({
2747 't': int(s.get('t', 0)),
2748 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2749 'd': int(s.attrib['d']),
2750 'r': r,
2751 })
2752 start_number = source.get('startNumber')
2753 if start_number:
2754 ms_info['start_number'] = int(start_number)
2755 timescale = source.get('timescale')
2756 if timescale:
2757 ms_info['timescale'] = int(timescale)
2758 segment_duration = source.get('duration')
2759 if segment_duration:
2760 ms_info['segment_duration'] = float(segment_duration)
2761
2762 def extract_Initialization(source):
2763 initialization = source.find(_add_ns('Initialization'))
2764 if initialization is not None:
2765 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2766
2767 segment_list = element.find(_add_ns('SegmentList'))
2768 if segment_list is not None:
2769 extract_common(segment_list)
2770 extract_Initialization(segment_list)
2771 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2772 if segment_urls_e:
2773 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2774 else:
2775 segment_template = element.find(_add_ns('SegmentTemplate'))
2776 if segment_template is not None:
2777 extract_common(segment_template)
2778 media = segment_template.get('media')
2779 if media:
2780 ms_info['media'] = media
2781 initialization = segment_template.get('initialization')
2782 if initialization:
2783 ms_info['initialization'] = initialization
2784 else:
2785 extract_Initialization(segment_template)
2786 return ms_info
2787
2788 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2789 formats, subtitles = [], {}
2790 stream_numbers = collections.defaultdict(int)
2791 for period in mpd_doc.findall(_add_ns('Period')):
2792 period_duration = parse_duration(period.get('duration')) or mpd_duration
2793 period_ms_info = extract_multisegment_info(period, {
2794 'start_number': 1,
2795 'timescale': 1,
2796 })
2797 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2798 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2799 for representation in adaptation_set.findall(_add_ns('Representation')):
2800 representation_attrib = adaptation_set.attrib.copy()
2801 representation_attrib.update(representation.attrib)
2802 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2803 mime_type = representation_attrib['mimeType']
2804 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2805
2806 codecs = parse_codecs(representation_attrib.get('codecs', ''))
2807 if content_type not in ('video', 'audio', 'text'):
2808 if mime_type == 'image/jpeg':
2809 content_type = mime_type
2810 elif codecs['vcodec'] != 'none':
2811 content_type = 'video'
2812 elif codecs['acodec'] != 'none':
2813 content_type = 'audio'
2814 elif codecs.get('scodec', 'none') != 'none':
2815 content_type = 'text'
2816 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2817 content_type = 'text'
2818 else:
2819 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2820 continue
2821
2822 base_url = ''
2823 for element in (representation, adaptation_set, period, mpd_doc):
2824 base_url_e = element.find(_add_ns('BaseURL'))
2825 if base_url_e is not None:
2826 base_url = base_url_e.text + base_url
2827 if re.match(r'^https?://', base_url):
2828 break
2829 if mpd_base_url and base_url.startswith('/'):
2830 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2831 elif mpd_base_url and not re.match(r'^https?://', base_url):
2832 if not mpd_base_url.endswith('/'):
2833 mpd_base_url += '/'
2834 base_url = mpd_base_url + base_url
2835 representation_id = representation_attrib.get('id')
2836 lang = representation_attrib.get('lang')
2837 url_el = representation.find(_add_ns('BaseURL'))
2838 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2839 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2840 if representation_id is not None:
2841 format_id = representation_id
2842 else:
2843 format_id = content_type
2844 if mpd_id:
2845 format_id = mpd_id + '-' + format_id
2846 if content_type in ('video', 'audio'):
2847 f = {
2848 'format_id': format_id,
2849 'manifest_url': mpd_url,
2850 'ext': mimetype2ext(mime_type),
2851 'width': int_or_none(representation_attrib.get('width')),
2852 'height': int_or_none(representation_attrib.get('height')),
2853 'tbr': float_or_none(bandwidth, 1000),
2854 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2855 'fps': int_or_none(representation_attrib.get('frameRate')),
2856 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2857 'format_note': 'DASH %s' % content_type,
2858 'filesize': filesize,
2859 'container': mimetype2ext(mime_type) + '_dash',
2860 **codecs
2861 }
2862 elif content_type == 'text':
2863 f = {
2864 'ext': mimetype2ext(mime_type),
2865 'manifest_url': mpd_url,
2866 'filesize': filesize,
2867 }
2868 elif content_type == 'image/jpeg':
2869 # See test case in VikiIE
2870 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2871 f = {
2872 'format_id': format_id,
2873 'ext': 'mhtml',
2874 'manifest_url': mpd_url,
2875 'format_note': 'DASH storyboards (jpeg)',
2876 'acodec': 'none',
2877 'vcodec': 'none',
2878 }
2879 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2880 f['has_drm'] = True
2881 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2882
2883 def prepare_template(template_name, identifiers):
2884 tmpl = representation_ms_info[template_name]
2885 # First of all, % characters outside $...$ templates
2886 # must be escaped by doubling for proper processing
2887 # by the % string-formatting operator used below (see
2888 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2889 t = ''
2890 in_template = False
2891 for c in tmpl:
2892 t += c
2893 if c == '$':
2894 in_template = not in_template
2895 elif c == '%' and not in_template:
2896 t += c
2897 # Next, $...$ templates are translated to their
2898 # %(...) counterparts to be used with the % operator
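# e.g. (illustrative) '$RepresentationID$/chunk_$Number%05d$.m4s' with representation_id 'v1'
# becomes 'v1/chunk_%(Number)05d.m4s', ready for %-formatting with the segment number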
2899 if representation_id is not None:
2900 t = t.replace('$RepresentationID$', representation_id)
2901 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2902 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2903 t = t.replace('$$', '$')  # '$$' is an escape for a literal '$'
2904 return t
2905
2906 # @initialization is a regular template like the @media one
2907 # so it should be handled just the same way (see
2908 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2909 if 'initialization' in representation_ms_info:
2910 initialization_template = prepare_template(
2911 'initialization',
2912 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2913 # $Time$ shall not be included for @initialization thus
2914 # only $Bandwidth$ remains
2915 ('Bandwidth', ))
2916 representation_ms_info['initialization_url'] = initialization_template % {
2917 'Bandwidth': bandwidth,
2918 }
2919
2920 def location_key(location):
2921 return 'url' if re.match(r'^https?://', location) else 'path'
2922
2923 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2924
2925 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2926 media_location_key = location_key(media_template)
2927
2928 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2929 # can't be used at the same time
2930 if '%(Number' in media_template and 's' not in representation_ms_info:
2931 segment_duration = None
2932 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2933 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
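# float_or_none(a, b) divides a by b, so this computes ceil(period_duration / segment_duration)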
2934 representation_ms_info['total_number'] = int(math.ceil(
2935 float_or_none(period_duration, segment_duration, default=0)))
2936 representation_ms_info['fragments'] = [{
2937 media_location_key: media_template % {
2938 'Number': segment_number,
2939 'Bandwidth': bandwidth,
2940 },
2941 'duration': segment_duration,
2942 } for segment_number in range(
2943 representation_ms_info['start_number'],
2944 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2945 else:
2946 # $Number*$ or $Time$ in media template with S list available
2947 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2948 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2949 representation_ms_info['fragments'] = []
2950 segment_time = 0
2951 segment_d = None
2952 segment_number = representation_ms_info['start_number']
2953
2954 def add_segment_url():
2955 segment_url = media_template % {
2956 'Time': segment_time,
2957 'Bandwidth': bandwidth,
2958 'Number': segment_number,
2959 }
2960 representation_ms_info['fragments'].append({
2961 media_location_key: segment_url,
2962 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2963 })
2964
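# Each S entry provides 't' (segment start time), 'd' (duration) and 'r' (number of
# additional repeats), all expressed in timescale units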
2965 for num, s in enumerate(representation_ms_info['s']):
2966 segment_time = s.get('t') or segment_time
2967 segment_d = s['d']
2968 add_segment_url()
2969 segment_number += 1
2970 for r in range(s.get('r', 0)):
2971 segment_time += segment_d
2972 add_segment_url()
2973 segment_number += 1
2974 segment_time += segment_d
2975 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2976 # No media template
2977 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2978 # or any YouTube dashsegments video
2979 fragments = []
2980 segment_index = 0
2981 timescale = representation_ms_info['timescale']
2982 for s in representation_ms_info['s']:
2983 duration = float_or_none(s['d'], timescale)
2984 for r in range(s.get('r', 0) + 1):
2985 segment_uri = representation_ms_info['segment_urls'][segment_index]
2986 fragments.append({
2987 location_key(segment_uri): segment_uri,
2988 'duration': duration,
2989 })
2990 segment_index += 1
2991 representation_ms_info['fragments'] = fragments
2992 elif 'segment_urls' in representation_ms_info:
2993 # Segment URLs with no SegmentTimeline
2994 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2995 # https://github.com/ytdl-org/youtube-dl/pull/14844
2996 fragments = []
2997 segment_duration = float_or_none(
2998 representation_ms_info['segment_duration'],
2999 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3000 for segment_url in representation_ms_info['segment_urls']:
3001 fragment = {
3002 location_key(segment_url): segment_url,
3003 }
3004 if segment_duration:
3005 fragment['duration'] = segment_duration
3006 fragments.append(fragment)
3007 representation_ms_info['fragments'] = fragments
3008 # If there is a fragments key available then we correctly recognized fragmented media.
3009 # Otherwise we will assume unfragmented media with direct access. Technically, such
3010 # an assumption is not necessarily correct since we may simply have no support for
3011 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3012 if 'fragments' in representation_ms_info:
3013 f.update({
3014 # NB: mpd_url may be empty when MPD manifest is parsed from a string
3015 'url': mpd_url or base_url,
3016 'fragment_base_url': base_url,
3017 'fragments': [],
3018 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3019 })
3020 if 'initialization_url' in representation_ms_info:
3021 initialization_url = representation_ms_info['initialization_url']
3022 if not f.get('url'):
3023 f['url'] = initialization_url
3024 f['fragments'].append({location_key(initialization_url): initialization_url})
3025 f['fragments'].extend(representation_ms_info['fragments'])
3026 if not period_duration:
3027 period_duration = try_get(
3028 representation_ms_info,
3029 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3030 else:
3031 # Assuming direct URL to unfragmented media.
3032 f['url'] = base_url
3033 if content_type in ('video', 'audio', 'image/jpeg'):
3034 f['manifest_stream_number'] = stream_numbers[f['url']]
3035 stream_numbers[f['url']] += 1
3036 formats.append(f)
3037 elif content_type == 'text':
3038 subtitles.setdefault(lang or 'und', []).append(f)
3039
3040 return formats, subtitles
3041
3042 def _extract_ism_formats(self, *args, **kwargs):
3043 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3044 if subs:
3045 self._report_ignoring_subs('ISM')
3046 return fmts
3047
3048 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3049 res = self._download_xml_handle(
3050 ism_url, video_id,
3051 note='Downloading ISM manifest' if note is None else note,
3052 errnote='Failed to download ISM manifest' if errnote is None else errnote,
3053 fatal=fatal, data=data, headers=headers, query=query)
3054 if res is False:
3055 return [], {}
3056 ism_doc, urlh = res
3057 if ism_doc is None:
3058 return [], {}
3059
3060 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3061
3062 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3063 """
3064 Parse formats from ISM manifest.
3065 References:
3066 1. [MS-SSTR]: Smooth Streaming Protocol,
3067 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3068 """
3069 if ism_doc.get('IsLive') == 'TRUE':
3070 return [], {}
3071
3072 duration = int(ism_doc.attrib['Duration'])
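# TimeScale defaults to 10000000 (i.e. 100 ns units) when not specified in the manifest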
3073 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3074
3075 formats = []
3076 subtitles = {}
3077 for stream in ism_doc.findall('StreamIndex'):
3078 stream_type = stream.get('Type')
3079 if stream_type not in ('video', 'audio', 'text'):
3080 continue
3081 url_pattern = stream.attrib['Url']
3082 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3083 stream_name = stream.get('Name')
3084 stream_language = stream.get('Language', 'und')
3085 for track in stream.findall('QualityLevel'):
3086 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3087 # TODO: add support for WVC1 and WMAP
3088 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3089 self.report_warning('%s is not a supported codec' % fourcc)
3090 continue
3091 tbr = int(track.attrib['Bitrate']) // 1000
3092 # [1] does not mention Width and Height attributes. However,
3093 # they're often present while MaxWidth and MaxHeight are
3094 # missing, so they should be used as fallbacks
3095 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3096 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3097 sampling_rate = int_or_none(track.get('SamplingRate'))
3098
3099 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3100 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3101
3102 fragments = []
3103 fragment_ctx = {
3104 'time': 0,
3105 }
3106 stream_fragments = stream.findall('c')
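# Each <c> element may carry 't' (fragment start time), 'd' (duration) and 'r' (repeat
# count); missing durations are derived below from the next fragment's start time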
3107 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3108 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3109 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3110 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3111 if not fragment_ctx['duration']:
3112 try:
3113 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
3114 except IndexError:
3115 next_fragment_time = duration
3116 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3117 for _ in range(fragment_repeat):
3118 fragments.append({
3119 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3120 'duration': fragment_ctx['duration'] / stream_timescale,
3121 })
3122 fragment_ctx['time'] += fragment_ctx['duration']
3123
3124 if stream_type == 'text':
3125 subtitles.setdefault(stream_language, []).append({
3126 'ext': 'ismt',
3127 'protocol': 'ism',
3128 'url': ism_url,
3129 'manifest_url': ism_url,
3130 'fragments': fragments,
3131 '_download_params': {
3132 'stream_type': stream_type,
3133 'duration': duration,
3134 'timescale': stream_timescale,
3135 'fourcc': fourcc,
3136 'language': stream_language,
3137 'codec_private_data': track.get('CodecPrivateData'),
3138 }
3139 })
3140 elif stream_type in ('video', 'audio'):
3141 formats.append({
3142 'format_id': join_nonempty(ism_id, stream_name, tbr),
3143 'url': ism_url,
3144 'manifest_url': ism_url,
3145 'ext': 'ismv' if stream_type == 'video' else 'isma',
3146 'width': width,
3147 'height': height,
3148 'tbr': tbr,
3149 'asr': sampling_rate,
3150 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3151 'acodec': 'none' if stream_type == 'video' else fourcc,
3152 'protocol': 'ism',
3153 'fragments': fragments,
3154 'has_drm': ism_doc.find('Protection') is not None,
3155 '_download_params': {
3156 'stream_type': stream_type,
3157 'duration': duration,
3158 'timescale': stream_timescale,
3159 'width': width or 0,
3160 'height': height or 0,
3161 'fourcc': fourcc,
3162 'language': stream_language,
3163 'codec_private_data': track.get('CodecPrivateData'),
3164 'sampling_rate': sampling_rate,
3165 'channels': int_or_none(track.get('Channels', 2)),
3166 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3167 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3168 },
3169 })
3170 return formats, subtitles
3171
3172 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3173 def absolute_url(item_url):
3174 return urljoin(base_url, item_url)
3175
3176 def parse_content_type(content_type):
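# e.g. (illustrative) 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"' yields
# {'ext': 'mp4', ...} plus whatever codec fields parse_codecs() extracts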
3177 if not content_type:
3178 return {}
3179 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3180 if ctr:
3181 mimetype, codecs = ctr.groups()
3182 f = parse_codecs(codecs)
3183 f['ext'] = mimetype2ext(mimetype)
3184 return f
3185 return {}
3186
3187 def _media_formats(src, cur_media_type, type_info={}):
3188 full_url = absolute_url(src)
3189 ext = type_info.get('ext') or determine_ext(full_url)
3190 if ext == 'm3u8':
3191 is_plain_url = False
3192 formats = self._extract_m3u8_formats(
3193 full_url, video_id, ext='mp4',
3194 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3195 preference=preference, quality=quality, fatal=False)
3196 elif ext == 'mpd':
3197 is_plain_url = False
3198 formats = self._extract_mpd_formats(
3199 full_url, video_id, mpd_id=mpd_id, fatal=False)
3200 else:
3201 is_plain_url = True
3202 formats = [{
3203 'url': full_url,
3204 'vcodec': 'none' if cur_media_type == 'audio' else None,
3205 }]
3206 return is_plain_url, formats
3207
3208 entries = []
3209 # amp-video and amp-audio are very similar to their HTML5 counterparts
3210 # so we will include them right here (see
3211 # https://www.ampproject.org/docs/reference/components/amp-video)
3212 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
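# This matches plain <video>/<audio> as well as <amp-video>/<amp-audio> and
# <dl8-video>/<dl8-audio>/<dl8-live-video>/<dl8-live-audio> tags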
3213 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3214 media_tags = [(media_tag, media_tag_name, media_type, '')
3215 for media_tag, media_tag_name, media_type
3216 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3217 media_tags.extend(re.findall(
3218 # We only allow video|audio followed by a whitespace or '>'.
3219 # Allowing more characters may result in a significant slowdown (see
3220 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3221 # http://www.porntrex.com/maps/videositemap.xml).
3222 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3223 for media_tag, _, media_type, media_content in media_tags:
3224 media_info = {
3225 'formats': [],
3226 'subtitles': {},
3227 }
3228 media_attributes = extract_attributes(media_tag)
3229 src = strip_or_none(media_attributes.get('src'))
3230 if src:
3231 _, formats = _media_formats(src, media_type)
3232 media_info['formats'].extend(formats)
3233 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3234 if media_content:
3235 for source_tag in re.findall(r'<source[^>]+>', media_content):
3236 s_attr = extract_attributes(source_tag)
3237 # data-video-src and data-src are non-standard but seen
3238 # several times in the wild
3239 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3240 if not src:
3241 continue
3242 f = parse_content_type(s_attr.get('type'))
3243 is_plain_url, formats = _media_formats(src, media_type, f)
3244 if is_plain_url:
3245 # width, height, res, label and title attributes are
3246 # all non-standard but seen several times in the wild
3247 labels = [
3248 s_attr.get(lbl)
3249 for lbl in ('label', 'title')
3250 if str_or_none(s_attr.get(lbl))
3251 ]
3252 width = int_or_none(s_attr.get('width'))
3253 height = (int_or_none(s_attr.get('height'))
3254 or int_or_none(s_attr.get('res')))
3255 if not width or not height:
3256 for lbl in labels:
3257 resolution = parse_resolution(lbl)
3258 if not resolution:
3259 continue
3260 width = width or resolution.get('width')
3261 height = height or resolution.get('height')
3262 for lbl in labels:
3263 tbr = parse_bitrate(lbl)
3264 if tbr:
3265 break
3266 else:
3267 tbr = None
3268 f.update({
3269 'width': width,
3270 'height': height,
3271 'tbr': tbr,
3272 'format_id': s_attr.get('label') or s_attr.get('title'),
3273 })
3274 f.update(formats[0])
3275 media_info['formats'].append(f)
3276 else:
3277 media_info['formats'].extend(formats)
3278 for track_tag in re.findall(r'<track[^>]+>', media_content):
3279 track_attributes = extract_attributes(track_tag)
3280 kind = track_attributes.get('kind')
3281 if not kind or kind in ('subtitles', 'captions'):
3282 src = strip_or_none(track_attributes.get('src'))
3283 if not src:
3284 continue
3285 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3286 media_info['subtitles'].setdefault(lang, []).append({
3287 'url': absolute_url(src),
3288 })
3289 for f in media_info['formats']:
3290 f.setdefault('http_headers', {})['Referer'] = base_url
3291 if media_info['formats'] or media_info['subtitles']:
3292 entries.append(media_info)
3293 return entries
3294
3295 def _extract_akamai_formats(self, *args, **kwargs):
3296 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3297 if subs:
3298 self._report_ignoring_subs('akamai')
3299 return fmts
3300
3301 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3302 signed = 'hdnea=' in manifest_url
3303 if not signed:
3304 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3305 manifest_url = re.sub(
3306 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3307 '', manifest_url).strip('?')
3308
3309 formats = []
3310 subtitles = {}
3311
3312 hdcore_sign = 'hdcore=3.7.0'
3313 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3314 hds_host = hosts.get('hds')
3315 if hds_host:
3316 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3317 if 'hdcore=' not in f4m_url:
3318 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3319 f4m_formats = self._extract_f4m_formats(
3320 f4m_url, video_id, f4m_id='hds', fatal=False)
3321 for entry in f4m_formats:
3322 entry.update({'extra_param_to_segment_url': hdcore_sign})
3323 formats.extend(f4m_formats)
3324
3325 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3326 hls_host = hosts.get('hls')
3327 if hls_host:
3328 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3329 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3330 m3u8_url, video_id, 'mp4', 'm3u8_native',
3331 m3u8_id='hls', fatal=False)
3332 formats.extend(m3u8_formats)
3333 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3334
3335 http_host = hosts.get('http')
3336 if http_host and m3u8_formats and not signed:
3337 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
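# Illustrative example: for an HLS URL such as
# https://example.akamaized.net/i/some/path/video_,400,800,1200,.mp4.csmil/master.m3u8
# group(2) is '400,800,1200', i.e. the available qualities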
3338 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3339 qualities_length = len(qualities)
3340 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3341 i = 0
3342 for f in m3u8_formats:
3343 if f['vcodec'] != 'none':
3344 for protocol in ('http', 'https'):
3345 http_f = f.copy()
3346 del http_f['manifest_url']
3347 http_url = re.sub(
3348 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3349 http_f.update({
3350 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3351 'url': http_url,
3352 'protocol': protocol,
3353 })
3354 formats.append(http_f)
3355 i += 1
3356
3357 return formats, subtitles
3358
3359 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3360 query = compat_urlparse.urlparse(url).query
3361 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3362 mobj = re.search(
3363 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3364 url_base = mobj.group('url')
3365 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3366 formats = []
3367
3368 def manifest_url(manifest):
3369 m_url = f'{http_base_url}/{manifest}'
3370 if query:
3371 m_url += '?%s' % query
3372 return m_url
3373
3374 if 'm3u8' not in skip_protocols:
3375 formats.extend(self._extract_m3u8_formats(
3376 manifest_url('playlist.m3u8'), video_id, 'mp4',
3377 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3378 if 'f4m' not in skip_protocols:
3379 formats.extend(self._extract_f4m_formats(
3380 manifest_url('manifest.f4m'),
3381 video_id, f4m_id='hds', fatal=False))
3382 if 'dash' not in skip_protocols:
3383 formats.extend(self._extract_mpd_formats(
3384 manifest_url('manifest.mpd'),
3385 video_id, mpd_id='dash', fatal=False))
3386 if re.search(r'(?:/smil:|\.smil)', url_base):
3387 if 'smil' not in skip_protocols:
3388 rtmp_formats = self._extract_smil_formats(
3389 manifest_url('jwplayer.smil'),
3390 video_id, fatal=False)
3391 for rtmp_format in rtmp_formats:
3392 rtsp_format = rtmp_format.copy()
3393 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3394 del rtsp_format['play_path']
3395 del rtsp_format['ext']
3396 rtsp_format.update({
3397 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3398 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3399 'protocol': 'rtsp',
3400 })
3401 formats.extend([rtmp_format, rtsp_format])
3402 else:
3403 for protocol in ('rtmp', 'rtsp'):
3404 if protocol not in skip_protocols:
3405 formats.append({
3406 'url': f'{protocol}:{url_base}',
3407 'format_id': protocol,
3408 'protocol': protocol,
3409 })
3410 return formats
3411
3412 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
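# Looks for inline player initializations of the form jwplayer("player_id").setup({...})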
3413 mobj = re.search(
3414 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3415 webpage)
3416 if mobj:
3417 try:
3418 jwplayer_data = self._parse_json(mobj.group('options'),
3419 video_id=video_id,
3420 transform_source=transform_source)
3421 except ExtractorError:
3422 pass
3423 else:
3424 if isinstance(jwplayer_data, dict):
3425 return jwplayer_data
3426
3427 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3428 jwplayer_data = self._find_jwplayer_data(
3429 webpage, video_id, transform_source=js_to_json)
3430 return self._parse_jwplayer_data(
3431 jwplayer_data, video_id, *args, **kwargs)
3432
3433 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3434 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3435 # JWPlayer backward compatibility: flattened playlists
3436 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
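# i.e. (illustrative) a config like {'file': 'foo.mp4'} is wrapped into
# {'playlist': [{'file': 'foo.mp4'}]} so it can be handled as a one-item playlist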
3437 if 'playlist' not in jwplayer_data:
3438 jwplayer_data = {'playlist': [jwplayer_data]}
3439
3440 entries = []
3441
3442 # JWPlayer backward compatibility: single playlist item
3443 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3444 if not isinstance(jwplayer_data['playlist'], list):
3445 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3446
3447 for video_data in jwplayer_data['playlist']:
3448 # JWPlayer backward compatibility: flattened sources
3449 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3450 if 'sources' not in video_data:
3451 video_data['sources'] = [video_data]
3452
3453 this_video_id = video_id or video_data['mediaid']
3454
3455 formats = self._parse_jwplayer_formats(
3456 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3457 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3458
3459 subtitles = {}
3460 tracks = video_data.get('tracks')
3461 if tracks and isinstance(tracks, list):
3462 for track in tracks:
3463 if not isinstance(track, dict):
3464 continue
3465 track_kind = track.get('kind')
3466 if not track_kind or not isinstance(track_kind, compat_str):
3467 continue
3468 if track_kind.lower() not in ('captions', 'subtitles'):
3469 continue
3470 track_url = urljoin(base_url, track.get('file'))
3471 if not track_url:
3472 continue
3473 subtitles.setdefault(track.get('label') or 'en', []).append({
3474 'url': self._proto_relative_url(track_url)
3475 })
3476
3477 entry = {
3478 'id': this_video_id,
3479 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3480 'description': clean_html(video_data.get('description')),
3481 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3482 'timestamp': int_or_none(video_data.get('pubdate')),
3483 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3484 'subtitles': subtitles,
3485 }
3486 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3487 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3488 entry.update({
3489 '_type': 'url_transparent',
3490 'url': formats[0]['url'],
3491 })
3492 else:
3493 self._sort_formats(formats)
3494 entry['formats'] = formats
3495 entries.append(entry)
3496 if len(entries) == 1:
3497 return entries[0]
3498 else:
3499 return self.playlist_result(entries)
3500
3501 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3502 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3503 urls = []
3504 formats = []
3505 for source in jwplayer_sources_data:
3506 if not isinstance(source, dict):
3507 continue
3508 source_url = urljoin(
3509 base_url, self._proto_relative_url(source.get('file')))
3510 if not source_url or source_url in urls:
3511 continue
3512 urls.append(source_url)
3513 source_type = source.get('type') or ''
3514 ext = mimetype2ext(source_type) or determine_ext(source_url)
3515 if source_type == 'hls' or ext == 'm3u8':
3516 formats.extend(self._extract_m3u8_formats(
3517 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3518 m3u8_id=m3u8_id, fatal=False))
3519 elif source_type == 'dash' or ext == 'mpd':
3520 formats.extend(self._extract_mpd_formats(
3521 source_url, video_id, mpd_id=mpd_id, fatal=False))
3522 elif ext == 'smil':
3523 formats.extend(self._extract_smil_formats(
3524 source_url, video_id, fatal=False))
3525 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3526 elif source_type.startswith('audio') or ext in (
3527 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3528 formats.append({
3529 'url': source_url,
3530 'vcodec': 'none',
3531 'ext': ext,
3532 })
3533 else:
3534 height = int_or_none(source.get('height'))
3535 if height is None:
3536 # Often no height is provided but there is a label in
3537 # a format like "1080p", "720p SD", or 1080.
3538 height = int_or_none(self._search_regex(
3539 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3540 'height', default=None))
3541 a_format = {
3542 'url': source_url,
3543 'width': int_or_none(source.get('width')),
3544 'height': height,
3545 'tbr': int_or_none(source.get('bitrate')),
3546 'ext': ext,
3547 }
3548 if source_url.startswith('rtmp'):
3549 a_format['ext'] = 'flv'
3550 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3551 # of jwplayer.flash.swf
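# e.g. (illustrative) 'rtmp://host/app/mp4:folder/file.mp4' is split into
# url 'rtmp://host/app/' and play_path 'mp4:folder/file.mp4'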
3552 rtmp_url_parts = re.split(
3553 r'((?:mp4|mp3|flv):)', source_url, 1)
3554 if len(rtmp_url_parts) == 3:
3555 rtmp_url, prefix, play_path = rtmp_url_parts
3556 a_format.update({
3557 'url': rtmp_url,
3558 'play_path': prefix + play_path,
3559 })
3560 if rtmp_params:
3561 a_format.update(rtmp_params)
3562 formats.append(a_format)
3563 return formats
3564
3565 def _live_title(self, name):
3566 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3567 return name
3568
3569 def _int(self, v, name, fatal=False, **kwargs):
3570 res = int_or_none(v, **kwargs)
3571 if res is None:
3572 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3573 if fatal:
3574 raise ExtractorError(msg)
3575 else:
3576 self.report_warning(msg)
3577 return res
3578
3579 def _float(self, v, name, fatal=False, **kwargs):
3580 res = float_or_none(v, **kwargs)
3581 if res is None:
3582 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3583 if fatal:
3584 raise ExtractorError(msg)
3585 else:
3586 self.report_warning(msg)
3587 return res
3588
3589 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3590 path='/', secure=False, discard=False, rest={}, **kwargs):
3591 cookie = compat_cookiejar_Cookie(
3592 0, name, value, port, port is not None, domain, True,
3593 domain.startswith('.'), path, True, secure, expire_time,
3594 discard, None, None, rest)
3595 self._downloader.cookiejar.set_cookie(cookie)
3596
3597 def _get_cookies(self, url):
3598 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3599 req = sanitized_Request(url)
3600 self._downloader.cookiejar.add_cookie_header(req)
3601 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3602
3603 def _apply_first_set_cookie_header(self, url_handle, cookie):
3604 """
3605 Apply first Set-Cookie header instead of the last. Experimental.
3606
3607 Some sites (e.g. [1-3]) may serve two cookies under the same name
3608 in the Set-Cookie header and expect the first (old) one to be set
3609 rather than the second (new) one. However, per RFC 6265 the newer
3610 cookie should be stored in the cookie store, which is what actually
3611 happens. We work around this issue by manually resetting the cookie
3612 to the first one.
3613 1. https://new.vk.com/
3614 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3615 3. https://learning.oreilly.com/
3616 """
3617 for header, cookies in url_handle.headers.items():
3618 if header.lower() != 'set-cookie':
3619 continue
3620 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3621 cookie_value = re.search(
3622 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3623 if cookie_value:
3624 value, domain = cookie_value.groups()
3625 self._set_cookie(domain, cookie, value)
3626 break
3627
3628 @classmethod
3629 def get_testcases(cls, include_onlymatching=False):
3630 t = getattr(cls, '_TEST', None)
3631 if t:
3632 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3633 tests = [t]
3634 else:
3635 tests = getattr(cls, '_TESTS', [])
3636 for t in tests:
3637 if not include_onlymatching and t.get('only_matching', False):
3638 continue
3639 t['name'] = cls.ie_key()
3640 yield t
3641
3642 @classproperty
3643 def age_limit(cls):
3644 """Get age limit from the testcases"""
3645 return max(traverse_obj(
3646 tuple(cls.get_testcases(include_onlymatching=False)),
3647 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3648
3649 @classmethod
3650 def is_suitable(cls, age_limit):
3651 """Test whether the extractor is generally suitable for the given age limit"""
3652 return not age_restricted(cls.age_limit, age_limit)
3653
3654 @classmethod
3655 def description(cls, *, markdown=True, search_examples=None):
3656 """Description of the extractor"""
3657 desc = ''
3658 if cls._NETRC_MACHINE:
3659 if markdown:
3660 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3661 else:
3662 desc += f' [{cls._NETRC_MACHINE}]'
3663 if cls.IE_DESC is False:
3664 desc += ' [HIDDEN]'
3665 elif cls.IE_DESC:
3666 desc += f' {cls.IE_DESC}'
3667 if cls.SEARCH_KEY:
3668 desc += f'; "{cls.SEARCH_KEY}:" prefix'
3669 if search_examples:
3670 _COUNTS = ('', '5', '10', 'all')
3671 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3672 if not cls.working():
3673 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3674
3675 name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3676 return f'{name}:{desc}' if desc else name
3677
3678 def extract_subtitles(self, *args, **kwargs):
3679 if (self.get_param('writesubtitles', False)
3680 or self.get_param('listsubtitles')):
3681 return self._get_subtitles(*args, **kwargs)
3682 return {}
3683
3684 def _get_subtitles(self, *args, **kwargs):
3685 raise NotImplementedError('This method must be implemented by subclasses')
3686
3687 def extract_comments(self, *args, **kwargs):
3688 if not self.get_param('getcomments'):
3689 return None
3690 generator = self._get_comments(*args, **kwargs)
3691
3692 def extractor():
3693 comments = []
3694 interrupted = True
3695 try:
3696 while True:
3697 comments.append(next(generator))
3698 except StopIteration:
3699 interrupted = False
3700 except KeyboardInterrupt:
3701 self.to_screen('Interrupted by user')
3702 except Exception as e:
3703 if self.get_param('ignoreerrors') is not True:
3704 raise
3705 self._downloader.report_error(e)
3706 comment_count = len(comments)
3707 self.to_screen(f'Extracted {comment_count} comments')
3708 return {
3709 'comments': comments,
3710 'comment_count': None if interrupted else comment_count
3711 }
3712 return extractor
3713
3714 def _get_comments(self, *args, **kwargs):
3715 raise NotImplementedError('This method must be implemented by subclasses')
3716
3717 @staticmethod
3718 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3719 """ Merge subtitle items for one language. Items with duplicated URLs/data
3720 will be dropped. """
3721 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3722 ret = list(subtitle_list1)
3723 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3724 return ret
3725
3726 @classmethod
3727 def _merge_subtitles(cls, *dicts, target=None):
3728 """ Merge subtitle dictionaries, language by language. """
3729 if target is None:
3730 target = {}
3731 for d in dicts:
3732 for lang, subs in d.items():
3733 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3734 return target
3735
3736 def extract_automatic_captions(self, *args, **kwargs):
3737 if (self.get_param('writeautomaticsub', False)
3738 or self.get_param('listsubtitles')):
3739 return self._get_automatic_captions(*args, **kwargs)
3740 return {}
3741
3742 def _get_automatic_captions(self, *args, **kwargs):
3743 raise NotImplementedError('This method must be implemented by subclasses')
3744
3745 @property
3746 def _cookies_passed(self):
3747 """Whether cookies have been passed to YoutubeDL"""
3748 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3749
3750 def mark_watched(self, *args, **kwargs):
3751 if not self.get_param('mark_watched', False):
3752 return
3753 if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3754 self._mark_watched(*args, **kwargs)
3755
3756 def _mark_watched(self, *args, **kwargs):
3757 raise NotImplementedError('This method must be implemented by subclasses')
3758
3759 def geo_verification_headers(self):
3760 headers = {}
3761 geo_verification_proxy = self.get_param('geo_verification_proxy')
3762 if geo_verification_proxy:
3763 headers['Ytdl-request-proxy'] = geo_verification_proxy
3764 return headers
3765
3766 def _generic_id(self, url):
3767 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3768
3769 def _generic_title(self, url):
3770 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3771
3772 @staticmethod
3773 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
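# Falls through from the most to the least restrictive state; 'public' is only
# reported when every flag is known (not None) and falsy, otherwise None (unknown)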
3774 all_known = all(map(
3775 lambda x: x is not None,
3776 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3777 return (
3778 'private' if is_private
3779 else 'premium_only' if needs_premium
3780 else 'subscriber_only' if needs_subscription
3781 else 'needs_auth' if needs_auth
3782 else 'unlisted' if is_unlisted
3783 else 'public' if all_known
3784 else None)
3785
3786 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3787 '''
3788 @returns A list of values for the extractor argument given by "key"
3789 or "default" if no such key is present
3790 @param default The default value to return when the key is not present (default: [])
3791 @param casesense When false, the values are converted to lower case
3792 '''
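# e.g. with --extractor-args "youtube:player_client=android", calling
# self._configuration_arg('player_client') from the youtube extractor returns ['android']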
3793 val = traverse_obj(
3794 self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3795 if val is None:
3796 return [] if default is NO_DEFAULT else default
3797 return list(val) if casesense else [x.lower() for x in val]
3798
3799 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
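# Returns True if the full playlist should be downloaded, False if only the single video;
# smuggled 'force_noplaylist' takes precedence, then the --no-playlist option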
3800 if not playlist_id or not video_id:
3801 return not video_id
3802
3803 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3804 if no_playlist is not None:
3805 return not no_playlist
3806
3807 video_id = '' if video_id is True else f' {video_id}'
3808 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3809 if self.get_param('noplaylist'):
3810 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3811 return False
3812 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3813 return True
3814
3815
3816 class SearchInfoExtractor(InfoExtractor):
3817 """
3818 Base class for paged search queries extractors.
3819 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3820 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3821 """
3822
3823 _MAX_RESULTS = float('inf')
3824
3825 @classmethod
3826 def _make_valid_url(cls):
3827 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3828
3829 def _real_extract(self, query):
3830 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3831 if prefix == '':
3832 return self._get_n_results(query, 1)
3833 elif prefix == 'all':
3834 return self._get_n_results(query, self._MAX_RESULTS)
3835 else:
3836 n = int(prefix)
3837 if n <= 0:
3838 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3839 elif n > self._MAX_RESULTS:
3840 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3841 n = self._MAX_RESULTS
3842 return self._get_n_results(query, n)
3843
3844 def _get_n_results(self, query, n):
3845 """Get a specified number of results for a query.
3846 Either this function or _search_results must be overridden by subclasses """
3847 return self.playlist_result(
3848 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3849 query, query)
3850
3851 def _search_results(self, query):
3852 """Returns an iterator of search results"""
3853 raise NotImplementedError('This method must be implemented by subclasses')
3854
3855 @classproperty
3856 def SEARCH_KEY(cls):
3857 return cls._SEARCH_KEY