import base64
import collections
import hashlib
import itertools
import json
import math
import netrc
import os
import random
import sys
import time
import xml.etree.ElementTree

from ..compat import functools, re  # isort: split
from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_fromstring,
    compat_expanduser,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
)
from ..downloader import FileDownloader
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..utils import (
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    GeoRestrictedError,
    GeoUtils,
    LenientJSONDecoder,
    RegexNotFoundError,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    determine_ext,
    determine_protocol,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    network_exceptions,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL, which processes this information, possibly
    downloading the video to the file system, among other possible outcomes.

    The _type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if the video
                    has no title, as opposed to "None", which signifies that
                    the extractor failed to obtain a title.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args      Extra arguments for ffmpeg downloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, but at least one of "text" or
                    "html" must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp, it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the playback should start, as
                    specified in the URL.
    end_time:       Time in seconds where the playback should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appearing on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.
    composer:       Composer of the piece

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this should define a _VALID_URL regexp and re-define the
    _real_extract() and (optionally) _real_initialize() methods.
    Probably, they should also be added to the list of extractors.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

    The _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    This does not, however, disable explicit geo restriction bypass based on
    the country code provided with geo_bypass_country.

    The _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by the
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, provided the mechanism is not disabled.

    The _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
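
    # A minimal sketch (illustrative only; the URL and values below are made
    # up) of the kind of dict a single-video extractor might return from
    # _real_extract():
    #
    #     {
    #         'id': '4234987',
    #         'display_id': 'dancing-naked-mole-rats',
    #         'title': 'Dancing naked mole rats',
    #         'url': 'https://example.com/media/4234987.mp4',
    #         'ext': 'mp4',
    #         'duration': 123.0,
    #         'uploader': 'Example Uploader',
    #     }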

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            None: '',
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            if '_VALID_URL' not in cls.__dict__:
                cls._VALID_URL = cls._make_valid_url()
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None
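
    # Illustrative only: _match_id() relies on _VALID_URL defining a named
    # group called 'id'. A hypothetical subclass might look like:
    #
    #     class ExampleIE(InfoExtractor):
    #         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\w+)'
    #
    #         def _real_extract(self, url):
    #             video_id = self._match_id(url)  # the 'id' group of _VALID_URL
    #             ...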

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism, which is
        based on faking the X-Forwarded-For HTTP header. A random country from
        the provided country list is selected and a random IP belonging to
        this country is generated. This IP will be passed as the
        X-Forwarded-For HTTP header in all subsequent HTTP requests.

        This method is called for the initial setup of the geo bypass
        mechanism during instance initialization, with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from an extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
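
    # Illustrative only: an extractor that only learns during extraction which
    # countries a video is unrestricted in can arm the bypass manually, e.g.
    #
    #     self._initialize_geo_bypass({'countries': ['DE', 'FR']})
    #
    # or with CIDR blocks instead of country codes:
    #
    #     self._initialize_geo_bypass({'ip_blocks': ['192.0.2.0/24']})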

    def extract(self, url):
        """Extracts URL information and returns it in a list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""
        pass

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""
        pass

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @classproperty
    def IE_NAME(cls):
        return cls.__name__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers={}, query={}):
        if isinstance(url_or_request, compat_urllib_request.Request):
            return update_Request(url_or_request, data=data, headers=headers, query=query)
        if query:
            url_or_request = update_url_query(url_or_request, query)
        return sanitized_Request(url_or_request, data, headers)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
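
    # Illustrative only: expected_status in action. A site (hypothetical URL)
    # that returns useful content alongside a 404 could be handled like:
    #
    #     webpage, urlh = self._download_webpage_handle(
    #         'https://example.com/api/video/%s' % video_id, video_id,
    #         expected_status=404)  # or e.g. (403, 404), or lambda c: c < 500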

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            # Note: _request_dump_filename takes (url, video_id), in that order
            filename = self._request_dump_filename(urlh.geturl(), video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
        try:
            return json.loads(
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            errmsg = f'{video_id}: Failed to parse JSON'
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)
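
    # Illustrative only: _parse_socket_response_as_json() slices out the JSON
    # object embedded in a polling frame. For a made-up socket.io-style
    # payload such as
    #
    #     42["message",{"status":"ok"}]
    #
    # only the portion between the first '{' and the last '}' is parsed,
    # i.e. {"status":"ok"}.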

    def __create_download_methods(name, parser, note, errnote, return_value):

        def parse(ie, content, *args, **kwargs):
            if parser is None:
                return content
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source, fatal)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
                '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
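
    # Illustrative only: the generated helpers are used like
    #
    #     data = self._download_json(
    #         'https://example.com/api/video/%s' % video_id, video_id,
    #         note='Downloading video info')
    #     title = data['title']
    #
    # _download_xml/_download_json first fetch the page via
    # _download_webpage_handle and then run it through _parse_xml/_parse_json.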

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        """

        R''' # NB: These are unused; should they be deprecated?
        if tries != 1:
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            timeout = 5
        else:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
        '''

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        msg += format_field(self._login_hint(method), template='. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }
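
    # Illustrative only: deferring a video found on a page to another
    # extractor (hypothetical video_id):
    #
    #     return self.url_result(
    #         'https://www.youtube.com/watch?v=%s' % video_id, ie='Youtube')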

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
        urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
                for m in orderedSet(map(getter, matches) if getter else matches))
        return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }
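
    # Illustrative only: building a playlist from per-episode url_result
    # entries (episode_urls and playlist_id are made-up names):
    #
    #     entries = [self.url_result(u) for u in episode_urls]
    #     return self.playlist_result(entries, playlist_id, 'Season 1')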

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if string is None:
            mobj = None
        elif isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
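
    # Illustrative only: typical _search_regex usage inside an extractor
    # (the pattern and field name are made up):
    #
    #     title = self._search_regex(
    #         r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
    #         default=None)  # default=None makes the lookup non-fatal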

    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', contains_pattern='.+', fatal=True, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        return self._parse_json(
            self._search_regex(rf'{start_pattern}\s*(?P<json>{{{contains_pattern}}})\s*{end_pattern}',
                               string, name, group='json', fatal=fatal) or '{}',
            video_id, fatal=fatal, ignore_extra=True, **kwargs) or {}
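
    # Illustrative only: extracting an embedded state object such as
    # `window.__DATA__ = {...};` from a page (the variable name is made up):
    #
    #     data = self._search_json(
    #         r'window\.__DATA__\s*=', webpage, 'initial data', video_id)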

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password
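
    # Illustrative only: a matching ~/.netrc entry for an extractor whose
    # _NETRC_MACHINE is 'example' (machine name and credentials are made up):
    #
    #     machine example login myuser password mypass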

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
                       % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)
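
    # Illustrative only: given markup like
    #
    #     <meta property="og:title" content="Dancing naked mole rats" />
    #
    # self._og_search_property('title', webpage) returns
    # 'Dancing naked mole rats'; _og_regexes() matches the tag with the
    # property/content attributes in either order.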

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, *, fatal=False, **kargs):
        return self._og_search_property('title', html, fatal=fatal, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
        return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)
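
    # Illustrative only: _html_search_meta accepts one name or several
    # fallbacks, e.g.
    #
    #     uploader = self._html_search_meta(
    #         ('author', 'twitter:creator'), webpage, 'uploader', default=None)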

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
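
    # Illustrative only: for a page embedding
    #
    #     <script type="application/ld+json">
    #     {"@type": "VideoObject", "name": "Dancing naked mole rats", ...}
    #     </script>
    #
    # _search_json_ld() collects every such block and funnels it through
    # _json_ld() below, which maps the schema.org properties onto info_dict
    # fields.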
1399
1400 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1401 if isinstance(json_ld, compat_str):
1402 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1403 if not json_ld:
1404 return {}
1405 info = {}
1406 if not isinstance(json_ld, (list, tuple, dict)):
1407 return info
1408 if isinstance(json_ld, dict):
1409 json_ld = [json_ld]
1410
1411 INTERACTION_TYPE_MAP = {
1412 'CommentAction': 'comment',
1413 'AgreeAction': 'like',
1414 'DisagreeAction': 'dislike',
1415 'LikeAction': 'like',
1416 'DislikeAction': 'dislike',
1417 'ListenAction': 'view',
1418 'WatchAction': 'view',
1419 'ViewAction': 'view',
1420 }
1421
1422 def is_type(e, *expected_types):
1423 type = variadic(traverse_obj(e, '@type'))
1424 return any(x in type for x in expected_types)
1425
1426 def extract_interaction_type(e):
1427 interaction_type = e.get('interactionType')
1428 if isinstance(interaction_type, dict):
1429 interaction_type = interaction_type.get('@type')
1430 return str_or_none(interaction_type)
1431
1432 def extract_interaction_statistic(e):
1433 interaction_statistic = e.get('interactionStatistic')
1434 if isinstance(interaction_statistic, dict):
1435 interaction_statistic = [interaction_statistic]
1436 if not isinstance(interaction_statistic, list):
1437 return
1438 for is_e in interaction_statistic:
1439 if not is_type(is_e, 'InteractionCounter'):
1440 continue
1441 interaction_type = extract_interaction_type(is_e)
1442 if not interaction_type:
1443 continue
1444 # Some sites provide the interaction count as a string instead of
1445 # an integer (as per spec), sometimes with non-digit characters
1446 # (e.g. ","), so extract the count with the more relaxed str_to_int
1447 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1448 if interaction_count is None:
1449 continue
1450 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1451 if not count_kind:
1452 continue
1453 count_key = '%s_count' % count_kind
1454 if info.get(count_key) is not None:
1455 continue
1456 info[count_key] = interaction_count
1457
1458 def extract_chapter_information(e):
1459 chapters = [{
1460 'title': part.get('name'),
1461 'start_time': part.get('startOffset'),
1462 'end_time': part.get('endOffset'),
1463 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1464 for idx, (last_c, current_c, next_c) in enumerate(zip(
1465 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1466 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1467 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1468 if None in current_c.values():
1469 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1470 return
1471 if chapters:
1472 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1473 info['chapters'] = chapters
1474
1475 def extract_video_object(e):
1476 assert is_type(e, 'VideoObject')
1477 author = e.get('author')
1478 info.update({
1479 'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
1480 'title': unescapeHTML(e.get('name')),
1481 'description': unescapeHTML(e.get('description')),
1482 'thumbnails': [{'url': url}
1483 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1484 if url_or_none(url)],
1485 'duration': parse_duration(e.get('duration')),
1486 'timestamp': unified_timestamp(e.get('uploadDate')),
1487 # author can be an instance of the 'Organization' or 'Person' types.
1488 # Both types can have a 'name' property (inherited from the 'Thing' type). [1]
1489 # However, some websites use the 'Text' type instead.
1490 # 1. https://schema.org/VideoObject
1491 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1492 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1493 'tbr': int_or_none(e.get('bitrate')),
1494 'width': int_or_none(e.get('width')),
1495 'height': int_or_none(e.get('height')),
1496 'view_count': int_or_none(e.get('interactionCount')),
1497 })
1498 extract_interaction_statistic(e)
1499 extract_chapter_information(e)
1500
1501 def traverse_json_ld(json_ld, at_top_level=True):
1502 for e in json_ld:
1503 if at_top_level and '@context' not in e:
1504 continue
1505 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1506 traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1507 break
1508 if expected_type is not None and not is_type(e, expected_type):
1509 continue
1510 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1511 if rating is not None:
1512 info['average_rating'] = rating
1513 if is_type(e, 'TVEpisode', 'Episode'):
1514 episode_name = unescapeHTML(e.get('name'))
1515 info.update({
1516 'episode': episode_name,
1517 'episode_number': int_or_none(e.get('episodeNumber')),
1518 'description': unescapeHTML(e.get('description')),
1519 })
1520 if not info.get('title') and episode_name:
1521 info['title'] = episode_name
1522 part_of_season = e.get('partOfSeason')
1523 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1524 info.update({
1525 'season': unescapeHTML(part_of_season.get('name')),
1526 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1527 })
1528 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1529 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1530 info['series'] = unescapeHTML(part_of_series.get('name'))
1531 elif is_type(e, 'Movie'):
1532 info.update({
1533 'title': unescapeHTML(e.get('name')),
1534 'description': unescapeHTML(e.get('description')),
1535 'duration': parse_duration(e.get('duration')),
1536 'timestamp': unified_timestamp(e.get('dateCreated')),
1537 })
1538 elif is_type(e, 'Article', 'NewsArticle'):
1539 info.update({
1540 'timestamp': parse_iso8601(e.get('datePublished')),
1541 'title': unescapeHTML(e.get('headline')),
1542 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1543 })
1544 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1545 extract_video_object(e['video'][0])
1546 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1547 extract_video_object(e['subjectOf'][0])
1548 elif is_type(e, 'VideoObject'):
1549 extract_video_object(e)
1550 if expected_type is None:
1551 continue
1552 else:
1553 break
1554 video = e.get('video')
1555 if is_type(video, 'VideoObject'):
1556 extract_video_object(video)
1557 if expected_type is None:
1558 continue
1559 else:
1560 break
1561 traverse_json_ld(json_ld)
1562
1563 return filter_dict(info)
1564
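# A minimal sketch of how an extractor might combine the JSON-LD helpers above;
# the URL and the resulting field layout are hypothetical, for illustration only:
#
#   webpage = self._download_webpage(url, video_id)
#   info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
#   # `info` may now carry title, description, thumbnails, duration, timestamp,
#   # uploader, chapters and the *_count fields filled in above
#   return {'id': video_id, 'title': self._html_extract_title(webpage), **info}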
1565 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1566 return self._parse_json(
1567 self._search_regex(
1568 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1569 webpage, 'next.js data', fatal=fatal, **kw),
1570 video_id, transform_source=transform_source, fatal=fatal)
1571
1572 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', return_full_data=False):
1573 ''' Parses Nuxt.js metadata. This works as long as the function that __NUXT__ invokes is a pure function. '''
1574 # not all websites do this, but it can be changed
1575 # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1576 rectx = re.escape(context_name)
1577 js, arg_keys, arg_vals = self._search_regex(
1578 (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1579 r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1580 webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1581
1582 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1583
1584 for key, val in args.items():
1585 if val in ('undefined', 'void 0'):
1586 args[key] = 'null'
1587
1588 ret = self._parse_json(js_to_json(js, args), video_id)
1589 if return_full_data:
1590 return ret
1591 return ret['data'][0]
1592
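# Illustrative sketch of the Next.js/Nuxt.js helpers above; the traversal paths
# and field names below are hypothetical and depend entirely on the site:
#
#   next_data = self._search_nextjs_data(webpage, video_id)
#   title = traverse_obj(next_data, ('props', 'pageProps', 'video', 'title'))
#
#   nuxt_data = self._search_nuxt_data(webpage, video_id)
#   formats = self._extract_m3u8_formats(nuxt_data['video']['hls_url'], video_id, 'mp4', fatal=False)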
1593 @staticmethod
1594 def _hidden_inputs(html):
1595 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1596 hidden_inputs = {}
1597 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1598 attrs = extract_attributes(input)
1599 if not attrs:
1600 continue
1601 if attrs.get('type') not in ('hidden', 'submit'):
1602 continue
1603 name = attrs.get('name') or attrs.get('id')
1604 value = attrs.get('value')
1605 if name and value is not None:
1606 hidden_inputs[name] = value
1607 return hidden_inputs
1608
1609 def _form_hidden_inputs(self, form_id, html):
1610 form = self._search_regex(
1611 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1612 html, '%s form' % form_id, group='form')
1613 return self._hidden_inputs(form)
1614
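# Illustrative sketch of a typical login flow built on the two helpers above;
# LOGIN_URL, the form id and the credential field names are hypothetical:
#
#   login_page = self._download_webpage(LOGIN_URL, None, 'Downloading login page')
#   form = self._form_hidden_inputs('login-form', login_page)  # picks up CSRF tokens etc.
#   form.update({'username': username, 'password': password})
#   self._download_webpage(
#       LOGIN_URL, None, 'Logging in',
#       data=compat_urllib_parse_urlencode(form).encode())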
1615 class FormatSort:
1616 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1617
1618 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1619 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1620 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
1621 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1622 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1623 'fps', 'fs_approx', 'source', 'id')
1624
1625 settings = {
1626 'vcodec': {'type': 'ordered', 'regex': True,
1627 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1628 'acodec': {'type': 'ordered', 'regex': True,
1629 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1630 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1631 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1632 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1633 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1634 'vext': {'type': 'ordered', 'field': 'video_ext',
1635 'order': ('mp4', 'webm', 'flv', '', 'none'),
1636 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1637 'aext': {'type': 'ordered', 'field': 'audio_ext',
1638 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1639 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1640 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1641 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1642 'field': ('vcodec', 'acodec'),
1643 'function': lambda it: int(any(v != 'none' for v in it))},
1644 'ie_pref': {'priority': True, 'type': 'extractor'},
1645 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1646 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1647 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1648 'quality': {'convert': 'float', 'default': -1},
1649 'filesize': {'convert': 'bytes'},
1650 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1651 'id': {'convert': 'string', 'field': 'format_id'},
1652 'height': {'convert': 'float_none'},
1653 'width': {'convert': 'float_none'},
1654 'fps': {'convert': 'float_none'},
1655 'tbr': {'convert': 'float_none'},
1656 'vbr': {'convert': 'float_none'},
1657 'abr': {'convert': 'float_none'},
1658 'asr': {'convert': 'float_none'},
1659 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1660
1661 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1662 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1663 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1664 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1665 'res': {'type': 'multiple', 'field': ('height', 'width'),
1666 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1667
1668 # For compatibility with youtube-dl
1669 'format_id': {'type': 'alias', 'field': 'id'},
1670 'preference': {'type': 'alias', 'field': 'ie_pref'},
1671 'language_preference': {'type': 'alias', 'field': 'lang'},
1672 'source_preference': {'type': 'alias', 'field': 'source'},
1673 'protocol': {'type': 'alias', 'field': 'proto'},
1674 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1675
1676 # Deprecated
1677 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1678 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1679 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1680 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1681 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1682 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1683 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1684 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1685 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1686 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1687 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1688 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1689 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1690 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1691 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1692 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1693 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1694 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1695 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1696 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1697 }
1698
1699 def __init__(self, ie, field_preference):
1700 self._order = []
1701 self.ydl = ie._downloader
1702 self.evaluate_params(self.ydl.params, field_preference)
1703 if ie.get_param('verbose'):
1704 self.print_verbose_info(self.ydl.write_debug)
1705
1706 def _get_field_setting(self, field, key):
1707 if field not in self.settings:
1708 if key in ('forced', 'priority'):
1709 return False
1710 self.ydl.deprecation_warning(
1711 f'Using arbitrary fields ({field}) for format sorting is deprecated '
1712 'and may be removed in a future version')
1713 self.settings[field] = {}
1714 propObj = self.settings[field]
1715 if key not in propObj:
1716 type = propObj.get('type')
1717 if key == 'field':
1718 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1719 elif key == 'convert':
1720 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1721 else:
1722 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1723 propObj[key] = default
1724 return propObj[key]
1725
1726 def _resolve_field_value(self, field, value, convertNone=False):
1727 if value is None:
1728 if not convertNone:
1729 return None
1730 else:
1731 value = value.lower()
1732 conversion = self._get_field_setting(field, 'convert')
1733 if conversion == 'ignore':
1734 return None
1735 if conversion == 'string':
1736 return value
1737 elif conversion == 'float_none':
1738 return float_or_none(value)
1739 elif conversion == 'bytes':
1740 return FileDownloader.parse_bytes(value)
1741 elif conversion == 'order':
1742 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1743 use_regex = self._get_field_setting(field, 'regex')
1744 list_length = len(order_list)
1745 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1746 if use_regex and value is not None:
1747 for i, regex in enumerate(order_list):
1748 if regex and re.match(regex, value):
1749 return list_length - i
1750 return list_length - empty_pos # not in list
1751 else: # not regex or value = None
1752 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1753 else:
1754 if value.isnumeric():
1755 return float(value)
1756 else:
1757 self.settings[field]['convert'] = 'string'
1758 return value
1759
1760 def evaluate_params(self, params, sort_extractor):
1761 self._use_free_order = params.get('prefer_free_formats', False)
1762 self._sort_user = params.get('format_sort', [])
1763 self._sort_extractor = sort_extractor
1764
1765 def add_item(field, reverse, closest, limit_text):
1766 field = field.lower()
1767 if field in self._order:
1768 return
1769 self._order.append(field)
1770 limit = self._resolve_field_value(field, limit_text)
1771 data = {
1772 'reverse': reverse,
1773 'closest': False if limit is None else closest,
1774 'limit_text': limit_text,
1775 'limit': limit}
1776 if field in self.settings:
1777 self.settings[field].update(data)
1778 else:
1779 self.settings[field] = data
1780
1781 sort_list = (
1782 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1783 + (tuple() if params.get('format_sort_force', False)
1784 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1785 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1786
1787 for item in sort_list:
1788 match = re.match(self.regex, item)
1789 if match is None:
1790 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1791 field = match.group('field')
1792 if field is None:
1793 continue
1794 if self._get_field_setting(field, 'type') == 'alias':
1795 alias, field = field, self._get_field_setting(field, 'field')
1796 if self._get_field_setting(alias, 'deprecated'):
1797 self.ydl.deprecation_warning(
1798 f'Format sorting alias {alias} is deprecated '
1799 f'and may be removed in a future version. Please use {field} instead')
1800 reverse = match.group('reverse') is not None
1801 closest = match.group('separator') == '~'
1802 limit_text = match.group('limit')
1803
1804 has_limit = limit_text is not None
1805 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1806 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1807
1808 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1809 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1810 limit_count = len(limits)
1811 for (i, f) in enumerate(fields):
1812 add_item(f, reverse, closest,
1813 limits[i] if i < limit_count
1814 else limits[0] if has_limit and not has_multiple_limits
1815 else None)
1816
1817 def print_verbose_info(self, write_debug):
1818 if self._sort_user:
1819 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1820 if self._sort_extractor:
1821 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1822 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1823 '+' if self._get_field_setting(field, 'reverse') else '', field,
1824 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1825 self._get_field_setting(field, 'limit_text'),
1826 self._get_field_setting(field, 'limit'))
1827 if self._get_field_setting(field, 'limit_text') is not None else '')
1828 for field in self._order if self._get_field_setting(field, 'visible')]))
1829
1830 def _calculate_field_preference_from_value(self, format, field, type, value):
1831 reverse = self._get_field_setting(field, 'reverse')
1832 closest = self._get_field_setting(field, 'closest')
1833 limit = self._get_field_setting(field, 'limit')
1834
1835 if type == 'extractor':
1836 maximum = self._get_field_setting(field, 'max')
1837 if value is None or (maximum is not None and value >= maximum):
1838 value = -1
1839 elif type == 'boolean':
1840 in_list = self._get_field_setting(field, 'in_list')
1841 not_in_list = self._get_field_setting(field, 'not_in_list')
1842 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1843 elif type == 'ordered':
1844 value = self._resolve_field_value(field, value, True)
1845
1846 # try to convert to number
1847 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1848 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1849 if is_num:
1850 value = val_num
1851
1852 return ((-10, 0) if value is None
1853 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1854 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1855 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1856 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1857 else (-1, value, 0))
1858
1859 def _calculate_field_preference(self, format, field):
1860 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1861 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1862 if type == 'multiple':
1863 type = 'field' # Only 'field' is allowed in multiple for now
1864 actual_fields = self._get_field_setting(field, 'field')
1865
1866 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1867 else:
1868 value = get_value(field)
1869 return self._calculate_field_preference_from_value(format, field, type, value)
1870
1871 def calculate_preference(self, format):
1872 # Determine missing protocol
1873 if not format.get('protocol'):
1874 format['protocol'] = determine_protocol(format)
1875
1876 # Determine missing ext
1877 if not format.get('ext') and 'url' in format:
1878 format['ext'] = determine_ext(format['url'])
1879 if format.get('vcodec') == 'none':
1880 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1881 format['video_ext'] = 'none'
1882 else:
1883 format['video_ext'] = format['ext']
1884 format['audio_ext'] = 'none'
1885 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1886 # format['preference'] = -1000
1887
1888 # Determine missing bitrates
1889 if format.get('tbr') is None:
1890 if format.get('vbr') is not None and format.get('abr') is not None:
1891 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1892 else:
1893 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1894 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1895 if format.get('acodec') != 'none' and format.get('abr') is None:
1896 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1897
1898 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1899
1900 def _sort_formats(self, formats, field_preference=[]):
1901 if not formats:
1902 return
1903 formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1904
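# Illustrative sketch: an extractor normally collects formats from one or more
# manifests and then calls _sort_formats, optionally with extractor-specific
# sort fields (the URLs and the field list below are only examples):
#
#   formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', fatal=False)
#   formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
#   self._sort_formats(formats, ('res', 'proto', 'tbr'))
#
# Users can override this ordering on the command line with -S/--format-sort;
# roughly speaking, `-S res:1080,+size` caps the preferred resolution at 1080p
# and then prefers smaller files.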
1905 def _check_formats(self, formats, video_id):
1906 if formats:
1907 formats[:] = filter(
1908 lambda f: self._is_valid_url(
1909 f['url'], video_id,
1910 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1911 formats)
1912
1913 @staticmethod
1914 def _remove_duplicate_formats(formats):
1915 format_urls = set()
1916 unique_formats = []
1917 for f in formats:
1918 if f['url'] not in format_urls:
1919 format_urls.add(f['url'])
1920 unique_formats.append(f)
1921 formats[:] = unique_formats
1922
1923 def _is_valid_url(self, url, video_id, item='video', headers={}):
1924 url = self._proto_relative_url(url, scheme='http:')
1925 # For now, assume non-HTTP(S) URLs are always valid
1926 if not (url.startswith('http://') or url.startswith('https://')):
1927 return True
1928 try:
1929 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1930 return True
1931 except ExtractorError as e:
1932 self.to_screen(
1933 '%s: %s URL is invalid, skipping: %s'
1934 % (video_id, item, error_to_compat_str(e.cause)))
1935 return False
1936
1937 def http_scheme(self):
1938 """ Either "http:" or "https:", depending on the user's preferences """
1939 return (
1940 'http:'
1941 if self.get_param('prefer_insecure', False)
1942 else 'https:')
1943
1944 def _proto_relative_url(self, url, scheme=None):
1945 if url is None:
1946 return url
1947 if url.startswith('//'):
1948 if scheme is None:
1949 scheme = self.http_scheme()
1950 return scheme + url
1951 else:
1952 return url
1953
1954 def _sleep(self, timeout, video_id, msg_template=None):
1955 if msg_template is None:
1956 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1957 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1958 self.to_screen(msg)
1959 time.sleep(timeout)
1960
1961 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1962 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1963 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1964 res = self._download_xml_handle(
1965 manifest_url, video_id, 'Downloading f4m manifest',
1966 'Unable to download f4m manifest',
1967 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1968 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1969 transform_source=transform_source,
1970 fatal=fatal, data=data, headers=headers, query=query)
1971 if res is False:
1972 return []
1973
1974 manifest, urlh = res
1975 manifest_url = urlh.geturl()
1976
1977 return self._parse_f4m_formats(
1978 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1979 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1980
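# Illustrative sketch (the manifest URL is hypothetical): HDS manifests are
# usually consumed from an extractor like this:
#
#   formats = self._extract_f4m_formats(
#       'https://example.com/stream/manifest.f4m', video_id, f4m_id='hds', fatal=False)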
1981 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1982 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1983 fatal=True, m3u8_id=None):
1984 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1985 return []
1986
1987 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1988 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1989 if akamai_pv is not None and ';' in akamai_pv.text:
1990 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1991 if playerVerificationChallenge.strip() != '':
1992 return []
1993
1994 formats = []
1995 manifest_version = '1.0'
1996 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1997 if not media_nodes:
1998 manifest_version = '2.0'
1999 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2000 # Remove unsupported DRM-protected media renditions from the final
2001 # formats (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2002 media_nodes = remove_encrypted_media(media_nodes)
2003 if not media_nodes:
2004 return formats
2005
2006 manifest_base_url = get_base_url(manifest)
2007
2008 bootstrap_info = xpath_element(
2009 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2010 'bootstrap info', default=None)
2011
2012 vcodec = None
2013 mime_type = xpath_text(
2014 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2015 'base URL', default=None)
2016 if mime_type and mime_type.startswith('audio/'):
2017 vcodec = 'none'
2018
2019 for i, media_el in enumerate(media_nodes):
2020 tbr = int_or_none(media_el.attrib.get('bitrate'))
2021 width = int_or_none(media_el.attrib.get('width'))
2022 height = int_or_none(media_el.attrib.get('height'))
2023 format_id = join_nonempty(f4m_id, tbr or i)
2024 # If <bootstrapInfo> is present, the specified f4m is a
2025 # stream-level manifest, and only set-level manifests may refer to
2026 # external resources. See section 11.4 and section 4 of F4M spec
2027 if bootstrap_info is None:
2028 media_url = None
2029 # @href is introduced in 2.0, see section 11.6 of F4M spec
2030 if manifest_version == '2.0':
2031 media_url = media_el.attrib.get('href')
2032 if media_url is None:
2033 media_url = media_el.attrib.get('url')
2034 if not media_url:
2035 continue
2036 manifest_url = (
2037 media_url if media_url.startswith('http://') or media_url.startswith('https://')
2038 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2039 # If media_url is itself an f4m manifest, do the recursive extraction,
2040 # since bitrates in the parent manifest (this one) and the media_url
2041 # manifest may differ, making it impossible to resolve the format by
2042 # requested bitrate in the f4m downloader
2043 ext = determine_ext(manifest_url)
2044 if ext == 'f4m':
2045 f4m_formats = self._extract_f4m_formats(
2046 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2047 transform_source=transform_source, fatal=fatal)
2048 # Sometimes a stream-level manifest contains a single media entry that
2049 # does not carry any quality metadata (e.g. http://matchtv.ru/#live-player),
2050 # while the parent's media entry in the set-level manifest may contain it.
2051 # In such cases we copy the metadata from the parent.
2052 if len(f4m_formats) == 1:
2053 f = f4m_formats[0]
2054 f.update({
2055 'tbr': f.get('tbr') or tbr,
2056 'width': f.get('width') or width,
2057 'height': f.get('height') or height,
2058 'format_id': f.get('format_id') if not tbr else format_id,
2059 'vcodec': vcodec,
2060 })
2061 formats.extend(f4m_formats)
2062 continue
2063 elif ext == 'm3u8':
2064 formats.extend(self._extract_m3u8_formats(
2065 manifest_url, video_id, 'mp4', preference=preference,
2066 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2067 continue
2068 formats.append({
2069 'format_id': format_id,
2070 'url': manifest_url,
2071 'manifest_url': manifest_url,
2072 'ext': 'flv' if bootstrap_info is not None else None,
2073 'protocol': 'f4m',
2074 'tbr': tbr,
2075 'width': width,
2076 'height': height,
2077 'vcodec': vcodec,
2078 'preference': preference,
2079 'quality': quality,
2080 })
2081 return formats
2082
2083 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2084 return {
2085 'format_id': join_nonempty(m3u8_id, 'meta'),
2086 'url': m3u8_url,
2087 'ext': ext,
2088 'protocol': 'm3u8',
2089 'preference': preference - 100 if preference else -100,
2090 'quality': quality,
2091 'resolution': 'multiple',
2092 'format_note': 'Quality selection URL',
2093 }
2094
2095 def _report_ignoring_subs(self, name):
2096 self.report_warning(bug_reports_message(
2097 f'Ignoring subtitle tracks found in the {name} manifest; '
2098 'if any subtitle tracks are missing,'
2099 ), only_once=True)
2100
2101 def _extract_m3u8_formats(self, *args, **kwargs):
2102 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2103 if subs:
2104 self._report_ignoring_subs('HLS')
2105 return fmts
2106
2107 def _extract_m3u8_formats_and_subtitles(
2108 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2109 preference=None, quality=None, m3u8_id=None, note=None,
2110 errnote=None, fatal=True, live=False, data=None, headers={},
2111 query={}):
2112
2113 res = self._download_webpage_handle(
2114 m3u8_url, video_id,
2115 note='Downloading m3u8 information' if note is None else note,
2116 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2117 fatal=fatal, data=data, headers=headers, query=query)
2118
2119 if res is False:
2120 return [], {}
2121
2122 m3u8_doc, urlh = res
2123 m3u8_url = urlh.geturl()
2124
2125 return self._parse_m3u8_formats_and_subtitles(
2126 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2127 preference=preference, quality=quality, m3u8_id=m3u8_id,
2128 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2129 headers=headers, query=query, video_id=video_id)
2130
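# Illustrative sketch (the playlist URL is hypothetical): most extractors
# consume HLS master playlists through the helper above and merge any
# discovered subtitles into their own subtitle dict:
#
#   fmts, subs = self._extract_m3u8_formats_and_subtitles(
#       'https://example.com/master.m3u8', video_id, 'mp4',
#       entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
#   formats.extend(fmts)
#   self._merge_subtitles(subs, target=subtitles)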
2131 def _parse_m3u8_formats_and_subtitles(
2132 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2133 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2134 errnote=None, fatal=True, data=None, headers={}, query={},
2135 video_id=None):
2136 formats, subtitles = [], {}
2137
2138 has_drm = re.search('|'.join([
2139 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2140 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2141 ]), m3u8_doc)
2142
2143 def format_url(url):
2144 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2145
2146 if self.get_param('hls_split_discontinuity', False):
2147 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2148 if not m3u8_doc:
2149 if not manifest_url:
2150 return []
2151 m3u8_doc = self._download_webpage(
2152 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2153 note=False, errnote='Failed to download m3u8 playlist information')
2154 if m3u8_doc is False:
2155 return []
2156 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2157
2158 else:
2159 def _extract_m3u8_playlist_indices(*args, **kwargs):
2160 return [None]
2161
2162 # References:
2163 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2164 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2165 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2166
2167 # We should try extracting formats only from master playlists [1, 4.3.4],
2168 # i.e. playlists that describe the available qualities. On the other hand,
2169 # media playlists [1, 4.3.3] should be returned as is since they contain
2170 # just the media without quality renditions.
2171 # Fortunately, a master playlist can easily be distinguished from a media
2172 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2173 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2174 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2175 # media playlist and MUST NOT appear in a master playlist, so we can
2176 # reliably detect a media playlist with this criterion.
2177
2178 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2179 formats = [{
2180 'format_id': join_nonempty(m3u8_id, idx),
2181 'format_index': idx,
2182 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2183 'ext': ext,
2184 'protocol': entry_protocol,
2185 'preference': preference,
2186 'quality': quality,
2187 'has_drm': has_drm,
2188 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2189
2190 return formats, subtitles
2191
2192 groups = {}
2193 last_stream_inf = {}
2194
2195 def extract_media(x_media_line):
2196 media = parse_m3u8_attributes(x_media_line)
2197 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2198 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2199 if not (media_type and group_id and name):
2200 return
2201 groups.setdefault(group_id, []).append(media)
2202 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2203 if media_type == 'SUBTITLES':
2204 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2205 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2206 # However, lack of URI has been spotted in the wild.
2207 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2208 if not media.get('URI'):
2209 return
2210 url = format_url(media['URI'])
2211 sub_info = {
2212 'url': url,
2213 'ext': determine_ext(url),
2214 }
2215 if sub_info['ext'] == 'm3u8':
2216 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2217 # files may contain is WebVTT:
2218 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2219 sub_info['ext'] = 'vtt'
2220 sub_info['protocol'] = 'm3u8_native'
2221 lang = media.get('LANGUAGE') or 'und'
2222 subtitles.setdefault(lang, []).append(sub_info)
2223 if media_type not in ('VIDEO', 'AUDIO'):
2224 return
2225 media_url = media.get('URI')
2226 if media_url:
2227 manifest_url = format_url(media_url)
2228 formats.extend({
2229 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2230 'format_note': name,
2231 'format_index': idx,
2232 'url': manifest_url,
2233 'manifest_url': m3u8_url,
2234 'language': media.get('LANGUAGE'),
2235 'ext': ext,
2236 'protocol': entry_protocol,
2237 'preference': preference,
2238 'quality': quality,
2239 'vcodec': 'none' if media_type == 'AUDIO' else None,
2240 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2241
2242 def build_stream_name():
2243 # Although the specification does not mention a NAME attribute for the
2244 # EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2245 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2246 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2247 stream_name = last_stream_inf.get('NAME')
2248 if stream_name:
2249 return stream_name
2250 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2251 # from corresponding rendition group
2252 stream_group_id = last_stream_inf.get('VIDEO')
2253 if not stream_group_id:
2254 return
2255 stream_group = groups.get(stream_group_id)
2256 if not stream_group:
2257 return stream_group_id
2258 rendition = stream_group[0]
2259 return rendition.get('NAME') or stream_group_id
2260
2261 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2262 # chance to detect video only formats when EXT-X-STREAM-INF tags
2263 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2264 for line in m3u8_doc.splitlines():
2265 if line.startswith('#EXT-X-MEDIA:'):
2266 extract_media(line)
2267
2268 for line in m3u8_doc.splitlines():
2269 if line.startswith('#EXT-X-STREAM-INF:'):
2270 last_stream_inf = parse_m3u8_attributes(line)
2271 elif line.startswith('#') or not line.strip():
2272 continue
2273 else:
2274 tbr = float_or_none(
2275 last_stream_inf.get('AVERAGE-BANDWIDTH')
2276 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2277 manifest_url = format_url(line.strip())
2278
2279 for idx in _extract_m3u8_playlist_indices(manifest_url):
2280 format_id = [m3u8_id, None, idx]
2281 # Bandwidth of live streams may differ over time thus making
2282 # format_id unpredictable. So it's better to keep provided
2283 # format_id intact.
2284 if not live:
2285 stream_name = build_stream_name()
2286 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2287 f = {
2288 'format_id': join_nonempty(*format_id),
2289 'format_index': idx,
2290 'url': manifest_url,
2291 'manifest_url': m3u8_url,
2292 'tbr': tbr,
2293 'ext': ext,
2294 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2295 'protocol': entry_protocol,
2296 'preference': preference,
2297 'quality': quality,
2298 }
2299 resolution = last_stream_inf.get('RESOLUTION')
2300 if resolution:
2301 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2302 if mobj:
2303 f['width'] = int(mobj.group('width'))
2304 f['height'] = int(mobj.group('height'))
2305 # Unified Streaming Platform
2306 mobj = re.search(
2307 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2308 if mobj:
2309 abr, vbr = mobj.groups()
2310 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2311 f.update({
2312 'vbr': vbr,
2313 'abr': abr,
2314 })
2315 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2316 f.update(codecs)
2317 audio_group_id = last_stream_inf.get('AUDIO')
2318 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2319 # references a rendition group MUST have a CODECS attribute.
2320 # However, this is not always respected, for example, [2]
2321 # contains EXT-X-STREAM-INF tag which references AUDIO
2322 # rendition group but does not have CODECS and despite
2323 # referencing an audio group it represents a complete
2324 # (with audio and video) format. So, for such cases we will
2325 # ignore references to rendition groups and treat them
2326 # as complete formats.
2327 if audio_group_id and codecs and f.get('vcodec') != 'none':
2328 audio_group = groups.get(audio_group_id)
2329 if audio_group and audio_group[0].get('URI'):
2330 # TODO: update acodec for audio only formats with
2331 # the same GROUP-ID
2332 f['acodec'] = 'none'
2333 if not f.get('ext'):
2334 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2335 formats.append(f)
2336
2337 # for DailyMotion
2338 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2339 if progressive_uri:
2340 http_f = f.copy()
2341 del http_f['manifest_url']
2342 http_f.update({
2343 'format_id': f['format_id'].replace('hls-', 'http-'),
2344 'protocol': 'http',
2345 'url': progressive_uri,
2346 })
2347 formats.append(http_f)
2348
2349 last_stream_inf = {}
2350 return formats, subtitles
2351
2352 def _extract_m3u8_vod_duration(
2353 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2354
2355 m3u8_vod = self._download_webpage(
2356 m3u8_vod_url, video_id,
2357 note='Downloading m3u8 VOD manifest' if note is None else note,
2358 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2359 fatal=False, data=data, headers=headers, query=query)
2360
2361 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2362
2363 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2364 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2365 return None
2366
2367 return int(sum(
2368 float(line[len('#EXTINF:'):].split(',')[0])
2369 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2370
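# Worked example: for a (hypothetical) VOD media playlist containing
#
#   #EXT-X-PLAYLIST-TYPE:VOD
#   #EXTINF:10.000,
#   #EXTINF:9.500,
#   #EXTINF:3.250,
#
# the #EXTINF durations sum to 22.75 seconds, so the method above returns
# int(22.75) == 22.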
2371 @staticmethod
2372 def _xpath_ns(path, namespace=None):
2373 if not namespace:
2374 return path
2375 out = []
2376 for c in path.split('/'):
2377 if not c or c == '.':
2378 out.append(c)
2379 else:
2380 out.append('{%s}%s' % (namespace, c))
2381 return '/'.join(out)
2382
2383 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2384 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2385 if res is False:
2386 assert not fatal
2387 return [], {}
2388
2389 smil, urlh = res
2390 smil_url = urlh.geturl()
2391
2392 namespace = self._parse_smil_namespace(smil)
2393
2394 fmts = self._parse_smil_formats(
2395 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2396 subs = self._parse_smil_subtitles(
2397 smil, namespace=namespace)
2398
2399 return fmts, subs
2400
2401 def _extract_smil_formats(self, *args, **kwargs):
2402 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2403 if subs:
2404 self._report_ignoring_subs('SMIL')
2405 return fmts
2406
2407 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2408 res = self._download_smil(smil_url, video_id, fatal=fatal)
2409 if res is False:
2410 return {}
2411
2412 smil, urlh = res
2413 smil_url = urlh.geturl()
2414
2415 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2416
2417 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2418 return self._download_xml_handle(
2419 smil_url, video_id, 'Downloading SMIL file',
2420 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2421
2422 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2423 namespace = self._parse_smil_namespace(smil)
2424
2425 formats = self._parse_smil_formats(
2426 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2427 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2428
2429 video_id = os.path.splitext(url_basename(smil_url))[0]
2430 title = None
2431 description = None
2432 upload_date = None
2433 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2434 name = meta.attrib.get('name')
2435 content = meta.attrib.get('content')
2436 if not name or not content:
2437 continue
2438 if not title and name == 'title':
2439 title = content
2440 elif not description and name in ('description', 'abstract'):
2441 description = content
2442 elif not upload_date and name == 'date':
2443 upload_date = unified_strdate(content)
2444
2445 thumbnails = [{
2446 'id': image.get('type'),
2447 'url': image.get('src'),
2448 'width': int_or_none(image.get('width')),
2449 'height': int_or_none(image.get('height')),
2450 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2451
2452 return {
2453 'id': video_id,
2454 'title': title or video_id,
2455 'description': description,
2456 'upload_date': upload_date,
2457 'thumbnails': thumbnails,
2458 'formats': formats,
2459 'subtitles': subtitles,
2460 }
2461
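# Illustrative sketch (the SMIL URL is hypothetical): extractors either take
# the fully parsed info dict or just the formats:
#
#   info = self._extract_smil_info('https://example.com/video.smil', video_id)
#   # or, when only formats are needed:
#   formats = self._extract_smil_formats('https://example.com/video.smil', video_id, fatal=False)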
2462 def _parse_smil_namespace(self, smil):
2463 return self._search_regex(
2464 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2465
2466 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2467 base = smil_url
2468 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2469 b = meta.get('base') or meta.get('httpBase')
2470 if b:
2471 base = b
2472 break
2473
2474 formats = []
2475 rtmp_count = 0
2476 http_count = 0
2477 m3u8_count = 0
2478 imgs_count = 0
2479
2480 srcs = set()
2481 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2482 for medium in media:
2483 src = medium.get('src')
2484 if not src or src in srcs:
2485 continue
2486 srcs.add(src)
2487
2488 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2489 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2490 width = int_or_none(medium.get('width'))
2491 height = int_or_none(medium.get('height'))
2492 proto = medium.get('proto')
2493 ext = medium.get('ext')
2494 src_ext = determine_ext(src)
2495 streamer = medium.get('streamer') or base
2496
2497 if proto == 'rtmp' or streamer.startswith('rtmp'):
2498 rtmp_count += 1
2499 formats.append({
2500 'url': streamer,
2501 'play_path': src,
2502 'ext': 'flv',
2503 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2504 'tbr': bitrate,
2505 'filesize': filesize,
2506 'width': width,
2507 'height': height,
2508 })
2509 if transform_rtmp_url:
2510 streamer, src = transform_rtmp_url(streamer, src)
2511 formats[-1].update({
2512 'url': streamer,
2513 'play_path': src,
2514 })
2515 continue
2516
2517 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2518 src_url = src_url.strip()
2519
2520 if proto == 'm3u8' or src_ext == 'm3u8':
2521 m3u8_formats = self._extract_m3u8_formats(
2522 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2523 if len(m3u8_formats) == 1:
2524 m3u8_count += 1
2525 m3u8_formats[0].update({
2526 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2527 'tbr': bitrate,
2528 'width': width,
2529 'height': height,
2530 })
2531 formats.extend(m3u8_formats)
2532 elif src_ext == 'f4m':
2533 f4m_url = src_url
2534 if not f4m_params:
2535 f4m_params = {
2536 'hdcore': '3.2.0',
2537 'plugin': 'flowplayer-3.2.0.1',
2538 }
2539 f4m_url += '&' if '?' in f4m_url else '?'
2540 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2541 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2542 elif src_ext == 'mpd':
2543 formats.extend(self._extract_mpd_formats(
2544 src_url, video_id, mpd_id='dash', fatal=False))
2545 elif re.search(r'\.ism/[Mm]anifest', src_url):
2546 formats.extend(self._extract_ism_formats(
2547 src_url, video_id, ism_id='mss', fatal=False))
2548 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2549 http_count += 1
2550 formats.append({
2551 'url': src_url,
2552 'ext': ext or src_ext or 'flv',
2553 'format_id': 'http-%d' % (bitrate or http_count),
2554 'tbr': bitrate,
2555 'filesize': filesize,
2556 'width': width,
2557 'height': height,
2558 })
2559
2560 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2561 src = medium.get('src')
2562 if not src or src in srcs:
2563 continue
2564 srcs.add(src)
2565
2566 imgs_count += 1
2567 formats.append({
2568 'format_id': 'imagestream-%d' % (imgs_count),
2569 'url': src,
2570 'ext': mimetype2ext(medium.get('type')),
2571 'acodec': 'none',
2572 'vcodec': 'none',
2573 'width': int_or_none(medium.get('width')),
2574 'height': int_or_none(medium.get('height')),
2575 'format_note': 'SMIL storyboards',
2576 })
2577
2578 return formats
2579
2580 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2581 urls = []
2582 subtitles = {}
2583 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2584 src = textstream.get('src')
2585 if not src or src in urls:
2586 continue
2587 urls.append(src)
2588 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2589 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2590 subtitles.setdefault(lang, []).append({
2591 'url': src,
2592 'ext': ext,
2593 })
2594 return subtitles
2595
2596 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2597 res = self._download_xml_handle(
2598 xspf_url, playlist_id, 'Downloading xspf playlist',
2599 'Unable to download xspf manifest', fatal=fatal)
2600 if res is False:
2601 return []
2602
2603 xspf, urlh = res
2604 xspf_url = urlh.geturl()
2605
2606 return self._parse_xspf(
2607 xspf, playlist_id, xspf_url=xspf_url,
2608 xspf_base_url=base_url(xspf_url))
2609
2610 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2611 NS_MAP = {
2612 'xspf': 'http://xspf.org/ns/0/',
2613 's1': 'http://static.streamone.nl/player/ns/0',
2614 }
2615
2616 entries = []
2617 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2618 title = xpath_text(
2619 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2620 description = xpath_text(
2621 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2622 thumbnail = xpath_text(
2623 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2624 duration = float_or_none(
2625 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2626
2627 formats = []
2628 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2629 format_url = urljoin(xspf_base_url, location.text)
2630 if not format_url:
2631 continue
2632 formats.append({
2633 'url': format_url,
2634 'manifest_url': xspf_url,
2635 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2636 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2637 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2638 })
2639 self._sort_formats(formats)
2640
2641 entries.append({
2642 'id': playlist_id,
2643 'title': title,
2644 'description': description,
2645 'thumbnail': thumbnail,
2646 'duration': duration,
2647 'formats': formats,
2648 })
2649 return entries
2650
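# Illustrative sketch (the playlist URL is hypothetical):
#
#   entries = self._extract_xspf_playlist('https://example.com/playlist.xspf', playlist_id)
#   return self.playlist_result(entries, playlist_id)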
2651 def _extract_mpd_formats(self, *args, **kwargs):
2652 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2653 if subs:
2654 self._report_ignoring_subs('DASH')
2655 return fmts
2656
2657 def _extract_mpd_formats_and_subtitles(
2658 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2659 fatal=True, data=None, headers={}, query={}):
2660 res = self._download_xml_handle(
2661 mpd_url, video_id,
2662 note='Downloading MPD manifest' if note is None else note,
2663 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2664 fatal=fatal, data=data, headers=headers, query=query)
2665 if res is False:
2666 return [], {}
2667 mpd_doc, urlh = res
2668 if mpd_doc is None:
2669 return [], {}
2670
2671 # We could have been redirected to a new url when we retrieved our mpd file.
2672 mpd_url = urlh.geturl()
2673 mpd_base_url = base_url(mpd_url)
2674
2675 return self._parse_mpd_formats_and_subtitles(
2676 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2677
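# Illustrative sketch (the MPD URL is hypothetical): DASH manifests are
# consumed much like HLS ones:
#
#   fmts, subs = self._extract_mpd_formats_and_subtitles(
#       'https://example.com/stream.mpd', video_id, mpd_id='dash', fatal=False)
#   formats.extend(fmts)
#   self._merge_subtitles(subs, target=subtitles)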
2678 def _parse_mpd_formats(self, *args, **kwargs):
2679 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2680 if subs:
2681 self._report_ignoring_subs('DASH')
2682 return fmts
2683
2684 def _parse_mpd_formats_and_subtitles(
2685 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2686 """
2687 Parse formats from MPD manifest.
2688 References:
2689 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2690 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2691 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2692 """
2693 if not self.get_param('dynamic_mpd', True):
2694 if mpd_doc.get('type') == 'dynamic':
2695 return [], {}
2696
2697 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2698
2699 def _add_ns(path):
2700 return self._xpath_ns(path, namespace)
2701
2702 def is_drm_protected(element):
2703 return element.find(_add_ns('ContentProtection')) is not None
2704
2705 def extract_multisegment_info(element, ms_parent_info):
2706 ms_info = ms_parent_info.copy()
2707
2708 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2709 # common attributes and elements. We will only extract what is
2710 # relevant for us.
2711 def extract_common(source):
2712 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2713 if segment_timeline is not None:
2714 s_e = segment_timeline.findall(_add_ns('S'))
2715 if s_e:
2716 ms_info['total_number'] = 0
2717 ms_info['s'] = []
2718 for s in s_e:
2719 r = int(s.get('r', 0))
2720 ms_info['total_number'] += 1 + r
2721 ms_info['s'].append({
2722 't': int(s.get('t', 0)),
2723 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2724 'd': int(s.attrib['d']),
2725 'r': r,
2726 })
2727 start_number = source.get('startNumber')
2728 if start_number:
2729 ms_info['start_number'] = int(start_number)
2730 timescale = source.get('timescale')
2731 if timescale:
2732 ms_info['timescale'] = int(timescale)
2733 segment_duration = source.get('duration')
2734 if segment_duration:
2735 ms_info['segment_duration'] = float(segment_duration)
2736
2737 def extract_Initialization(source):
2738 initialization = source.find(_add_ns('Initialization'))
2739 if initialization is not None:
2740 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2741
2742 segment_list = element.find(_add_ns('SegmentList'))
2743 if segment_list is not None:
2744 extract_common(segment_list)
2745 extract_Initialization(segment_list)
2746 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2747 if segment_urls_e:
2748 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2749 else:
2750 segment_template = element.find(_add_ns('SegmentTemplate'))
2751 if segment_template is not None:
2752 extract_common(segment_template)
2753 media = segment_template.get('media')
2754 if media:
2755 ms_info['media'] = media
2756 initialization = segment_template.get('initialization')
2757 if initialization:
2758 ms_info['initialization'] = initialization
2759 else:
2760 extract_Initialization(segment_template)
2761 return ms_info
2762
2763 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2764 formats, subtitles = [], {}
2765 stream_numbers = collections.defaultdict(int)
2766 for period in mpd_doc.findall(_add_ns('Period')):
2767 period_duration = parse_duration(period.get('duration')) or mpd_duration
2768 period_ms_info = extract_multisegment_info(period, {
2769 'start_number': 1,
2770 'timescale': 1,
2771 })
2772 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2773 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2774 for representation in adaptation_set.findall(_add_ns('Representation')):
2775 representation_attrib = adaptation_set.attrib.copy()
2776 representation_attrib.update(representation.attrib)
2777 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2778 mime_type = representation_attrib['mimeType']
2779 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2780
2781 codec_str = representation_attrib.get('codecs', '')
2782 # Some kind of binary subtitle found in some youtube livestreams
2783 if mime_type == 'application/x-rawcc':
2784 codecs = {'scodec': codec_str}
2785 else:
2786 codecs = parse_codecs(codec_str)
2787 if content_type not in ('video', 'audio', 'text'):
2788 if mime_type == 'image/jpeg':
2789 content_type = mime_type
2790 elif codecs.get('vcodec', 'none') != 'none':
2791 content_type = 'video'
2792 elif codecs.get('acodec', 'none') != 'none':
2793 content_type = 'audio'
2794 elif codecs.get('scodec', 'none') != 'none':
2795 content_type = 'text'
2796 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2797 content_type = 'text'
2798 else:
2799 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2800 continue
2801
2802 base_url = ''
2803 for element in (representation, adaptation_set, period, mpd_doc):
2804 base_url_e = element.find(_add_ns('BaseURL'))
2805 if base_url_e is not None:
2806 base_url = base_url_e.text + base_url
2807 if re.match(r'^https?://', base_url):
2808 break
2809 if mpd_base_url and base_url.startswith('/'):
2810 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2811 elif mpd_base_url and not re.match(r'^https?://', base_url):
2812 if not mpd_base_url.endswith('/'):
2813 mpd_base_url += '/'
2814 base_url = mpd_base_url + base_url
2815 representation_id = representation_attrib.get('id')
2816 lang = representation_attrib.get('lang')
2817 url_el = representation.find(_add_ns('BaseURL'))
2818 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2819 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2820 if representation_id is not None:
2821 format_id = representation_id
2822 else:
2823 format_id = content_type
2824 if mpd_id:
2825 format_id = mpd_id + '-' + format_id
2826 if content_type in ('video', 'audio'):
2827 f = {
2828 'format_id': format_id,
2829 'manifest_url': mpd_url,
2830 'ext': mimetype2ext(mime_type),
2831 'width': int_or_none(representation_attrib.get('width')),
2832 'height': int_or_none(representation_attrib.get('height')),
2833 'tbr': float_or_none(bandwidth, 1000),
2834 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2835 'fps': int_or_none(representation_attrib.get('frameRate')),
2836 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2837 'format_note': 'DASH %s' % content_type,
2838 'filesize': filesize,
2839 'container': mimetype2ext(mime_type) + '_dash',
2840 **codecs
2841 }
2842 elif content_type == 'text':
2843 f = {
2844 'ext': mimetype2ext(mime_type),
2845 'manifest_url': mpd_url,
2846 'filesize': filesize,
2847 }
2848 elif content_type == 'image/jpeg':
2849 # See test case in VikiIE
2850 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2851 f = {
2852 'format_id': format_id,
2853 'ext': 'mhtml',
2854 'manifest_url': mpd_url,
2855 'format_note': 'DASH storyboards (jpeg)',
2856 'acodec': 'none',
2857 'vcodec': 'none',
2858 }
2859 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2860 f['has_drm'] = True
2861 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2862
2863 def prepare_template(template_name, identifiers):
2864 tmpl = representation_ms_info[template_name]
2865 # First off, % characters outside $...$ templates
2866 # must be escaped by doubling for proper processing
2867 # by the %-operator string formatting used below (see
2868 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2869 t = ''
2870 in_template = False
2871 for c in tmpl:
2872 t += c
2873 if c == '$':
2874 in_template = not in_template
2875 elif c == '%' and not in_template:
2876 t += c
2877 # Next, $...$ templates are translated to their
2878 # %(...) counterparts to be used with % operator
2879 if representation_id is not None:
2880 t = t.replace('$RepresentationID$', representation_id)
2881 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2882 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2883 t = t.replace('$$', '$')
2884 return t
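# An illustrative sketch of the translation performed above (hypothetical template values):
# a DASH media template such as
#     '$RepresentationID$/seg-$Number%05d$.m4s'
# becomes, after the substitutions in prepare_template (with representation_id 'video-1'),
#     'video-1/seg-%(Number)05d.m4s'
# which can then be expanded with the % operator, e.g.
#     'video-1/seg-%(Number)05d.m4s' % {'Number': 7}  ==  'video-1/seg-00007.m4s'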
2885
2886 # @initialization is a regular template like the @media one
2887 # so it should be handled just the same way (see
2888 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2889 if 'initialization' in representation_ms_info:
2890 initialization_template = prepare_template(
2891 'initialization',
2892 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2893 # $Time$ shall not be included for @initialization thus
2894 # only $Bandwidth$ remains
2895 ('Bandwidth', ))
2896 representation_ms_info['initialization_url'] = initialization_template % {
2897 'Bandwidth': bandwidth,
2898 }
2899
2900 def location_key(location):
2901 return 'url' if re.match(r'^https?://', location) else 'path'
2902
2903 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2904
2905 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2906 media_location_key = location_key(media_template)
2907
2908 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2909 # can't be used at the same time
2910 if '%(Number' in media_template and 's' not in representation_ms_info:
2911 segment_duration = None
2912 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2913 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2914 representation_ms_info['total_number'] = int(math.ceil(
2915 float_or_none(period_duration, segment_duration, default=0)))
2916 representation_ms_info['fragments'] = [{
2917 media_location_key: media_template % {
2918 'Number': segment_number,
2919 'Bandwidth': bandwidth,
2920 },
2921 'duration': segment_duration,
2922 } for segment_number in range(
2923 representation_ms_info['start_number'],
2924 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2925 else:
2926 # $Number*$ or $Time$ in media template with S list available
2927 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2928 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2929 representation_ms_info['fragments'] = []
2930 segment_time = 0
2931 segment_d = None
2932 segment_number = representation_ms_info['start_number']
2933
2934 def add_segment_url():
2935 segment_url = media_template % {
2936 'Time': segment_time,
2937 'Bandwidth': bandwidth,
2938 'Number': segment_number,
2939 }
2940 representation_ms_info['fragments'].append({
2941 media_location_key: segment_url,
2942 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2943 })
2944
2945 for num, s in enumerate(representation_ms_info['s']):
2946 segment_time = s.get('t') or segment_time
2947 segment_d = s['d']
2948 add_segment_url()
2949 segment_number += 1
2950 for r in range(s.get('r', 0)):
2951 segment_time += segment_d
2952 add_segment_url()
2953 segment_number += 1
2954 segment_time += segment_d
2955 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2956 # No media template
2957 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2958 # or any YouTube dashsegments video
2959 fragments = []
2960 segment_index = 0
2961 timescale = representation_ms_info['timescale']
2962 for s in representation_ms_info['s']:
2963 duration = float_or_none(s['d'], timescale)
2964 for r in range(s.get('r', 0) + 1):
2965 segment_uri = representation_ms_info['segment_urls'][segment_index]
2966 fragments.append({
2967 location_key(segment_uri): segment_uri,
2968 'duration': duration,
2969 })
2970 segment_index += 1
2971 representation_ms_info['fragments'] = fragments
2972 elif 'segment_urls' in representation_ms_info:
2973 # Segment URLs with no SegmentTimeline
2974 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2975 # https://github.com/ytdl-org/youtube-dl/pull/14844
2976 fragments = []
2977 segment_duration = float_or_none(
2978 representation_ms_info['segment_duration'],
2979 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2980 for segment_url in representation_ms_info['segment_urls']:
2981 fragment = {
2982 location_key(segment_url): segment_url,
2983 }
2984 if segment_duration:
2985 fragment['duration'] = segment_duration
2986 fragments.append(fragment)
2987 representation_ms_info['fragments'] = fragments
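# A worked example of the SegmentTimeline expansion in the $Time$/$Number$ branch above
# (hypothetical values): with timescale=90000 and an S list of [{'t': 0, 'd': 180000, 'r': 2}],
# three fragments of 180000 / 90000 = 2.0 seconds each are generated (the initial segment
# plus r=2 repetitions), carrying $Time$ values 0, 180000 and 360000 respectively.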
2988 # If there is a fragments key available then we correctly recognized fragmented media.
2989 # Otherwise we will assume unfragmented media with direct access. Technically, such an
2990 # assumption is not necessarily correct since we may simply have no support for
2991 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2992 if 'fragments' in representation_ms_info:
2993 f.update({
2994 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2995 'url': mpd_url or base_url,
2996 'fragment_base_url': base_url,
2997 'fragments': [],
2998 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2999 })
3000 if 'initialization_url' in representation_ms_info:
3001 initialization_url = representation_ms_info['initialization_url']
3002 if not f.get('url'):
3003 f['url'] = initialization_url
3004 f['fragments'].append({location_key(initialization_url): initialization_url})
3005 f['fragments'].extend(representation_ms_info['fragments'])
3006 if not period_duration:
3007 period_duration = try_get(
3008 representation_ms_info,
3009 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3010 else:
3011 # Assuming direct URL to unfragmented media.
3012 f['url'] = base_url
3013 if content_type in ('video', 'audio', 'image/jpeg'):
3014 f['manifest_stream_number'] = stream_numbers[f['url']]
3015 stream_numbers[f['url']] += 1
3016 formats.append(f)
3017 elif content_type == 'text':
3018 subtitles.setdefault(lang or 'und', []).append(f)
3019
3020 return formats, subtitles
3021
3022 def _extract_ism_formats(self, *args, **kwargs):
3023 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3024 if subs:
3025 self._report_ignoring_subs('ISM')
3026 return fmts
3027
3028 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3029 res = self._download_xml_handle(
3030 ism_url, video_id,
3031 note='Downloading ISM manifest' if note is None else note,
3032 errnote='Failed to download ISM manifest' if errnote is None else errnote,
3033 fatal=fatal, data=data, headers=headers, query=query)
3034 if res is False:
3035 return [], {}
3036 ism_doc, urlh = res
3037 if ism_doc is None:
3038 return [], {}
3039
3040 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3041
3042 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3043 """
3044 Parse formats from ISM manifest.
3045 References:
3046 1. [MS-SSTR]: Smooth Streaming Protocol,
3047 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3048 """
3049 if ism_doc.get('IsLive') == 'TRUE':
3050 return [], {}
3051
3052 duration = int(ism_doc.attrib['Duration'])
3053 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3054
3055 formats = []
3056 subtitles = {}
3057 for stream in ism_doc.findall('StreamIndex'):
3058 stream_type = stream.get('Type')
3059 if stream_type not in ('video', 'audio', 'text'):
3060 continue
3061 url_pattern = stream.attrib['Url']
3062 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3063 stream_name = stream.get('Name')
3064 stream_language = stream.get('Language', 'und')
3065 for track in stream.findall('QualityLevel'):
3066 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3067 # TODO: add support for WVC1 and WMAP
3068 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3069 self.report_warning('%s is not a supported codec' % fourcc)
3070 continue
3071 tbr = int(track.attrib['Bitrate']) // 1000
3072 # [1] does not mention Width and Height attributes. However,
3073 # they're often present while MaxWidth and MaxHeight are
3074 # missing, so they should be used as fallbacks
3075 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3076 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3077 sampling_rate = int_or_none(track.get('SamplingRate'))
3078
3079 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3080 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3081
3082 fragments = []
3083 fragment_ctx = {
3084 'time': 0,
3085 }
3086 stream_fragments = stream.findall('c')
3087 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3088 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3089 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3090 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3091 if not fragment_ctx['duration']:
3092 try:
3093 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
3094 except IndexError:
3095 next_fragment_time = duration
3096 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3097 for _ in range(fragment_repeat):
3098 fragments.append({
3099 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3100 'duration': fragment_ctx['duration'] / stream_timescale,
3101 })
3102 fragment_ctx['time'] += fragment_ctx['duration']
3103
3104 if stream_type == 'text':
3105 subtitles.setdefault(stream_language, []).append({
3106 'ext': 'ismt',
3107 'protocol': 'ism',
3108 'url': ism_url,
3109 'manifest_url': ism_url,
3110 'fragments': fragments,
3111 '_download_params': {
3112 'stream_type': stream_type,
3113 'duration': duration,
3114 'timescale': stream_timescale,
3115 'fourcc': fourcc,
3116 'language': stream_language,
3117 'codec_private_data': track.get('CodecPrivateData'),
3118 }
3119 })
3120 elif stream_type in ('video', 'audio'):
3121 formats.append({
3122 'format_id': join_nonempty(ism_id, stream_name, tbr),
3123 'url': ism_url,
3124 'manifest_url': ism_url,
3125 'ext': 'ismv' if stream_type == 'video' else 'isma',
3126 'width': width,
3127 'height': height,
3128 'tbr': tbr,
3129 'asr': sampling_rate,
3130 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3131 'acodec': 'none' if stream_type == 'video' else fourcc,
3132 'protocol': 'ism',
3133 'fragments': fragments,
3134 'has_drm': ism_doc.find('Protection') is not None,
3135 '_download_params': {
3136 'stream_type': stream_type,
3137 'duration': duration,
3138 'timescale': stream_timescale,
3139 'width': width or 0,
3140 'height': height or 0,
3141 'fourcc': fourcc,
3142 'language': stream_language,
3143 'codec_private_data': track.get('CodecPrivateData'),
3144 'sampling_rate': sampling_rate,
3145 'channels': int_or_none(track.get('Channels', 2)),
3146 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3147 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3148 },
3149 })
3150 return formats, subtitles
3151
3152 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3153 def absolute_url(item_url):
3154 return urljoin(base_url, item_url)
3155
3156 def parse_content_type(content_type):
3157 if not content_type:
3158 return {}
3159 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3160 if ctr:
3161 mimetype, codecs = ctr.groups()
3162 f = parse_codecs(codecs)
3163 f['ext'] = mimetype2ext(mimetype)
3164 return f
3165 return {}
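# A rough illustration of parse_content_type (the exact keys come from parse_codecs):
# parse_content_type('video/mp4; codecs="avc1.4d401f, mp4a.40.2"') would yield
# something like {'vcodec': 'avc1.4d401f', 'acodec': 'mp4a.40.2', 'ext': 'mp4'}.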
3166
3167 def _media_formats(src, cur_media_type, type_info=None):
3168 type_info = type_info or {}
3169 full_url = absolute_url(src)
3170 ext = type_info.get('ext') or determine_ext(full_url)
3171 if ext == 'm3u8':
3172 is_plain_url = False
3173 formats = self._extract_m3u8_formats(
3174 full_url, video_id, ext='mp4',
3175 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3176 preference=preference, quality=quality, fatal=False)
3177 elif ext == 'mpd':
3178 is_plain_url = False
3179 formats = self._extract_mpd_formats(
3180 full_url, video_id, mpd_id=mpd_id, fatal=False)
3181 else:
3182 is_plain_url = True
3183 formats = [{
3184 'url': full_url,
3185 'vcodec': 'none' if cur_media_type == 'audio' else None,
3186 'ext': ext,
3187 }]
3188 return is_plain_url, formats
3189
3190 entries = []
3191 # amp-video and amp-audio are very similar to their HTML5 counterparts
3192 # so we will include them right here (see
3193 # https://www.ampproject.org/docs/reference/components/amp-video)
3194 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3195 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3196 media_tags = [(media_tag, media_tag_name, media_type, '')
3197 for media_tag, media_tag_name, media_type
3198 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3199 media_tags.extend(re.findall(
3200 # We only allow video|audio followed by a whitespace or '>'.
3201 # Allowing more characters may result in a significant slowdown (see
3202 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3203 # http://www.porntrex.com/maps/videositemap.xml).
3204 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3205 for media_tag, _, media_type, media_content in media_tags:
3206 media_info = {
3207 'formats': [],
3208 'subtitles': {},
3209 }
3210 media_attributes = extract_attributes(media_tag)
3211 src = strip_or_none(media_attributes.get('src'))
3212 if src:
3213 f = parse_content_type(media_attributes.get('type'))
3214 _, formats = _media_formats(src, media_type, f)
3215 media_info['formats'].extend(formats)
3216 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3217 if media_content:
3218 for source_tag in re.findall(r'<source[^>]+>', media_content):
3219 s_attr = extract_attributes(source_tag)
3220 # data-video-src and data-src are non-standard but seen
3221 # several times in the wild
3222 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3223 if not src:
3224 continue
3225 f = parse_content_type(s_attr.get('type'))
3226 is_plain_url, formats = _media_formats(src, media_type, f)
3227 if is_plain_url:
3228 # width, height, res, label and title attributes are
3229 # all non-standard but seen several times in the wild
3230 labels = [
3231 s_attr.get(lbl)
3232 for lbl in ('label', 'title')
3233 if str_or_none(s_attr.get(lbl))
3234 ]
3235 width = int_or_none(s_attr.get('width'))
3236 height = (int_or_none(s_attr.get('height'))
3237 or int_or_none(s_attr.get('res')))
3238 if not width or not height:
3239 for lbl in labels:
3240 resolution = parse_resolution(lbl)
3241 if not resolution:
3242 continue
3243 width = width or resolution.get('width')
3244 height = height or resolution.get('height')
3245 for lbl in labels:
3246 tbr = parse_bitrate(lbl)
3247 if tbr:
3248 break
3249 else:
3250 tbr = None
3251 f.update({
3252 'width': width,
3253 'height': height,
3254 'tbr': tbr,
3255 'format_id': s_attr.get('label') or s_attr.get('title'),
3256 })
3257 f.update(formats[0])
3258 media_info['formats'].append(f)
3259 else:
3260 media_info['formats'].extend(formats)
3261 for track_tag in re.findall(r'<track[^>]+>', media_content):
3262 track_attributes = extract_attributes(track_tag)
3263 kind = track_attributes.get('kind')
3264 if not kind or kind in ('subtitles', 'captions'):
3265 src = strip_or_none(track_attributes.get('src'))
3266 if not src:
3267 continue
3268 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3269 media_info['subtitles'].setdefault(lang, []).append({
3270 'url': absolute_url(src),
3271 })
3272 for f in media_info['formats']:
3273 f.setdefault('http_headers', {})['Referer'] = base_url
3274 if media_info['formats'] or media_info['subtitles']:
3275 entries.append(media_info)
3276 return entries
3277
3278 def _extract_akamai_formats(self, *args, **kwargs):
3279 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3280 if subs:
3281 self._report_ignoring_subs('akamai')
3282 return fmts
3283
3284 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3285 signed = 'hdnea=' in manifest_url
3286 if not signed:
3287 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3288 manifest_url = re.sub(
3289 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3290 '', manifest_url).strip('?')
3291
3292 formats = []
3293 subtitles = {}
3294
3295 hdcore_sign = 'hdcore=3.7.0'
3296 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3297 hds_host = hosts.get('hds')
3298 if hds_host:
3299 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3300 if 'hdcore=' not in f4m_url:
3301 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3302 f4m_formats = self._extract_f4m_formats(
3303 f4m_url, video_id, f4m_id='hds', fatal=False)
3304 for entry in f4m_formats:
3305 entry.update({'extra_param_to_segment_url': hdcore_sign})
3306 formats.extend(f4m_formats)
3307
3308 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3309 hls_host = hosts.get('hls')
3310 if hls_host:
3311 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3312 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3313 m3u8_url, video_id, 'mp4', 'm3u8_native',
3314 m3u8_id='hls', fatal=False)
3315 formats.extend(m3u8_formats)
3316 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3317
3318 http_host = hosts.get('http')
3319 if http_host and m3u8_formats and not signed:
3320 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3321 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3322 qualities_length = len(qualities)
3323 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3324 i = 0
3325 for f in m3u8_formats:
3326 if f['vcodec'] != 'none':
3327 for protocol in ('http', 'https'):
3328 http_f = f.copy()
3329 del http_f['manifest_url']
3330 http_url = re.sub(
3331 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3332 http_f.update({
3333 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3334 'url': http_url,
3335 'protocol': protocol,
3336 })
3337 formats.append(http_f)
3338 i += 1
3339
3340 return formats, subtitles
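# A sketch of the Akamai URL rewriting above (hypothetical host and path): an HLS manifest like
#     https://example-vh.akamaihd.net/i/videos/clip_,400,800,1200,.mp4.csmil/master.m3u8
# is mapped to the corresponding HDS manifest
#     https://example-vh.akamaihd.net/z/videos/clip_,400,800,1200,.mp4.csmil/manifest.f4m?hdcore=3.7.0
# and, when an 'http' host is supplied in hosts, each HLS video format is additionally
# duplicated as a plain progressive URL built from the comma-separated quality list in the path.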
3341
3342 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3343 query = compat_urlparse.urlparse(url).query
3344 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3345 mobj = re.search(
3346 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3347 url_base = mobj.group('url')
3348 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3349 formats = []
3350
3351 def manifest_url(manifest):
3352 m_url = f'{http_base_url}/{manifest}'
3353 if query:
3354 m_url += '?%s' % query
3355 return m_url
3356
3357 if 'm3u8' not in skip_protocols:
3358 formats.extend(self._extract_m3u8_formats(
3359 manifest_url('playlist.m3u8'), video_id, 'mp4',
3360 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3361 if 'f4m' not in skip_protocols:
3362 formats.extend(self._extract_f4m_formats(
3363 manifest_url('manifest.f4m'),
3364 video_id, f4m_id='hds', fatal=False))
3365 if 'dash' not in skip_protocols:
3366 formats.extend(self._extract_mpd_formats(
3367 manifest_url('manifest.mpd'),
3368 video_id, mpd_id='dash', fatal=False))
3369 if re.search(r'(?:/smil:|\.smil)', url_base):
3370 if 'smil' not in skip_protocols:
3371 rtmp_formats = self._extract_smil_formats(
3372 manifest_url('jwplayer.smil'),
3373 video_id, fatal=False)
3374 for rtmp_format in rtmp_formats:
3375 rtsp_format = rtmp_format.copy()
3376 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3377 del rtsp_format['play_path']
3378 del rtsp_format['ext']
3379 rtsp_format.update({
3380 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3381 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3382 'protocol': 'rtsp',
3383 })
3384 formats.extend([rtmp_format, rtsp_format])
3385 else:
3386 for protocol in ('rtmp', 'rtsp'):
3387 if protocol not in skip_protocols:
3388 formats.append({
3389 'url': f'{protocol}:{url_base}',
3390 'format_id': protocol,
3391 'protocol': protocol,
3392 })
3393 return formats
3394
3395 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3396 mobj = re.search(
3397 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3398 webpage)
3399 if mobj:
3400 try:
3401 jwplayer_data = self._parse_json(mobj.group('options'),
3402 video_id=video_id,
3403 transform_source=transform_source)
3404 except ExtractorError:
3405 pass
3406 else:
3407 if isinstance(jwplayer_data, dict):
3408 return jwplayer_data
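# A minimal page snippet that _find_jwplayer_data would pick up (hypothetical markup):
#     <script>
#         jwplayer("player").setup({"playlist": [{"sources": [{"file": "video.mp4"}]}]});
#     </script>
# The captured setup options are then parsed with _parse_json/js_to_json.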
3409
3410 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3411 jwplayer_data = self._find_jwplayer_data(
3412 webpage, video_id, transform_source=js_to_json)
3413 return self._parse_jwplayer_data(
3414 jwplayer_data, video_id, *args, **kwargs)
3415
3416 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3417 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3418 # JWPlayer backward compatibility: flattened playlists
3419 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3420 if 'playlist' not in jwplayer_data:
3421 jwplayer_data = {'playlist': [jwplayer_data]}
3422
3423 entries = []
3424
3425 # JWPlayer backward compatibility: single playlist item
3426 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3427 if not isinstance(jwplayer_data['playlist'], list):
3428 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3429
3430 for video_data in jwplayer_data['playlist']:
3431 # JWPlayer backward compatibility: flattened sources
3432 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3433 if 'sources' not in video_data:
3434 video_data['sources'] = [video_data]
3435
3436 this_video_id = video_id or video_data['mediaid']
3437
3438 formats = self._parse_jwplayer_formats(
3439 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3440 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3441
3442 subtitles = {}
3443 tracks = video_data.get('tracks')
3444 if tracks and isinstance(tracks, list):
3445 for track in tracks:
3446 if not isinstance(track, dict):
3447 continue
3448 track_kind = track.get('kind')
3449 if not track_kind or not isinstance(track_kind, compat_str):
3450 continue
3451 if track_kind.lower() not in ('captions', 'subtitles'):
3452 continue
3453 track_url = urljoin(base_url, track.get('file'))
3454 if not track_url:
3455 continue
3456 subtitles.setdefault(track.get('label') or 'en', []).append({
3457 'url': self._proto_relative_url(track_url)
3458 })
3459
3460 entry = {
3461 'id': this_video_id,
3462 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3463 'description': clean_html(video_data.get('description')),
3464 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3465 'timestamp': int_or_none(video_data.get('pubdate')),
3466 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3467 'subtitles': subtitles,
3468 }
3469 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3470 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3471 entry.update({
3472 '_type': 'url_transparent',
3473 'url': formats[0]['url'],
3474 })
3475 else:
3476 self._sort_formats(formats)
3477 entry['formats'] = formats
3478 entries.append(entry)
3479 if len(entries) == 1:
3480 return entries[0]
3481 else:
3482 return self.playlist_result(entries)
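# A minimal input accepted by _parse_jwplayer_data (hypothetical data), exercising the
# backward-compatibility shims above - a flattened, single-item configuration:
#     {'title': 'Example', 'mediaid': 'abc123',
#      'sources': [{'file': 'https://example.com/video.mp4', 'label': '720p'}]}
# It is normalised to {'playlist': [...]} and, being a single entry, yields one info dict
# rather than a playlist result.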
3483
3484 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3485 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3486 urls = []
3487 formats = []
3488 for source in jwplayer_sources_data:
3489 if not isinstance(source, dict):
3490 continue
3491 source_url = urljoin(
3492 base_url, self._proto_relative_url(source.get('file')))
3493 if not source_url or source_url in urls:
3494 continue
3495 urls.append(source_url)
3496 source_type = source.get('type') or ''
3497 ext = mimetype2ext(source_type) or determine_ext(source_url)
3498 if source_type == 'hls' or ext == 'm3u8':
3499 formats.extend(self._extract_m3u8_formats(
3500 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3501 m3u8_id=m3u8_id, fatal=False))
3502 elif source_type == 'dash' or ext == 'mpd':
3503 formats.extend(self._extract_mpd_formats(
3504 source_url, video_id, mpd_id=mpd_id, fatal=False))
3505 elif ext == 'smil':
3506 formats.extend(self._extract_smil_formats(
3507 source_url, video_id, fatal=False))
3508 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3509 elif source_type.startswith('audio') or ext in (
3510 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3511 formats.append({
3512 'url': source_url,
3513 'vcodec': 'none',
3514 'ext': ext,
3515 })
3516 else:
3517 height = int_or_none(source.get('height'))
3518 if height is None:
3519 # Often no height is provided, but there is a label in a
3520 # format like "1080p", "720p SD", or 1080.
3521 height = int_or_none(self._search_regex(
3522 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3523 'height', default=None))
3524 a_format = {
3525 'url': source_url,
3526 'width': int_or_none(source.get('width')),
3527 'height': height,
3528 'tbr': int_or_none(source.get('bitrate')),
3529 'ext': ext,
3530 }
3531 if source_url.startswith('rtmp'):
3532 a_format['ext'] = 'flv'
3533 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3534 # of jwplayer.flash.swf
3535 rtmp_url_parts = re.split(
3536 r'((?:mp4|mp3|flv):)', source_url, 1)
3537 if len(rtmp_url_parts) == 3:
3538 rtmp_url, prefix, play_path = rtmp_url_parts
3539 a_format.update({
3540 'url': rtmp_url,
3541 'play_path': prefix + play_path,
3542 })
3543 if rtmp_params:
3544 a_format.update(rtmp_params)
3545 formats.append(a_format)
3546 return formats
3547
3548 def _live_title(self, name):
3549 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3550 return name
3551
3552 def _int(self, v, name, fatal=False, **kwargs):
3553 res = int_or_none(v, **kwargs)
3554 if res is None:
3555 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3556 if fatal:
3557 raise ExtractorError(msg)
3558 else:
3559 self.report_warning(msg)
3560 return res
3561
3562 def _float(self, v, name, fatal=False, **kwargs):
3563 res = float_or_none(v, **kwargs)
3564 if res is None:
3565 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3566 if fatal:
3567 raise ExtractorError(msg)
3568 else:
3569 self.report_warning(msg)
3570 return res
3571
3572 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3573 path='/', secure=False, discard=False, rest={}, **kwargs):
3574 cookie = compat_cookiejar_Cookie(
3575 0, name, value, port, port is not None, domain, True,
3576 domain.startswith('.'), path, True, secure, expire_time,
3577 discard, None, None, rest)
3578 self._downloader.cookiejar.set_cookie(cookie)
3579
3580 def _get_cookies(self, url):
3581 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3582 return compat_cookies_SimpleCookie(self._downloader._calc_cookies(url))
3583
3584 def _apply_first_set_cookie_header(self, url_handle, cookie):
3585 """
3586 Apply first Set-Cookie header instead of the last. Experimental.
3587
3588 Some sites (e.g. [1-3]) may serve two cookies under the same name
3589 in the Set-Cookie header and expect the first (old) one to be set rather
3590 than the second (new) one. However, as per RFC 6265 the newer cookie
3591 is the one that gets stored in the cookie jar, which is what actually happens.
3592 We work around this issue by manually resetting the cookie to
3593 the first one.
3594 1. https://new.vk.com/
3595 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3596 3. https://learning.oreilly.com/
3597 """
3598 for header, cookies in url_handle.headers.items():
3599 if header.lower() != 'set-cookie':
3600 continue
3601 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3602 cookie_value = re.search(
3603 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3604 if cookie_value:
3605 value, domain = cookie_value.groups()
3606 self._set_cookie(domain, cookie, value)
3607 break
3608
3609 @classmethod
3610 def get_testcases(cls, include_onlymatching=False):
3611 t = getattr(cls, '_TEST', None)
3612 if t:
3613 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3614 tests = [t]
3615 else:
3616 tests = getattr(cls, '_TESTS', [])
3617 for t in tests:
3618 if not include_onlymatching and t.get('only_matching', False):
3619 continue
3620 t['name'] = cls.ie_key()
3621 yield t
3622
3623 @classproperty
3624 def age_limit(cls):
3625 """Get age limit from the testcases"""
3626 return max(traverse_obj(
3627 tuple(cls.get_testcases(include_onlymatching=False)),
3628 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3629
3630 @classmethod
3631 def is_suitable(cls, age_limit):
3632 """Test whether the extractor is generally suitable for the given age limit"""
3633 return not age_restricted(cls.age_limit, age_limit)
3634
3635 @classmethod
3636 def description(cls, *, markdown=True, search_examples=None):
3637 """Description of the extractor"""
3638 desc = ''
3639 if cls._NETRC_MACHINE:
3640 if markdown:
3641 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3642 else:
3643 desc += f' [{cls._NETRC_MACHINE}]'
3644 if cls.IE_DESC is False:
3645 desc += ' [HIDDEN]'
3646 elif cls.IE_DESC:
3647 desc += f' {cls.IE_DESC}'
3648 if cls.SEARCH_KEY:
3649 desc += f'; "{cls.SEARCH_KEY}:" prefix'
3650 if search_examples:
3651 _COUNTS = ('', '5', '10', 'all')
3652 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3653 if not cls.working():
3654 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3655
3656 name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3657 return f'{name}:{desc}' if desc else name
3658
3659 def extract_subtitles(self, *args, **kwargs):
3660 if (self.get_param('writesubtitles', False)
3661 or self.get_param('listsubtitles')):
3662 return self._get_subtitles(*args, **kwargs)
3663 return {}
3664
3665 def _get_subtitles(self, *args, **kwargs):
3666 raise NotImplementedError('This method must be implemented by subclasses')
3667
3668 def extract_comments(self, *args, **kwargs):
3669 if not self.get_param('getcomments'):
3670 return None
3671 generator = self._get_comments(*args, **kwargs)
3672
3673 def extractor():
3674 comments = []
3675 interrupted = True
3676 try:
3677 while True:
3678 comments.append(next(generator))
3679 except StopIteration:
3680 interrupted = False
3681 except KeyboardInterrupt:
3682 self.to_screen('Interrupted by user')
3683 except Exception as e:
3684 if self.get_param('ignoreerrors') is not True:
3685 raise
3686 self._downloader.report_error(e)
3687 comment_count = len(comments)
3688 self.to_screen(f'Extracted {comment_count} comments')
3689 return {
3690 'comments': comments,
3691 'comment_count': None if interrupted else comment_count
3692 }
3693 return extractor
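# A minimal sketch of how a subclass is expected to plug into extract_comments
# (hypothetical extractor and endpoint; the base class only requires a generator):
#
#     class ExampleIE(InfoExtractor):
#         def _get_comments(self, video_id):
#             for page in itertools.count(1):
#                 data = self._download_json(
#                     f'https://example.com/api/comments/{video_id}?page={page}', video_id)
#                 yield from data.get('comments') or []
#                 if not data.get('has_more'):
#                     break
#
# extract_comments wraps the generator so that the comments collected so far are still
# returned if extraction is interrupted.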
3694
3695 def _get_comments(self, *args, **kwargs):
3696 raise NotImplementedError('This method must be implemented by subclasses')
3697
3698 @staticmethod
3699 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3700 """ Merge subtitle items for one language. Items with duplicated URLs/data
3701 will be dropped. """
3702 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3703 ret = list(subtitle_list1)
3704 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3705 return ret
3706
3707 @classmethod
3708 def _merge_subtitles(cls, *dicts, target=None):
3709 """ Merge subtitle dictionaries, language by language. """
3710 if target is None:
3711 target = {}
3712 for d in dicts:
3713 for lang, subs in d.items():
3714 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3715 return target
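# An example of the merge behaviour (hypothetical URLs): merging
#     {'en': [{'url': 'https://example.com/a.vtt'}]}
# with
#     {'en': [{'url': 'https://example.com/a.vtt'}, {'url': 'https://example.com/b.vtt'}],
#      'de': [{'url': 'https://example.com/c.vtt'}]}
# keeps a single copy of a.vtt under 'en', appends b.vtt, and adds the 'de' entries unchanged.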
3716
3717 def extract_automatic_captions(self, *args, **kwargs):
3718 if (self.get_param('writeautomaticsub', False)
3719 or self.get_param('listsubtitles')):
3720 return self._get_automatic_captions(*args, **kwargs)
3721 return {}
3722
3723 def _get_automatic_captions(self, *args, **kwargs):
3724 raise NotImplementedError('This method must be implemented by subclasses')
3725
3726 @functools.cached_property
3727 def _cookies_passed(self):
3728 """Whether cookies have been passed to YoutubeDL"""
3729 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3730
3731 def mark_watched(self, *args, **kwargs):
3732 if not self.get_param('mark_watched', False):
3733 return
3734 if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3735 self._mark_watched(*args, **kwargs)
3736
3737 def _mark_watched(self, *args, **kwargs):
3738 raise NotImplementedError('This method must be implemented by subclasses')
3739
3740 def geo_verification_headers(self):
3741 headers = {}
3742 geo_verification_proxy = self.get_param('geo_verification_proxy')
3743 if geo_verification_proxy:
3744 headers['Ytdl-request-proxy'] = geo_verification_proxy
3745 return headers
3746
3747 def _generic_id(self, url):
3748 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3749
3750 def _generic_title(self, url):
3751 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3752
3753 @staticmethod
3754 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3755 all_known = all(map(
3756 lambda x: x is not None,
3757 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3758 return (
3759 'private' if is_private
3760 else 'premium_only' if needs_premium
3761 else 'subscriber_only' if needs_subscription
3762 else 'needs_auth' if needs_auth
3763 else 'unlisted' if is_unlisted
3764 else 'public' if all_known
3765 else None)
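# For instance, _availability(is_private=False, needs_premium=False, needs_subscription=False,
# needs_auth=False, is_unlisted=True) returns 'unlisted', while calling it with every argument
# left as None returns None, since the availability cannot be determined.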
3766
3767 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3768 '''
3769 @returns A list of values for the extractor argument given by "key"
3770 or "default" if no such key is present
3771 @param default The default value to return when the key is not present (default: [])
3772 @param casesense When false, the values are converted to lower case
3773 '''
3774 val = traverse_obj(
3775 self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3776 if val is None:
3777 return [] if default is NO_DEFAULT else default
3778 return list(val) if casesense else [x.lower() for x in val]
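# For example, with the command-line option
#     --extractor-args "youtube:player_client=android,web"
# a call such as self._configuration_arg('player_client', ie_key='youtube') would return
# ['android', 'web'] (values are lower-cased unless casesense=True).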
3779
3780 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3781 if not playlist_id or not video_id:
3782 return not video_id
3783
3784 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3785 if no_playlist is not None:
3786 return not no_playlist
3787
3788 video_id = '' if video_id is True else f' {video_id}'
3789 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3790 if self.get_param('noplaylist'):
3791 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3792 return False
3793 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3794 return True
3795
3796
3797 class SearchInfoExtractor(InfoExtractor):
3798 """
3799 Base class for paged search queries extractors.
3800 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3801 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3802 """
3803
3804 _MAX_RESULTS = float('inf')
3805
3806 @classmethod
3807 def _make_valid_url(cls):
3808 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3809
3810 def _real_extract(self, query):
3811 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3812 if prefix == '':
3813 return self._get_n_results(query, 1)
3814 elif prefix == 'all':
3815 return self._get_n_results(query, self._MAX_RESULTS)
3816 else:
3817 n = int(prefix)
3818 if n <= 0:
3819 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3820 elif n > self._MAX_RESULTS:
3821 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3822 n = self._MAX_RESULTS
3823 return self._get_n_results(query, n)
3824
3825 def _get_n_results(self, query, n):
3826 """Get a specified number of results for a query.
3827 Either this function or _search_results must be overridden by subclasses """
3828 return self.playlist_result(
3829 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3830 query, query)
3831
3832 def _search_results(self, query):
3833 """Returns an iterator of search results"""
3834 raise NotImplementedError('This method must be implemented by subclasses')
3835
3836 @classproperty
3837 def SEARCH_KEY(cls):
3838 return cls._SEARCH_KEY
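# A minimal sketch of a SearchInfoExtractor subclass (hypothetical site and API), showing the
# members a search extractor typically defines - _SEARCH_KEY, _MAX_RESULTS and _search_results:
#
#     class ExampleSearchIE(SearchInfoExtractor):
#         _SEARCH_KEY = 'examplesearch'
#         _MAX_RESULTS = 100
#
#         def _search_results(self, query):
#             data = self._download_json(
#                 'https://example.com/api/search', query, query={'q': query})
#             for result in data.get('results') or []:
#                 yield self.url_result(result['url'], video_id=result.get('id'))
#
# A URL like 'examplesearch5:some query' would then yield the first five results, while
# 'examplesearchall:some query' yields up to _MAX_RESULTS.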