yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import itertools
   9 import json
  10 import math
  11 import netrc
  12 import os
  13 import random
  14 import sys
  15 import time
  16 import urllib.parse
  17 import urllib.request
  18 import xml.etree.ElementTree
  19
  20 from ..compat import functools, re  # isort: split
  21 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  22 from ..downloader import FileDownloader
  23 from ..downloader.f4m import get_base_url, remove_encrypted_media
  24 from ..utils import (
  25     JSON_LD_RE,
  26     NO_DEFAULT,
  27     ExtractorError,
  28     GeoRestrictedError,
  29     GeoUtils,
  30     LenientJSONDecoder,
  31     RegexNotFoundError,
  32     UnsupportedError,
  33     age_restricted,
  34     base_url,
  35     bug_reports_message,
  36     classproperty,
  37     clean_html,
  38     determine_ext,
  39     determine_protocol,
  40     dict_get,
  41     encode_data_uri,
  42     error_to_compat_str,
  43     extract_attributes,
  44     filter_dict,
  45     fix_xml_ampersands,
  46     float_or_none,
  47     format_field,
  48     int_or_none,
  49     join_nonempty,
  50     js_to_json,
  51     mimetype2ext,
  52     network_exceptions,
  53     orderedSet,
  54     parse_bitrate,
  55     parse_codecs,
  56     parse_duration,
  57     parse_iso8601,
  58     parse_m3u8_attributes,
  59     parse_resolution,
  60     sanitize_filename,
  61     sanitized_Request,
  62     str_or_none,
  63     str_to_int,
  64     strip_or_none,
  65     traverse_obj,
  66     try_get,
  67     unescapeHTML,
  68     unified_strdate,
  69     unified_timestamp,
  70     update_Request,
  71     update_url_query,
  72     url_basename,
  73     url_or_none,
  74     urljoin,
  75     variadic,
  76     xpath_element,
  77     xpath_text,
  78     xpath_with_ns,
  79 )
  80
  81
  82 class InfoExtractor:
  83     """Information Extractor class.
  84
  85     Information extractors are the classes that, given a URL, extract
  86     information about the video (or videos) the URL refers to. This
  87     information includes the real video URL, the video title, author and
  88     others. The information is stored in a dictionary which is then
  89     passed to the YoutubeDL. The YoutubeDL processes this
  90     information possibly downloading the video to the file system, among
  91     other possible outcomes.
  92
  93     The type field determines the type of the result.
  94     By far the most common value (and the default if _type is missing) is
  95     "video", which indicates a single video.
  96
  97     For a video, the dictionaries must include the following fields:
  98
  99     id:             Video identifier.
 100     title:          Video title, unescaped. Set to an empty string if video has
 101                     no title as opposed to "None" which signifies that the
 102                     extractor failed to obtain a title
 103
 104     Additionally, it must contain either a formats entry or a url one:
 105
 106     formats:        A list of dictionaries for each format available, ordered
 107                     from worst to best quality.
 108
 109                     Potential fields:
 110                     * url        The mandatory URL representing the media:
 111                                    for plain file media - HTTP URL of this file,
 112                                    for RTMP - RTMP URL,
 113                                    for HLS - URL of the M3U8 media playlist,
 114                                    for HDS - URL of the F4M manifest,
 115                                    for DASH
 116                                      - HTTP URL to plain file media (in case of
 117                                        unfragmented media)
 118                                      - URL of the MPD manifest or base URL
 119                                        representing the media if MPD manifest
 120                                        is parsed from a string (in case of
 121                                        fragmented media)
 122                                    for MSS - URL of the ISM manifest.
 123                     * manifest_url
 124                                  The URL of the manifest file in case of
 125                                  fragmented media:
 126                                    for HLS - URL of the M3U8 master playlist,
 127                                    for HDS - URL of the F4M manifest,
 128                                    for DASH - URL of the MPD manifest,
 129                                    for MSS - URL of the ISM manifest.
 130                     * manifest_stream_number  (For internal use only)
 131                                  The index of the stream in the manifest file
 132                     * ext        Will be calculated from URL if missing
 133                     * format     A human-readable description of the format
 134                                  ("mp4 container with h264/opus").
 135                                  Calculated from the format_id, width, height.
 136                                  and format_note fields if missing.
 137                     * format_id  A short description of the format
 138                                  ("mp4_h264_opus" or "19").
 139                                 Technically optional, but strongly recommended.
 140                     * format_note Additional info about the format
 141                                  ("3D" or "DASH video")
 142                     * width      Width of the video, if known
 143                     * height     Height of the video, if known
 144                     * resolution Textual description of width and height
 145                     * dynamic_range The dynamic range of the video. One of:
 146                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 147                     * tbr        Average bitrate of audio and video in KBit/s
 148                     * abr        Average audio bitrate in KBit/s
 149                     * acodec     Name of the audio codec in use
 150                     * asr        Audio sampling rate in Hertz
 151                     * vbr        Average video bitrate in KBit/s
 152                     * fps        Frame rate
 153                     * vcodec     Name of the video codec in use
 154                     * container  Name of the container format
 155                     * filesize   The number of bytes, if known in advance
 156                     * filesize_approx  An estimate for the number of bytes
 157                     * player_url SWF Player URL (used for rtmpdump).
 158                     * protocol   The protocol that will be used for the actual
 159                                  download, lower-case. One of "http", "https" or
 160                                  one of the protocols defined in downloader.PROTOCOL_MAP
 161                     * fragment_base_url
 162                                  Base URL for fragments. Each fragment's path
 163                                  value (if present) will be relative to
 164                                  this URL.
 165                     * fragments  A list of fragments of a fragmented media.
 166                                  Each fragment entry must contain either an url
 167                                  or a path. If an url is present it should be
 168                                  considered by a client. Otherwise both path and
 169                                  fragment_base_url must be present. Here is
 170                                  the list of all potential fields:
 171                                  * "url" - fragment's URL
 172                                  * "path" - fragment's path relative to
 173                                             fragment_base_url
 174                                  * "duration" (optional, int or float)
 175                                  * "filesize" (optional, int)
 176                     * is_from_start  Is a live format that can be downloaded
 177                                 from the start. Boolean
 178                     * preference Order number of this format. If this field is
 179                                  present and not None, the formats get sorted
 180                                  by this field, regardless of all other values.
 181                                  -1 for default (order by other properties),
 182                                  -2 or smaller for less than default.
 183                                  < -1000 to hide the format (if there is
 184                                     another one which is strictly better)
 185                     * language   Language code, e.g. "de" or "en-US".
 186                     * language_preference  Is this in the language mentioned in
 187                                  the URL?
 188                                  10 if it's what the URL is about,
 189                                  -1 for default (don't know),
 190                                  -10 otherwise, other values reserved for now.
 191                     * quality    Order number of the video quality of this
 192                                  format, irrespective of the file format.
 193                                  -1 for default (order by other properties),
 194                                  -2 or smaller for less than default.
 195                     * source_preference  Order number for this video source
 196                                   (quality takes higher priority)
 197                                  -1 for default (order by other properties),
 198                                  -2 or smaller for less than default.
 199                     * http_headers  A dictionary of additional HTTP headers
 200                                  to add to the request.
 201                     * stretched_ratio  If given and not 1, indicates that the
 202                                  video's pixels are not square.
 203                                  width : height ratio as float.
 204                     * no_resume  The server does not support resuming the
 205                                  (HTTP or RTMP) download. Boolean.
 206                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 207                     * downloader_options  A dictionary of downloader options
 208                                  (For internal use only)
 209                                  * http_chunk_size Chunk size for HTTP downloads
 210                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 211                     RTMP formats can also have the additional fields: page_url,
 212                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 213                     rtmp_protocol, rtmp_real_time
 214
 215     url:            Final video URL.
 216     ext:            Video filename extension.
 217     format:         The video format, defaults to ext (used for --get-format)
 218     player_url:     SWF Player URL (used for rtmpdump).
 219
 220     The following fields are optional:
 221
 222     direct:         True if a direct video file was given (must only be set by GenericIE)
 223     alt_title:      A secondary title of the video.
 224     display_id      An alternative identifier for the video, not necessarily
 225                     unique, but available before title. Typically, id is
 226                     something like "4234987", title "Dancing naked mole rats",
 227                     and display_id "dancing-naked-mole-rats"
 228     thumbnails:     A list of dictionaries, with the following entries:
 229                         * "id" (optional, string) - Thumbnail format ID
 230                         * "url"
 231                         * "preference" (optional, int) - quality of the image
 232                         * "width" (optional, int)
 233                         * "height" (optional, int)
 234                         * "resolution" (optional, string "{width}x{height}",
 235                                         deprecated)
 236                         * "filesize" (optional, int)
 237                         * "http_headers" (dict) - HTTP headers for the request
 238     thumbnail:      Full URL to a video thumbnail image.
 239     description:    Full video description.
 240     uploader:       Full name of the video uploader.
 241     license:        License name the video is licensed under.
 242     creator:        The creator of the video.
 243     timestamp:      UNIX timestamp of the moment the video was uploaded
 244     upload_date:    Video upload date in UTC (YYYYMMDD).
 245                     If not explicitly set, calculated from timestamp
 246     release_timestamp: UNIX timestamp of the moment the video was released.
 247                     If it is not clear whether to use timestamp or this, use the former
 248     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 249                     If not explicitly set, calculated from release_timestamp
 250     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 251     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 252                     If not explicitly set, calculated from modified_timestamp
 253     uploader_id:    Nickname or id of the video uploader.
 254     uploader_url:   Full URL to a personal webpage of the video uploader.
 255     channel:        Full name of the channel the video is uploaded on.
 256                     Note that channel fields may or may not repeat uploader
 257                     fields. This depends on a particular extractor.
 258     channel_id:     Id of the channel.
 259     channel_url:    Full URL to a channel webpage.
 260     channel_follower_count: Number of followers of the channel.
 261     location:       Physical location where the video was filmed.
 262     subtitles:      The available subtitles as a dictionary in the format
 263                     {tag: subformats}. "tag" is usually a language code, and
 264                     "subformats" is a list sorted from lower to higher
 265                     preference, each element is a dictionary with the "ext"
 266                     entry and one of:
 267                         * "data": The subtitles file contents
 268                         * "url": A URL pointing to the subtitles file
 269                     It can optionally also have:
 270                         * "name": Name or description of the subtitles
 271                         * "http_headers": A dictionary of additional HTTP headers
 272                                   to add to the request.
 273                     "ext" will be calculated from URL if missing
 274     automatic_captions: Like 'subtitles'; contains automatically generated
 275                     captions instead of normal subtitles
 276     duration:       Length of the video in seconds, as an integer or float.
 277     view_count:     How many users have watched the video on the platform.
 278     like_count:     Number of positive ratings of the video
 279     dislike_count:  Number of negative ratings of the video
 280     repost_count:   Number of reposts of the video
 281     average_rating: Average rating give by users, the scale used depends on the webpage
 282     comment_count:  Number of comments on the video
 283     comments:       A list of comments, each with one or more of the following
 284                     properties (all but one of text or html optional):
 285                         * "author" - human-readable name of the comment author
 286                         * "author_id" - user ID of the comment author
 287                         * "author_thumbnail" - The thumbnail of the comment author
 288                         * "id" - Comment ID
 289                         * "html" - Comment as HTML
 290                         * "text" - Plain text of the comment
 291                         * "timestamp" - UNIX timestamp of comment
 292                         * "parent" - ID of the comment this one is replying to.
 293                                      Set to "root" to indicate that this is a
 294                                      comment to the original video.
 295                         * "like_count" - Number of positive ratings of the comment
 296                         * "dislike_count" - Number of negative ratings of the comment
 297                         * "is_favorited" - Whether the comment is marked as
 298                                            favorite by the video uploader
 299                         * "author_is_uploader" - Whether the comment is made by
 300                                                  the video uploader
 301     age_limit:      Age restriction for the video, as an integer (years)
 302     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 303                     should allow to get the same result again. (It will be set
 304                     by YoutubeDL if it's missing)
 305     categories:     A list of categories that the video falls in, for example
 306                     ["Sports", "Berlin"]
 307     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 308     cast:           A list of the video cast
 309     is_live:        True, False, or None (=unknown). Whether this video is a
 310                     live stream that goes on instead of a fixed-length video.
 311     was_live:       True, False, or None (=unknown). Whether this video was
 312                     originally a live stream.
 313     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 314                     If absent, automatically set from is_live, was_live
 315     start_time:     Time in seconds where the reproduction should start, as
 316                     specified in the URL.
 317     end_time:       Time in seconds where the reproduction should end, as
 318                     specified in the URL.
 319     chapters:       A list of dictionaries, with the following entries:
 320                         * "start_time" - The start time of the chapter in seconds
 321                         * "end_time" - The end time of the chapter in seconds
 322                         * "title" (optional, string)
 323     playable_in_embed: Whether this video is allowed to play in embedded
 324                     players on other sites. Can be True (=always allowed),
 325                     False (=never allowed), None (=unknown), or a string
 326                     specifying the criteria for embedability (Eg: 'whitelist')
 327     availability:   Under what condition the video is available. One of
 328                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 329                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 330                     to set it
 331     __post_extractor: A function to be called just before the metadata is
 332                     written to either disk, logger or console. The function
 333                     must return a dict which will be added to the info_dict.
 334                     This is usefull for additional information that is
 335                     time-consuming to extract. Note that the fields thus
 336                     extracted will not be available to output template and
 337                     match_filter. So, only "comments" and "comment_count" are
 338                     currently allowed to be extracted via this method.
 339
 340     The following fields should only be used when the video belongs to some logical
 341     chapter or section:
 342
 343     chapter:        Name or title of the chapter the video belongs to.
 344     chapter_number: Number of the chapter the video belongs to, as an integer.
 345     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 346
 347     The following fields should only be used when the video is an episode of some
 348     series, programme or podcast:
 349
 350     series:         Title of the series or programme the video episode belongs to.
 351     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 352     season:         Title of the season the video episode belongs to.
 353     season_number:  Number of the season the video episode belongs to, as an integer.
 354     season_id:      Id of the season the video episode belongs to, as a unicode string.
 355     episode:        Title of the video episode. Unlike mandatory video title field,
 356                     this field should denote the exact title of the video episode
 357                     without any kind of decoration.
 358     episode_number: Number of the video episode within a season, as an integer.
 359     episode_id:     Id of the video episode, as a unicode string.
 360
 361     The following fields should only be used when the media is a track or a part of
 362     a music album:
 363
 364     track:          Title of the track.
 365     track_number:   Number of the track within an album or a disc, as an integer.
 366     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 367                     as a unicode string.
 368     artist:         Artist(s) of the track.
 369     genre:          Genre(s) of the track.
 370     album:          Title of the album the track belongs to.
 371     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 372     album_artist:   List of all artists appeared on the album (e.g.
 373                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 374                     and compilations).
 375     disc_number:    Number of the disc or other physical medium the track belongs to,
 376                     as an integer.
 377     release_year:   Year (YYYY) when the album was released.
 378     composer:       Composer of the piece
 379
 380     The following fields should only be set for clips that should be cut from the original video:
 381
 382     section_start:  Start time of the section in seconds
 383     section_end:    End time of the section in seconds
 384
 385     Unless mentioned otherwise, the fields should be Unicode strings.
 386
 387     Unless mentioned otherwise, None is equivalent to absence of information.
 388
 389
 390     _type "playlist" indicates multiple videos.
 391     There must be a key "entries", which is a list, an iterable, or a PagedList
 392     object, each element of which is a valid dictionary by this specification.
 393
 394     Additionally, playlists can have "id", "title", and any other relevent
 395     attributes with the same semantics as videos (see above).
 396
 397     It can also have the following optional fields:
 398
 399     playlist_count: The total number of videos in a playlist. If not given,
 400                     YoutubeDL tries to calculate it from "entries"
 401
 402
 403     _type "multi_video" indicates that there are multiple videos that
 404     form a single show, for examples multiple acts of an opera or TV episode.
 405     It must have an entries key like a playlist and contain all the keys
 406     required for a video at the same time.
 407
 408
 409     _type "url" indicates that the video must be extracted from another
 410     location, possibly by a different extractor. Its only required key is:
 411     "url" - the next URL to extract.
 412     The key "ie_key" can be set to the class name (minus the trailing "IE",
 413     e.g. "Youtube") if the extractor class is known in advance.
 414     Additionally, the dictionary may have any properties of the resolved entity
 415     known in advance, for example "title" if the title of the referred video is
 416     known ahead of time.
 417
 418
 419     _type "url_transparent" entities have the same specification as "url", but
 420     indicate that the given additional information is more precise than the one
 421     associated with the resolved URL.
 422     This is useful when a site employs a video service that hosts the video and
 423     its technical metadata, but that video service does not embed a useful
 424     title, description etc.
 425
 426
 427     Subclasses of this should define a _VALID_URL regexp and, re-define the
 428     _real_extract() and (optionally) _real_initialize() methods.
 429     Probably, they should also be added to the list of extractors.
 430
 431     Subclasses may also override suitable() if necessary, but ensure the function
 432     signature is preserved and that this function imports everything it needs
 433     (except other extractors), so that lazy_extractors works correctly.
 434
 435     To support username + password (or netrc) login, the extractor must define a
 436     _NETRC_MACHINE and re-define _perform_login(username, password) and
 437     (optionally) _initialize_pre_login() methods. The _perform_login method will
 438     be called between _initialize_pre_login and _real_initialize if credentials
 439     are passed by the user. In cases where it is necessary to have the login
 440     process as part of the extraction rather than initialization, _perform_login
 441     can be left undefined.
 442
 443     _GEO_BYPASS attribute may be set to False in order to disable
 444     geo restriction bypass mechanisms for a particular extractor.
 445     Though it won't disable explicit geo restriction bypass based on
 446     country code provided with geo_bypass_country.
 447
 448     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 449     countries for this extractor. One of these countries will be used by
 450     geo restriction bypass mechanism right away in order to bypass
 451     geo restriction, of course, if the mechanism is not disabled.
 452
 453     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 454     IP blocks in CIDR notation for this extractor. One of these IP blocks
 455     will be used by geo restriction bypass mechanism similarly
 456     to _GEO_COUNTRIES.
 457
 458     The _WORKING attribute should be set to False for broken IEs
 459     in order to warn the users and skip the tests.
 460     """
 461
 462     _ready = False
 463     _downloader = None
 464     _x_forwarded_for_ip = None
 465     _GEO_BYPASS = True
 466     _GEO_COUNTRIES = None
 467     _GEO_IP_BLOCKS = None
 468     _WORKING = True
 469     _NETRC_MACHINE = None
 470     IE_DESC = None
 471     SEARCH_KEY = None
 472
 473     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 474         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 475         return {
 476             None: '',
 477             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 478             'password': f'Use {password_hint}',
 479             'cookies': (
 480                 'Use --cookies-from-browser or --cookies for the authentication. '
 481                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 482         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 483
 484     def __init__(self, downloader=None):
 485         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 486         If a downloader is not passed during initialization,
 487         it must be set using "set_downloader()" before "extract()" is called"""
 488         self._ready = False
 489         self._x_forwarded_for_ip = None
 490         self._printed_messages = set()
 491         self.set_downloader(downloader)
 492
 493     @classmethod
 494     def _match_valid_url(cls, url):
 495         # This does not use has/getattr intentionally - we want to know whether
 496         # we have cached the regexp for *this* class, whereas getattr would also
 497         # match the superclass
 498         if '_VALID_URL_RE' not in cls.__dict__:
 499             if '_VALID_URL' not in cls.__dict__:
 500                 cls._VALID_URL = cls._make_valid_url()
 501             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 502         return cls._VALID_URL_RE.match(url)
 503
 504     @classmethod
 505     def suitable(cls, url):
 506         """Receives a URL and returns True if suitable for this IE."""
 507         # This function must import everything it needs (except other extractors),
 508         # so that lazy_extractors works correctly
 509         return cls._match_valid_url(url) is not None
 510
 511     @classmethod
 512     def _match_id(cls, url):
 513         return cls._match_valid_url(url).group('id')
 514
 515     @classmethod
 516     def get_temp_id(cls, url):
 517         try:
 518             return cls._match_id(url)
 519         except (IndexError, AttributeError):
 520             return None
 521
 522     @classmethod
 523     def working(cls):
 524         """Getter method for _WORKING."""
 525         return cls._WORKING
 526
 527     @classmethod
 528     def supports_login(cls):
 529         return bool(cls._NETRC_MACHINE)
 530
 531     def initialize(self):
 532         """Initializes an instance (authentication, etc)."""
 533         self._printed_messages = set()
 534         self._initialize_geo_bypass({
 535             'countries': self._GEO_COUNTRIES,
 536             'ip_blocks': self._GEO_IP_BLOCKS,
 537         })
 538         if not self._ready:
 539             self._initialize_pre_login()
 540             if self.supports_login():
 541                 username, password = self._get_login_info()
 542                 if username:
 543                     self._perform_login(username, password)
 544             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 545                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 546             self._real_initialize()
 547             self._ready = True
 548
 549     def _initialize_geo_bypass(self, geo_bypass_context):
 550         """
 551         Initialize geo restriction bypass mechanism.
 552
 553         This method is used to initialize geo bypass mechanism based on faking
 554         X-Forwarded-For HTTP header. A random country from provided country list
 555         is selected and a random IP belonging to this country is generated. This
 556         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 557         HTTP requests.
 558
 559         This method will be used for initial geo bypass mechanism initialization
 560         during the instance initialization with _GEO_COUNTRIES and
 561         _GEO_IP_BLOCKS.
 562
 563         You may also manually call it from extractor's code if geo bypass
 564         information is not available beforehand (e.g. obtained during
 565         extraction) or due to some other reason. In this case you should pass
 566         this information in geo bypass context passed as first argument. It may
 567         contain following fields:
 568
 569         countries:  List of geo unrestricted countries (similar
 570                     to _GEO_COUNTRIES)
 571         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 572                     (similar to _GEO_IP_BLOCKS)
 573
 574         """
 575         if not self._x_forwarded_for_ip:
 576
 577             # Geo bypass mechanism is explicitly disabled by user
 578             if not self.get_param('geo_bypass', True):
 579                 return
 580
 581             if not geo_bypass_context:
 582                 geo_bypass_context = {}
 583
 584             # Backward compatibility: previously _initialize_geo_bypass
 585             # expected a list of countries, some 3rd party code may still use
 586             # it this way
 587             if isinstance(geo_bypass_context, (list, tuple)):
 588                 geo_bypass_context = {
 589                     'countries': geo_bypass_context,
 590                 }
 591
 592             # The whole point of geo bypass mechanism is to fake IP
 593             # as X-Forwarded-For HTTP header based on some IP block or
 594             # country code.
 595
 596             # Path 1: bypassing based on IP block in CIDR notation
 597
 598             # Explicit IP block specified by user, use it right away
 599             # regardless of whether extractor is geo bypassable or not
 600             ip_block = self.get_param('geo_bypass_ip_block', None)
 601
 602             # Otherwise use random IP block from geo bypass context but only
 603             # if extractor is known as geo bypassable
 604             if not ip_block:
 605                 ip_blocks = geo_bypass_context.get('ip_blocks')
 606                 if self._GEO_BYPASS and ip_blocks:
 607                     ip_block = random.choice(ip_blocks)
 608
 609             if ip_block:
 610                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 611                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 612                 return
 613
 614             # Path 2: bypassing based on country code
 615
 616             # Explicit country code specified by user, use it right away
 617             # regardless of whether extractor is geo bypassable or not
 618             country = self.get_param('geo_bypass_country', None)
 619
 620             # Otherwise use random country code from geo bypass context but
 621             # only if extractor is known as geo bypassable
 622             if not country:
 623                 countries = geo_bypass_context.get('countries')
 624                 if self._GEO_BYPASS and countries:
 625                     country = random.choice(countries)
 626
 627             if country:
 628                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 629                 self._downloader.write_debug(
 630                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 631
 632     def extract(self, url):
 633         """Extracts URL information and returns it in list of dicts."""
 634         try:
 635             for _ in range(2):
 636                 try:
 637                     self.initialize()
 638                     self.write_debug('Extracting URL: %s' % url)
 639                     ie_result = self._real_extract(url)
 640                     if ie_result is None:
 641                         return None
 642                     if self._x_forwarded_for_ip:
 643                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 644                     subtitles = ie_result.get('subtitles')
 645                     if (subtitles and 'live_chat' in subtitles
 646                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 647                         del subtitles['live_chat']
 648                     return ie_result
 649                 except GeoRestrictedError as e:
 650                     if self.__maybe_fake_ip_and_retry(e.countries):
 651                         continue
 652                     raise
 653         except UnsupportedError:
 654             raise
 655         except ExtractorError as e:
 656             kwargs = {
 657                 'video_id': e.video_id or self.get_temp_id(url),
 658                 'ie': self.IE_NAME,
 659                 'tb': e.traceback or sys.exc_info()[2],
 660                 'expected': e.expected,
 661                 'cause': e.cause
 662             }
 663             if hasattr(e, 'countries'):
 664                 kwargs['countries'] = e.countries
 665             raise type(e)(e.orig_msg, **kwargs)
 666         except http.client.IncompleteRead as e:
 667             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 668         except (KeyError, StopIteration) as e:
 669             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 670
 671     def __maybe_fake_ip_and_retry(self, countries):
 672         if (not self.get_param('geo_bypass_country', None)
 673                 and self._GEO_BYPASS
 674                 and self.get_param('geo_bypass', True)
 675                 and not self._x_forwarded_for_ip
 676                 and countries):
 677             country_code = random.choice(countries)
 678             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 679             if self._x_forwarded_for_ip:
 680                 self.report_warning(
 681                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 682                     % (self._x_forwarded_for_ip, country_code.upper()))
 683                 return True
 684         return False
 685
 686     def set_downloader(self, downloader):
 687         """Sets a YoutubeDL instance as the downloader for this IE."""
 688         self._downloader = downloader
 689
 690     @property
 691     def cache(self):
 692         return self._downloader.cache
 693
 694     @property
 695     def cookiejar(self):
 696         return self._downloader.cookiejar
 697
 698     def _initialize_pre_login(self):
 699         """ Intialization before login. Redefine in subclasses."""
 700         pass
 701
 702     def _perform_login(self, username, password):
 703         """ Login with username and password. Redefine in subclasses."""
 704         pass
 705
 706     def _real_initialize(self):
 707         """Real initialization process. Redefine in subclasses."""
 708         pass
 709
 710     def _real_extract(self, url):
 711         """Real extraction process. Redefine in subclasses."""
 712         raise NotImplementedError('This method must be implemented by subclasses')
 713
 714     @classmethod
 715     def ie_key(cls):
 716         """A string for getting the InfoExtractor with get_info_extractor"""
 717         return cls.__name__[:-2]
 718
 719     @classproperty
 720     def IE_NAME(cls):
 721         return cls.__name__[:-2]
 722
 723     @staticmethod
 724     def __can_accept_status_code(err, expected_status):
 725         assert isinstance(err, urllib.error.HTTPError)
 726         if expected_status is None:
 727             return False
 728         elif callable(expected_status):
 729             return expected_status(err.code) is True
 730         else:
 731             return err.code in variadic(expected_status)
 732
 733     def _create_request(self, url_or_request, data=None, headers={}, query={}):
 734         if isinstance(url_or_request, urllib.request.Request):
 735             return update_Request(url_or_request, data=data, headers=headers, query=query)
 736         if query:
 737             url_or_request = update_url_query(url_or_request, query)
 738         return sanitized_Request(url_or_request, data, headers)
 739
 740     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 741         """
 742         Return the response handle.
 743
 744         See _download_webpage docstring for arguments specification.
 745         """
 746         if not self._downloader._first_webpage_request:
 747             sleep_interval = self.get_param('sleep_interval_requests') or 0
 748             if sleep_interval > 0:
 749                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 750                 time.sleep(sleep_interval)
 751         else:
 752             self._downloader._first_webpage_request = False
 753
 754         if note is None:
 755             self.report_download_webpage(video_id)
 756         elif note is not False:
 757             if video_id is None:
 758                 self.to_screen(str(note))
 759             else:
 760                 self.to_screen(f'{video_id}: {note}')
 761
 762         # Some sites check X-Forwarded-For HTTP header in order to figure out
 763         # the origin of the client behind proxy. This allows bypassing geo
 764         # restriction by faking this header's value to IP that belongs to some
 765         # geo unrestricted country. We will do so once we encounter any
 766         # geo restriction error.
 767         if self._x_forwarded_for_ip:
 768             if 'X-Forwarded-For' not in headers:
 769                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 770
 771         try:
 772             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 773         except network_exceptions as err:
 774             if isinstance(err, urllib.error.HTTPError):
 775                 if self.__can_accept_status_code(err, expected_status):
 776                     # Retain reference to error to prevent file object from
 777                     # being closed before it can be read. Works around the
 778                     # effects of <https://bugs.python.org/issue15002>
 779                     # introduced in Python 3.4.1.
 780                     err.fp._error = err
 781                     return err.fp
 782
 783             if errnote is False:
 784                 return False
 785             if errnote is None:
 786                 errnote = 'Unable to download webpage'
 787
 788             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 789             if fatal:
 790                 raise ExtractorError(errmsg, cause=err)
 791             else:
 792                 self.report_warning(errmsg)
 793                 return False
 794
 795     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 796                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 797         """
 798         Return a tuple (page content as string, URL handle).
 799
 800         Arguments:
 801         url_or_request -- plain text URL as a string or
 802             a urllib.request.Request object
 803         video_id -- Video/playlist/item identifier (string)
 804
 805         Keyword arguments:
 806         note -- note printed before downloading (string)
 807         errnote -- note printed in case of an error (string)
 808         fatal -- flag denoting whether error should be considered fatal,
 809             i.e. whether it should cause ExtractionError to be raised,
 810             otherwise a warning will be reported and extraction continued
 811         encoding -- encoding for a page content decoding, guessed automatically
 812             when not explicitly specified
 813         data -- POST data (bytes)
 814         headers -- HTTP headers (dict)
 815         query -- URL query (dict)
 816         expected_status -- allows to accept failed HTTP requests (non 2xx
 817             status code) by explicitly specifying a set of accepted status
 818             codes. Can be any of the following entities:
 819                 - an integer type specifying an exact failed status code to
 820                   accept
 821                 - a list or a tuple of integer types specifying a list of
 822                   failed status codes to accept
 823                 - a callable accepting an actual failed status code and
 824                   returning True if it should be accepted
 825             Note that this argument does not affect success status codes (2xx)
 826             which are always accepted.
 827         """
 828
 829         # Strip hashes from the URL (#1038)
 830         if isinstance(url_or_request, str):
 831             url_or_request = url_or_request.partition('#')[0]
 832
 833         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 834         if urlh is False:
 835             assert not fatal
 836             return False
 837         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 838         return (content, urlh)
 839
 840     @staticmethod
 841     def _guess_encoding_from_content(content_type, webpage_bytes):
 842         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 843         if m:
 844             encoding = m.group(1)
 845         else:
 846             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 847                           webpage_bytes[:1024])
 848             if m:
 849                 encoding = m.group(1).decode('ascii')
 850             elif webpage_bytes.startswith(b'\xff\xfe'):
 851                 encoding = 'utf-16'
 852             else:
 853                 encoding = 'utf-8'
 854
 855         return encoding
 856
 857     def __check_blocked(self, content):
 858         first_block = content[:512]
 859         if ('<title>Access to this site is blocked</title>' in content
 860                 and 'Websense' in first_block):
 861             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 862             blocked_iframe = self._html_search_regex(
 863                 r'<iframe src="([^"]+)"', content,
 864                 'Websense information URL', default=None)
 865             if blocked_iframe:
 866                 msg += ' Visit %s for more details' % blocked_iframe
 867             raise ExtractorError(msg, expected=True)
 868         if '<title>The URL you requested has been blocked</title>' in first_block:
 869             msg = (
 870                 'Access to this webpage has been blocked by Indian censorship. '
 871                 'Use a VPN or proxy server (with --proxy) to route around it.')
 872             block_msg = self._html_search_regex(
 873                 r'</h1><p>(.*?)</p>',
 874                 content, 'block message', default=None)
 875             if block_msg:
 876                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 877             raise ExtractorError(msg, expected=True)
 878         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 879                 and 'blocklist.rkn.gov.ru' in content):
 880             raise ExtractorError(
 881                 'Access to this webpage has been blocked by decision of the Russian government. '
 882                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 883                 expected=True)
 884
 885     def _request_dump_filename(self, url, video_id):
 886         basen = f'{video_id}_{url}'
 887         trim_length = self.get_param('trim_file_name') or 240
 888         if len(basen) > trim_length:
 889             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 890             basen = basen[:trim_length - len(h)] + h
 891         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 892         # Working around MAX_PATH limitation on Windows (see
 893         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 894         if compat_os_name == 'nt':
 895             absfilepath = os.path.abspath(filename)
 896             if len(absfilepath) > 259:
 897                 filename = fR'\\?\{absfilepath}'
 898         return filename
 899
 900     def __decode_webpage(self, webpage_bytes, encoding, headers):
 901         if not encoding:
 902             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 903         try:
 904             return webpage_bytes.decode(encoding, 'replace')
 905         except LookupError:
 906             return webpage_bytes.decode('utf-8', 'replace')
 907
 908     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 909         webpage_bytes = urlh.read()
 910         if prefix is not None:
 911             webpage_bytes = prefix + webpage_bytes
 912         if self.get_param('dump_intermediate_pages', False):
 913             self.to_screen('Dumping request to ' + urlh.geturl())
 914             dump = base64.b64encode(webpage_bytes).decode('ascii')
 915             self._downloader.to_screen(dump)
 916         if self.get_param('write_pages'):
 917             filename = self._request_dump_filename(urlh.geturl(), video_id)
 918             self.to_screen(f'Saving request to {filename}')
 919             with open(filename, 'wb') as outf:
 920                 outf.write(webpage_bytes)
 921
 922         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 923         self.__check_blocked(content)
 924
 925         return content
 926
 927     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 928         if transform_source:
 929             xml_string = transform_source(xml_string)
 930         try:
 931             return compat_etree_fromstring(xml_string.encode('utf-8'))
 932         except xml.etree.ElementTree.ParseError as ve:
 933             errmsg = '%s: Failed to parse XML ' % video_id
 934             if fatal:
 935                 raise ExtractorError(errmsg, cause=ve)
 936             else:
 937                 self.report_warning(errmsg + str(ve))
 938
 939     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
 940         try:
 941             return json.loads(
 942                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 943         except ValueError as ve:
 944             errmsg = f'{video_id}: Failed to parse JSON'
 945             if fatal:
 946                 raise ExtractorError(errmsg, cause=ve)
 947             else:
 948                 self.report_warning(f'{errmsg}: {ve}')
 949
 950     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 951         return self._parse_json(
 952             data[data.find('{'):data.rfind('}') + 1],
 953             video_id, transform_source, fatal)
 954
 955     def __create_download_methods(name, parser, note, errnote, return_value):
 956
 957         def parse(ie, content, *args, **kwargs):
 958             if parser is None:
 959                 return content
 960             # parser is fetched by name so subclasses can override it
 961             return getattr(ie, parser)(content, *args, **kwargs)
 962
 963         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
 964                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 965             res = self._download_webpage_handle(
 966                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
 967                 data=data, headers=headers, query=query, expected_status=expected_status)
 968             if res is False:
 969                 return res
 970             content, urlh = res
 971             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal), urlh
 972
 973         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
 974                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 975             if self.get_param('load_pages'):
 976                 url_or_request = self._create_request(url_or_request, data, headers, query)
 977                 filename = self._request_dump_filename(url_or_request.full_url, video_id)
 978                 self.to_screen(f'Loading request from {filename}')
 979                 try:
 980                     with open(filename, 'rb') as dumpf:
 981                         webpage_bytes = dumpf.read()
 982                 except OSError as e:
 983                     self.report_warning(f'Unable to load request from disk: {e}')
 984                 else:
 985                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
 986                     return parse(self, content, video_id, transform_source, fatal)
 987             kwargs = {
 988                 'note': note,
 989                 'errnote': errnote,
 990                 'transform_source': transform_source,
 991                 'fatal': fatal,
 992                 'encoding': encoding,
 993                 'data': data,
 994                 'headers': headers,
 995                 'query': query,
 996                 'expected_status': expected_status,
 997             }
 998             if parser is None:
 999                 kwargs.pop('transform_source')
1000             # The method is fetched by name so subclasses can override _download_..._handle
1001             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1002             return res if res is False else res[0]
1003
1004         def impersonate(func, name, return_value):
1005             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1006             func.__doc__ = f'''
1007                 @param transform_source     Apply this transformation before parsing
1008                 @returns                    {return_value}
1009
1010                 See _download_webpage_handle docstring for other arguments specification
1011             '''
1012
1013         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1014         impersonate(download_content, f'_download_{name}', f'{return_value}')
1015         return download_handle, download_content
1016
1017     _download_xml_handle, _download_xml = __create_download_methods(
1018         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1019     _download_json_handle, _download_json = __create_download_methods(
1020         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1021     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1022         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1023     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1024
1025     def _download_webpage(
1026             self, url_or_request, video_id, note=None, errnote=None,
1027             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1028         """
1029         Return the data of the page as a string.
1030
1031         Keyword arguments:
1032         tries -- number of tries
1033         timeout -- sleep interval between tries
1034
1035         See _download_webpage_handle docstring for other arguments specification.
1036         """
1037
1038         R''' # NB: These are unused; should they be deprecated?
1039         if tries != 1:
1040             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1041         if timeout is NO_DEFAULT:
1042             timeout = 5
1043         else:
1044             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1045         '''
1046
1047         try_count = 0
1048         while True:
1049             try:
1050                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1051             except http.client.IncompleteRead as e:
1052                 try_count += 1
1053                 if try_count >= tries:
1054                     raise e
1055                 self._sleep(timeout, video_id)
1056
1057     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1058         idstr = format_field(video_id, None, '%s: ')
1059         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1060         if only_once:
1061             if f'WARNING: {msg}' in self._printed_messages:
1062                 return
1063             self._printed_messages.add(f'WARNING: {msg}')
1064         self._downloader.report_warning(msg, *args, **kwargs)
1065
1066     def to_screen(self, msg, *args, **kwargs):
1067         """Print msg to screen, prefixing it with '[ie_name]'"""
1068         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1069
1070     def write_debug(self, msg, *args, **kwargs):
1071         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1072
1073     def get_param(self, name, default=None, *args, **kwargs):
1074         if self._downloader:
1075             return self._downloader.params.get(name, default, *args, **kwargs)
1076         return default
1077
1078     def report_drm(self, video_id, partial=False):
1079         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1080
1081     def report_extraction(self, id_or_name):
1082         """Report information extraction."""
1083         self.to_screen('%s: Extracting information' % id_or_name)
1084
1085     def report_download_webpage(self, video_id):
1086         """Report webpage download."""
1087         self.to_screen('%s: Downloading webpage' % video_id)
1088
1089     def report_age_confirmation(self):
1090         """Report attempt to confirm age."""
1091         self.to_screen('Confirming age')
1092
1093     def report_login(self):
1094         """Report attempt to log in."""
1095         self.to_screen('Logging in')
1096
1097     def raise_login_required(
1098             self, msg='This video is only available for registered users',
1099             metadata_available=False, method=NO_DEFAULT):
1100         if metadata_available and (
1101                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1102             self.report_warning(msg)
1103             return
1104         msg += format_field(self._login_hint(method), None, '. %s')
1105         raise ExtractorError(msg, expected=True)
1106
1107     def raise_geo_restricted(
1108             self, msg='This video is not available from your location due to geo restriction',
1109             countries=None, metadata_available=False):
1110         if metadata_available and (
1111                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1112             self.report_warning(msg)
1113         else:
1114             raise GeoRestrictedError(msg, countries=countries)
1115
1116     def raise_no_formats(self, msg, expected=False, video_id=None):
1117         if expected and (
1118                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1119             self.report_warning(msg, video_id)
1120         elif isinstance(msg, ExtractorError):
1121             raise msg
1122         else:
1123             raise ExtractorError(msg, expected=expected, video_id=video_id)
1124
1125     # Methods for following #608
1126     @staticmethod
1127     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1128         """Returns a URL that points to a page that should be processed"""
1129         if ie is not None:
1130             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1131         if video_id is not None:
1132             kwargs['id'] = video_id
1133         if video_title is not None:
1134             kwargs['title'] = video_title
1135         return {
1136             **kwargs,
1137             '_type': 'url_transparent' if url_transparent else 'url',
1138             'url': url,
1139         }
1140
1141     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1142         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1143                 for m in orderedSet(map(getter, matches) if getter else matches))
1144         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1145
1146     @staticmethod
1147     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1148         """Returns a playlist"""
1149         if playlist_id:
1150             kwargs['id'] = playlist_id
1151         if playlist_title:
1152             kwargs['title'] = playlist_title
1153         if playlist_description is not None:
1154             kwargs['description'] = playlist_description
1155         return {
1156             **kwargs,
1157             '_type': 'multi_video' if multi_video else 'playlist',
1158             'entries': entries,
1159         }
1160
1161     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1162         """
1163         Perform a regex search on the given string, using a single or a list of
1164         patterns returning the first matching group.
1165         In case of failure return a default value or raise a WARNING or a
1166         RegexNotFoundError, depending on fatal, specifying the field name.
1167         """
1168         if string is None:
1169             mobj = None
1170         elif isinstance(pattern, (str, re.Pattern)):
1171             mobj = re.search(pattern, string, flags)
1172         else:
1173             for p in pattern:
1174                 mobj = re.search(p, string, flags)
1175                 if mobj:
1176                     break
1177
1178         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1179
1180         if mobj:
1181             if group is None:
1182                 # return the first matching group
1183                 return next(g for g in mobj.groups() if g is not None)
1184             elif isinstance(group, (list, tuple)):
1185                 return tuple(mobj.group(g) for g in group)
1186             else:
1187                 return mobj.group(group)
1188         elif default is not NO_DEFAULT:
1189             return default
1190         elif fatal:
1191             raise RegexNotFoundError('Unable to extract %s' % _name)
1192         else:
1193             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1194             return None
1195
1196     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1197                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1198         """Searches string for the JSON object specified by start_pattern"""
1199         # NB: end_pattern is only used to reduce the size of the initial match
1200         if default is NO_DEFAULT:
1201             default, has_default = {}, False
1202         else:
1203             fatal, has_default = False, True
1204
1205         json_string = self._search_regex(
1206             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1207             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1208         if not json_string:
1209             return default
1210
1211         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1212         try:
1213             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1214         except ExtractorError as e:
1215             if fatal:
1216                 raise ExtractorError(
1217                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1218             elif not has_default:
1219                 self.report_warning(
1220                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1221         return default
1222
1223     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1224         """
1225         Like _search_regex, but strips HTML tags and unescapes entities.
1226         """
1227         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1228         if res:
1229             return clean_html(res).strip()
1230         else:
1231             return res
1232
1233     def _get_netrc_login_info(self, netrc_machine=None):
1234         username = None
1235         password = None
1236         netrc_machine = netrc_machine or self._NETRC_MACHINE
1237
1238         if self.get_param('usenetrc', False):
1239             try:
1240                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1241                 if os.path.isdir(netrc_file):
1242                     netrc_file = os.path.join(netrc_file, '.netrc')
1243                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1244                 if info is not None:
1245                     username = info[0]
1246                     password = info[2]
1247                 else:
1248                     raise netrc.NetrcParseError(
1249                         'No authenticators for %s' % netrc_machine)
1250             except (OSError, netrc.NetrcParseError) as err:
1251                 self.report_warning(
1252                     'parsing .netrc: %s' % error_to_compat_str(err))
1253
1254         return username, password
1255
1256     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1257         """
1258         Get the login info as (username, password)
1259         First look for the manually specified credentials using username_option
1260         and password_option as keys in params dictionary. If no such credentials
1261         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1262         value.
1263         If there's no info available, return (None, None)
1264         """
1265
1266         # Attempt to use provided username and password or .netrc data
1267         username = self.get_param(username_option)
1268         if username is not None:
1269             password = self.get_param(password_option)
1270         else:
1271             username, password = self._get_netrc_login_info(netrc_machine)
1272
1273         return username, password
1274
1275     def _get_tfa_info(self, note='two-factor verification code'):
1276         """
1277         Get the two-factor authentication info
1278         TODO - asking the user will be required for sms/phone verify
1279         currently just uses the command line option
1280         If there's no info available, return None
1281         """
1282
1283         tfa = self.get_param('twofactor')
1284         if tfa is not None:
1285             return tfa
1286
1287         return getpass.getpass('Type %s and press [Return]: ' % note)
1288
1289     # Helper functions for extracting OpenGraph info
1290     @staticmethod
1291     def _og_regexes(prop):
1292         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1293         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1294                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1295         template = r'<meta[^>]+?%s[^>]+?%s'
1296         return [
1297             template % (property_re, content_re),
1298             template % (content_re, property_re),
1299         ]
1300
1301     @staticmethod
1302     def _meta_regex(prop):
1303         return r'''(?isx)<meta
1304                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1305                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1306
1307     def _og_search_property(self, prop, html, name=None, **kargs):
1308         prop = variadic(prop)
1309         if name is None:
1310             name = 'OpenGraph %s' % prop[0]
1311         og_regexes = []
1312         for p in prop:
1313             og_regexes.extend(self._og_regexes(p))
1314         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1315         if escaped is None:
1316             return None
1317         return unescapeHTML(escaped)
1318
1319     def _og_search_thumbnail(self, html, **kargs):
1320         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1321
1322     def _og_search_description(self, html, **kargs):
1323         return self._og_search_property('description', html, fatal=False, **kargs)
1324
1325     def _og_search_title(self, html, *, fatal=False, **kargs):
1326         return self._og_search_property('title', html, fatal=fatal, **kargs)
1327
1328     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1329         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1330         if secure:
1331             regexes = self._og_regexes('video:secure_url') + regexes
1332         return self._html_search_regex(regexes, html, name, **kargs)
1333
1334     def _og_search_url(self, html, **kargs):
1335         return self._og_search_property('url', html, **kargs)
1336
1337     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1338         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1339
1340     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1341         name = variadic(name)
1342         if display_name is None:
1343             display_name = name[0]
1344         return self._html_search_regex(
1345             [self._meta_regex(n) for n in name],
1346             html, display_name, fatal=fatal, group='content', **kwargs)
1347
1348     def _dc_search_uploader(self, html):
1349         return self._html_search_meta('dc.creator', html, 'uploader')
1350
1351     def _rta_search(self, html):
1352         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1353         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1354                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1355                      html):
1356             return 18
1357         return 0
1358
1359     def _media_rating_search(self, html):
1360         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1361         rating = self._html_search_meta('rating', html)
1362
1363         if not rating:
1364             return None
1365
1366         RATING_TABLE = {
1367             'safe for kids': 0,
1368             'general': 8,
1369             '14 years': 14,
1370             'mature': 17,
1371             'restricted': 19,
1372         }
1373         return RATING_TABLE.get(rating.lower())
1374
1375     def _family_friendly_search(self, html):
1376         # See http://schema.org/VideoObject
1377         family_friendly = self._html_search_meta(
1378             'isFamilyFriendly', html, default=None)
1379
1380         if not family_friendly:
1381             return None
1382
1383         RATING_TABLE = {
1384             '1': 0,
1385             'true': 0,
1386             '0': 18,
1387             'false': 18,
1388         }
1389         return RATING_TABLE.get(family_friendly.lower())
1390
1391     def _twitter_search_player(self, html):
1392         return self._html_search_meta('twitter:player', html,
1393                                       'twitter card player')
1394
1395     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1396         """Yield all json ld objects in the html"""
1397         if default is not NO_DEFAULT:
1398             fatal = False
1399         for mobj in re.finditer(JSON_LD_RE, html):
1400             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1401             for json_ld in variadic(json_ld_item):
1402                 if isinstance(json_ld, dict):
1403                     yield json_ld
1404
1405     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1406         """Search for a video in any json ld in the html"""
1407         if default is not NO_DEFAULT:
1408             fatal = False
1409         info = self._json_ld(
1410             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1411             video_id, fatal=fatal, expected_type=expected_type)
1412         if info:
1413             return info
1414         if default is not NO_DEFAULT:
1415             return default
1416         elif fatal:
1417             raise RegexNotFoundError('Unable to extract JSON-LD')
1418         else:
1419             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1420             return {}
1421
1422     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1423         if isinstance(json_ld, str):
1424             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1425         if not json_ld:
1426             return {}
1427         info = {}
1428         if not isinstance(json_ld, (list, tuple, dict)):
1429             return info
1430         if isinstance(json_ld, dict):
1431             json_ld = [json_ld]
1432
1433         INTERACTION_TYPE_MAP = {
1434             'CommentAction': 'comment',
1435             'AgreeAction': 'like',
1436             'DisagreeAction': 'dislike',
1437             'LikeAction': 'like',
1438             'DislikeAction': 'dislike',
1439             'ListenAction': 'view',
1440             'WatchAction': 'view',
1441             'ViewAction': 'view',
1442         }
1443
1444         def is_type(e, *expected_types):
1445             type = variadic(traverse_obj(e, '@type'))
1446             return any(x in type for x in expected_types)
1447
1448         def extract_interaction_type(e):
1449             interaction_type = e.get('interactionType')
1450             if isinstance(interaction_type, dict):
1451                 interaction_type = interaction_type.get('@type')
1452             return str_or_none(interaction_type)
1453
1454         def extract_interaction_statistic(e):
1455             interaction_statistic = e.get('interactionStatistic')
1456             if isinstance(interaction_statistic, dict):
1457                 interaction_statistic = [interaction_statistic]
1458             if not isinstance(interaction_statistic, list):
1459                 return
1460             for is_e in interaction_statistic:
1461                 if not is_type(is_e, 'InteractionCounter'):
1462                     continue
1463                 interaction_type = extract_interaction_type(is_e)
1464                 if not interaction_type:
1465                     continue
1466                 # For interaction count some sites provide string instead of
1467                 # an integer (as per spec) with non digit characters (e.g. ",")
1468                 # so extracting count with more relaxed str_to_int
1469                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1470                 if interaction_count is None:
1471                     continue
1472                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1473                 if not count_kind:
1474                     continue
1475                 count_key = '%s_count' % count_kind
1476                 if info.get(count_key) is not None:
1477                     continue
1478                 info[count_key] = interaction_count
1479
1480         def extract_chapter_information(e):
1481             chapters = [{
1482                 'title': part.get('name'),
1483                 'start_time': part.get('startOffset'),
1484                 'end_time': part.get('endOffset'),
1485             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1486             for idx, (last_c, current_c, next_c) in enumerate(zip(
1487                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1488                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1489                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1490                 if None in current_c.values():
1491                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1492                     return
1493             if chapters:
1494                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1495                 info['chapters'] = chapters
1496
1497         def extract_video_object(e):
1498             assert is_type(e, 'VideoObject')
1499             author = e.get('author')
1500             info.update({
1501                 'url': url_or_none(e.get('contentUrl')),
1502                 'title': unescapeHTML(e.get('name')),
1503                 'description': unescapeHTML(e.get('description')),
1504                 'thumbnails': [{'url': url}
1505                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1506                                if url_or_none(url)],
1507                 'duration': parse_duration(e.get('duration')),
1508                 'timestamp': unified_timestamp(e.get('uploadDate')),
1509                 # author can be an instance of 'Organization' or 'Person' types.
1510                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1511                 # however some websites are using 'Text' type instead.
1512                 # 1. https://schema.org/VideoObject
1513                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1514                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1515                 'tbr': int_or_none(e.get('bitrate')),
1516                 'width': int_or_none(e.get('width')),
1517                 'height': int_or_none(e.get('height')),
1518                 'view_count': int_or_none(e.get('interactionCount')),
1519             })
1520             extract_interaction_statistic(e)
1521             extract_chapter_information(e)
1522
1523         def traverse_json_ld(json_ld, at_top_level=True):
1524             for e in json_ld:
1525                 if at_top_level and '@context' not in e:
1526                     continue
1527                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1528                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1529                     break
1530                 if expected_type is not None and not is_type(e, expected_type):
1531                     continue
1532                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1533                 if rating is not None:
1534                     info['average_rating'] = rating
1535                 if is_type(e, 'TVEpisode', 'Episode'):
1536                     episode_name = unescapeHTML(e.get('name'))
1537                     info.update({
1538                         'episode': episode_name,
1539                         'episode_number': int_or_none(e.get('episodeNumber')),
1540                         'description': unescapeHTML(e.get('description')),
1541                     })
1542                     if not info.get('title') and episode_name:
1543                         info['title'] = episode_name
1544                     part_of_season = e.get('partOfSeason')
1545                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1546                         info.update({
1547                             'season': unescapeHTML(part_of_season.get('name')),
1548                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1549                         })
1550                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1551                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1552                         info['series'] = unescapeHTML(part_of_series.get('name'))
1553                 elif is_type(e, 'Movie'):
1554                     info.update({
1555                         'title': unescapeHTML(e.get('name')),
1556                         'description': unescapeHTML(e.get('description')),
1557                         'duration': parse_duration(e.get('duration')),
1558                         'timestamp': unified_timestamp(e.get('dateCreated')),
1559                     })
1560                 elif is_type(e, 'Article', 'NewsArticle'):
1561                     info.update({
1562                         'timestamp': parse_iso8601(e.get('datePublished')),
1563                         'title': unescapeHTML(e.get('headline')),
1564                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1565                     })
1566                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1567                         extract_video_object(e['video'][0])
1568                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1569                         extract_video_object(e['subjectOf'][0])
1570                 elif is_type(e, 'VideoObject'):
1571                     extract_video_object(e)
1572                     if expected_type is None:
1573                         continue
1574                     else:
1575                         break
1576                 video = e.get('video')
1577                 if is_type(video, 'VideoObject'):
1578                     extract_video_object(video)
1579                 if expected_type is None:
1580                     continue
1581                 else:
1582                     break
1583         traverse_json_ld(json_ld)
1584
1585         return filter_dict(info)
1586
1587     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1588         return self._parse_json(
1589             self._search_regex(
1590                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1591                 webpage, 'next.js data', fatal=fatal, **kw),
1592             video_id, transform_source=transform_source, fatal=fatal)
1593
1594     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1595         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1596         rectx = re.escape(context_name)
1597         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1598         js, arg_keys, arg_vals = self._search_regex(
1599             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1600             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1601
1602         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1603
1604         for key, val in args.items():
1605             if val in ('undefined', 'void 0'):
1606                 args[key] = 'null'
1607
1608         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1609         return traverse_obj(ret, traverse) or {}
1610
1611     @staticmethod
1612     def _hidden_inputs(html):
1613         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1614         hidden_inputs = {}
1615         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1616             attrs = extract_attributes(input)
1617             if not input:
1618                 continue
1619             if attrs.get('type') not in ('hidden', 'submit'):
1620                 continue
1621             name = attrs.get('name') or attrs.get('id')
1622             value = attrs.get('value')
1623             if name and value is not None:
1624                 hidden_inputs[name] = value
1625         return hidden_inputs
1626
1627     def _form_hidden_inputs(self, form_id, html):
1628         form = self._search_regex(
1629             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1630             html, '%s form' % form_id, group='form')
1631         return self._hidden_inputs(form)
1632
1633     class FormatSort:
1634         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1635
1636         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1637                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1638                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1639         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1640                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1641                         'fps', 'fs_approx', 'source', 'id')
1642
1643         settings = {
1644             'vcodec': {'type': 'ordered', 'regex': True,
1645                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1646             'acodec': {'type': 'ordered', 'regex': True,
1647                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1648             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1649                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1650             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1651                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1652             'vext': {'type': 'ordered', 'field': 'video_ext',
1653                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1654                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1655             'aext': {'type': 'ordered', 'field': 'audio_ext',
1656                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1657                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1658             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1659             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1660                            'field': ('vcodec', 'acodec'),
1661                            'function': lambda it: int(any(v != 'none' for v in it))},
1662             'ie_pref': {'priority': True, 'type': 'extractor'},
1663             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1664             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1665             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1666             'quality': {'convert': 'float', 'default': -1},
1667             'filesize': {'convert': 'bytes'},
1668             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1669             'id': {'convert': 'string', 'field': 'format_id'},
1670             'height': {'convert': 'float_none'},
1671             'width': {'convert': 'float_none'},
1672             'fps': {'convert': 'float_none'},
1673             'tbr': {'convert': 'float_none'},
1674             'vbr': {'convert': 'float_none'},
1675             'abr': {'convert': 'float_none'},
1676             'asr': {'convert': 'float_none'},
1677             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1678
1679             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1680             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1681             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1682             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1683             'res': {'type': 'multiple', 'field': ('height', 'width'),
1684                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1685
1686             # For compatibility with youtube-dl
1687             'format_id': {'type': 'alias', 'field': 'id'},
1688             'preference': {'type': 'alias', 'field': 'ie_pref'},
1689             'language_preference': {'type': 'alias', 'field': 'lang'},
1690             'source_preference': {'type': 'alias', 'field': 'source'},
1691             'protocol': {'type': 'alias', 'field': 'proto'},
1692             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1693
1694             # Deprecated
1695             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1696             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1697             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1698             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1699             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1700             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1701             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1702             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1703             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1704             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1705             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1706             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1707             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1708             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1709             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1710             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1711             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1712             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1713             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1714             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1715         }
1716
1717         def __init__(self, ie, field_preference):
1718             self._order = []
1719             self.ydl = ie._downloader
1720             self.evaluate_params(self.ydl.params, field_preference)
1721             if ie.get_param('verbose'):
1722                 self.print_verbose_info(self.ydl.write_debug)
1723
1724         def _get_field_setting(self, field, key):
1725             if field not in self.settings:
1726                 if key in ('forced', 'priority'):
1727                     return False
1728                 self.ydl.deprecation_warning(
1729                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1730                     'and may be removed in a future version')
1731                 self.settings[field] = {}
1732             propObj = self.settings[field]
1733             if key not in propObj:
1734                 type = propObj.get('type')
1735                 if key == 'field':
1736                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1737                 elif key == 'convert':
1738                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1739                 else:
1740                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1741                 propObj[key] = default
1742             return propObj[key]
1743
1744         def _resolve_field_value(self, field, value, convertNone=False):
1745             if value is None:
1746                 if not convertNone:
1747                     return None
1748             else:
1749                 value = value.lower()
1750             conversion = self._get_field_setting(field, 'convert')
1751             if conversion == 'ignore':
1752                 return None
1753             if conversion == 'string':
1754                 return value
1755             elif conversion == 'float_none':
1756                 return float_or_none(value)
1757             elif conversion == 'bytes':
1758                 return FileDownloader.parse_bytes(value)
1759             elif conversion == 'order':
1760                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1761                 use_regex = self._get_field_setting(field, 'regex')
1762                 list_length = len(order_list)
1763                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1764                 if use_regex and value is not None:
1765                     for i, regex in enumerate(order_list):
1766                         if regex and re.match(regex, value):
1767                             return list_length - i
1768                     return list_length - empty_pos  # not in list
1769                 else:  # not regex or  value = None
1770                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1771             else:
1772                 if value.isnumeric():
1773                     return float(value)
1774                 else:
1775                     self.settings[field]['convert'] = 'string'
1776                     return value
1777
1778         def evaluate_params(self, params, sort_extractor):
1779             self._use_free_order = params.get('prefer_free_formats', False)
1780             self._sort_user = params.get('format_sort', [])
1781             self._sort_extractor = sort_extractor
1782
1783             def add_item(field, reverse, closest, limit_text):
1784                 field = field.lower()
1785                 if field in self._order:
1786                     return
1787                 self._order.append(field)
1788                 limit = self._resolve_field_value(field, limit_text)
1789                 data = {
1790                     'reverse': reverse,
1791                     'closest': False if limit is None else closest,
1792                     'limit_text': limit_text,
1793                     'limit': limit}
1794                 if field in self.settings:
1795                     self.settings[field].update(data)
1796                 else:
1797                     self.settings[field] = data
1798
1799             sort_list = (
1800                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1801                 + (tuple() if params.get('format_sort_force', False)
1802                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1803                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1804
1805             for item in sort_list:
1806                 match = re.match(self.regex, item)
1807                 if match is None:
1808                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1809                 field = match.group('field')
1810                 if field is None:
1811                     continue
1812                 if self._get_field_setting(field, 'type') == 'alias':
1813                     alias, field = field, self._get_field_setting(field, 'field')
1814                     if self._get_field_setting(alias, 'deprecated'):
1815                         self.ydl.deprecation_warning(
1816                             f'Format sorting alias {alias} is deprecated '
1817                             f'and may be removed in a future version. Please use {field} instead')
1818                 reverse = match.group('reverse') is not None
1819                 closest = match.group('separator') == '~'
1820                 limit_text = match.group('limit')
1821
1822                 has_limit = limit_text is not None
1823                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1824                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1825
1826                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1827                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1828                 limit_count = len(limits)
1829                 for (i, f) in enumerate(fields):
1830                     add_item(f, reverse, closest,
1831                              limits[i] if i < limit_count
1832                              else limits[0] if has_limit and not has_multiple_limits
1833                              else None)
1834
1835         def print_verbose_info(self, write_debug):
1836             if self._sort_user:
1837                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1838             if self._sort_extractor:
1839                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1840             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1841                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1842                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1843                               self._get_field_setting(field, 'limit_text'),
1844                               self._get_field_setting(field, 'limit'))
1845                 if self._get_field_setting(field, 'limit_text') is not None else '')
1846                 for field in self._order if self._get_field_setting(field, 'visible')]))
1847
1848         def _calculate_field_preference_from_value(self, format, field, type, value):
1849             reverse = self._get_field_setting(field, 'reverse')
1850             closest = self._get_field_setting(field, 'closest')
1851             limit = self._get_field_setting(field, 'limit')
1852
1853             if type == 'extractor':
1854                 maximum = self._get_field_setting(field, 'max')
1855                 if value is None or (maximum is not None and value >= maximum):
1856                     value = -1
1857             elif type == 'boolean':
1858                 in_list = self._get_field_setting(field, 'in_list')
1859                 not_in_list = self._get_field_setting(field, 'not_in_list')
1860                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1861             elif type == 'ordered':
1862                 value = self._resolve_field_value(field, value, True)
1863
1864             # try to convert to number
1865             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1866             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1867             if is_num:
1868                 value = val_num
1869
1870             return ((-10, 0) if value is None
1871                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1872                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1873                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1874                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1875                     else (-1, value, 0))
1876
1877         def _calculate_field_preference(self, format, field):
1878             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1879             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1880             if type == 'multiple':
1881                 type = 'field'  # Only 'field' is allowed in multiple for now
1882                 actual_fields = self._get_field_setting(field, 'field')
1883
1884                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1885             else:
1886                 value = get_value(field)
1887             return self._calculate_field_preference_from_value(format, field, type, value)
1888
1889         def calculate_preference(self, format):
1890             # Determine missing protocol
1891             if not format.get('protocol'):
1892                 format['protocol'] = determine_protocol(format)
1893
1894             # Determine missing ext
1895             if not format.get('ext') and 'url' in format:
1896                 format['ext'] = determine_ext(format['url'])
1897             if format.get('vcodec') == 'none':
1898                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1899                 format['video_ext'] = 'none'
1900             else:
1901                 format['video_ext'] = format['ext']
1902                 format['audio_ext'] = 'none'
1903             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1904             #    format['preference'] = -1000
1905
1906             # Determine missing bitrates
1907             if format.get('tbr') is None:
1908                 if format.get('vbr') is not None and format.get('abr') is not None:
1909                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1910             else:
1911                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1912                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1913                 if format.get('acodec') != 'none' and format.get('abr') is None:
1914                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1915
1916             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1917
1918     def _sort_formats(self, formats, field_preference=[]):
1919         if not formats:
1920             return
1921         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1922
1923     def _check_formats(self, formats, video_id):
1924         if formats:
1925             formats[:] = filter(
1926                 lambda f: self._is_valid_url(
1927                     f['url'], video_id,
1928                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1929                 formats)
1930
1931     @staticmethod
1932     def _remove_duplicate_formats(formats):
1933         format_urls = set()
1934         unique_formats = []
1935         for f in formats:
1936             if f['url'] not in format_urls:
1937                 format_urls.add(f['url'])
1938                 unique_formats.append(f)
1939         formats[:] = unique_formats
1940
1941     def _is_valid_url(self, url, video_id, item='video', headers={}):
1942         url = self._proto_relative_url(url, scheme='http:')
1943         # For now assume non HTTP(S) URLs always valid
1944         if not (url.startswith('http://') or url.startswith('https://')):
1945             return True
1946         try:
1947             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1948             return True
1949         except ExtractorError as e:
1950             self.to_screen(
1951                 '%s: %s URL is invalid, skipping: %s'
1952                 % (video_id, item, error_to_compat_str(e.cause)))
1953             return False
1954
1955     def http_scheme(self):
1956         """ Either "http:" or "https:", depending on the user's preferences """
1957         return (
1958             'http:'
1959             if self.get_param('prefer_insecure', False)
1960             else 'https:')
1961
1962     def _proto_relative_url(self, url, scheme=None):
1963         if url is None:
1964             return url
1965         if url.startswith('//'):
1966             if scheme is None:
1967                 scheme = self.http_scheme()
1968             return scheme + url
1969         else:
1970             return url
1971
1972     def _sleep(self, timeout, video_id, msg_template=None):
1973         if msg_template is None:
1974             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1975         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1976         self.to_screen(msg)
1977         time.sleep(timeout)
1978
1979     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1980                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1981                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1982         res = self._download_xml_handle(
1983             manifest_url, video_id, 'Downloading f4m manifest',
1984             'Unable to download f4m manifest',
1985             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1986             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1987             transform_source=transform_source,
1988             fatal=fatal, data=data, headers=headers, query=query)
1989         if res is False:
1990             return []
1991
1992         manifest, urlh = res
1993         manifest_url = urlh.geturl()
1994
1995         return self._parse_f4m_formats(
1996             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1997             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1998
1999     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2000                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2001                            fatal=True, m3u8_id=None):
2002         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2003             return []
2004
2005         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2006         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2007         if akamai_pv is not None and ';' in akamai_pv.text:
2008             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2009             if playerVerificationChallenge.strip() != '':
2010                 return []
2011
2012         formats = []
2013         manifest_version = '1.0'
2014         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2015         if not media_nodes:
2016             manifest_version = '2.0'
2017             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2018         # Remove unsupported DRM protected media from final formats
2019         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2020         media_nodes = remove_encrypted_media(media_nodes)
2021         if not media_nodes:
2022             return formats
2023
2024         manifest_base_url = get_base_url(manifest)
2025
2026         bootstrap_info = xpath_element(
2027             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2028             'bootstrap info', default=None)
2029
2030         vcodec = None
2031         mime_type = xpath_text(
2032             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2033             'base URL', default=None)
2034         if mime_type and mime_type.startswith('audio/'):
2035             vcodec = 'none'
2036
2037         for i, media_el in enumerate(media_nodes):
2038             tbr = int_or_none(media_el.attrib.get('bitrate'))
2039             width = int_or_none(media_el.attrib.get('width'))
2040             height = int_or_none(media_el.attrib.get('height'))
2041             format_id = join_nonempty(f4m_id, tbr or i)
2042             # If <bootstrapInfo> is present, the specified f4m is a
2043             # stream-level manifest, and only set-level manifests may refer to
2044             # external resources.  See section 11.4 and section 4 of F4M spec
2045             if bootstrap_info is None:
2046                 media_url = None
2047                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2048                 if manifest_version == '2.0':
2049                     media_url = media_el.attrib.get('href')
2050                 if media_url is None:
2051                     media_url = media_el.attrib.get('url')
2052                 if not media_url:
2053                     continue
2054                 manifest_url = (
2055                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2056                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2057                 # If media_url is itself a f4m manifest do the recursive extraction
2058                 # since bitrates in parent manifest (this one) and media_url manifest
2059                 # may differ leading to inability to resolve the format by requested
2060                 # bitrate in f4m downloader
2061                 ext = determine_ext(manifest_url)
2062                 if ext == 'f4m':
2063                     f4m_formats = self._extract_f4m_formats(
2064                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2065                         transform_source=transform_source, fatal=fatal)
2066                     # Sometimes stream-level manifest contains single media entry that
2067                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2068                     # At the same time parent's media entry in set-level manifest may
2069                     # contain it. We will copy it from parent in such cases.
2070                     if len(f4m_formats) == 1:
2071                         f = f4m_formats[0]
2072                         f.update({
2073                             'tbr': f.get('tbr') or tbr,
2074                             'width': f.get('width') or width,
2075                             'height': f.get('height') or height,
2076                             'format_id': f.get('format_id') if not tbr else format_id,
2077                             'vcodec': vcodec,
2078                         })
2079                     formats.extend(f4m_formats)
2080                     continue
2081                 elif ext == 'm3u8':
2082                     formats.extend(self._extract_m3u8_formats(
2083                         manifest_url, video_id, 'mp4', preference=preference,
2084                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2085                     continue
2086             formats.append({
2087                 'format_id': format_id,
2088                 'url': manifest_url,
2089                 'manifest_url': manifest_url,
2090                 'ext': 'flv' if bootstrap_info is not None else None,
2091                 'protocol': 'f4m',
2092                 'tbr': tbr,
2093                 'width': width,
2094                 'height': height,
2095                 'vcodec': vcodec,
2096                 'preference': preference,
2097                 'quality': quality,
2098             })
2099         return formats
2100
2101     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2102         return {
2103             'format_id': join_nonempty(m3u8_id, 'meta'),
2104             'url': m3u8_url,
2105             'ext': ext,
2106             'protocol': 'm3u8',
2107             'preference': preference - 100 if preference else -100,
2108             'quality': quality,
2109             'resolution': 'multiple',
2110             'format_note': 'Quality selection URL',
2111         }
2112
2113     def _report_ignoring_subs(self, name):
2114         self.report_warning(bug_reports_message(
2115             f'Ignoring subtitle tracks found in the {name} manifest; '
2116             'if any subtitle tracks are missing,'
2117         ), only_once=True)
2118
2119     def _extract_m3u8_formats(self, *args, **kwargs):
2120         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2121         if subs:
2122             self._report_ignoring_subs('HLS')
2123         return fmts
2124
2125     def _extract_m3u8_formats_and_subtitles(
2126             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2127             preference=None, quality=None, m3u8_id=None, note=None,
2128             errnote=None, fatal=True, live=False, data=None, headers={},
2129             query={}):
2130
2131         res = self._download_webpage_handle(
2132             m3u8_url, video_id,
2133             note='Downloading m3u8 information' if note is None else note,
2134             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2135             fatal=fatal, data=data, headers=headers, query=query)
2136
2137         if res is False:
2138             return [], {}
2139
2140         m3u8_doc, urlh = res
2141         m3u8_url = urlh.geturl()
2142
2143         return self._parse_m3u8_formats_and_subtitles(
2144             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2145             preference=preference, quality=quality, m3u8_id=m3u8_id,
2146             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2147             headers=headers, query=query, video_id=video_id)
2148
2149     def _parse_m3u8_formats_and_subtitles(
2150             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2151             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2152             errnote=None, fatal=True, data=None, headers={}, query={},
2153             video_id=None):
2154         formats, subtitles = [], {}
2155
2156         has_drm = re.search('|'.join([
2157             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2158             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2159         ]), m3u8_doc)
2160
2161         def format_url(url):
2162             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2163
2164         if self.get_param('hls_split_discontinuity', False):
2165             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2166                 if not m3u8_doc:
2167                     if not manifest_url:
2168                         return []
2169                     m3u8_doc = self._download_webpage(
2170                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2171                         note=False, errnote='Failed to download m3u8 playlist information')
2172                     if m3u8_doc is False:
2173                         return []
2174                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2175
2176         else:
2177             def _extract_m3u8_playlist_indices(*args, **kwargs):
2178                 return [None]
2179
2180         # References:
2181         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2182         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2183         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2184
2185         # We should try extracting formats only from master playlists [1, 4.3.4],
2186         # i.e. playlists that describe available qualities. On the other hand
2187         # media playlists [1, 4.3.3] should be returned as is since they contain
2188         # just the media without qualities renditions.
2189         # Fortunately, master playlist can be easily distinguished from media
2190         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2191         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2192         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2193         # media playlist and MUST NOT appear in master playlist thus we can
2194         # clearly detect media playlist with this criterion.
2195
2196         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2197             formats = [{
2198                 'format_id': join_nonempty(m3u8_id, idx),
2199                 'format_index': idx,
2200                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2201                 'ext': ext,
2202                 'protocol': entry_protocol,
2203                 'preference': preference,
2204                 'quality': quality,
2205                 'has_drm': has_drm,
2206             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2207
2208             return formats, subtitles
2209
2210         groups = {}
2211         last_stream_inf = {}
2212
2213         def extract_media(x_media_line):
2214             media = parse_m3u8_attributes(x_media_line)
2215             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2216             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2217             if not (media_type and group_id and name):
2218                 return
2219             groups.setdefault(group_id, []).append(media)
2220             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2221             if media_type == 'SUBTITLES':
2222                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2223                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2224                 # However, lack of URI has been spotted in the wild.
2225                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2226                 if not media.get('URI'):
2227                     return
2228                 url = format_url(media['URI'])
2229                 sub_info = {
2230                     'url': url,
2231                     'ext': determine_ext(url),
2232                 }
2233                 if sub_info['ext'] == 'm3u8':
2234                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2235                     # files may contain is WebVTT:
2236                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2237                     sub_info['ext'] = 'vtt'
2238                     sub_info['protocol'] = 'm3u8_native'
2239                 lang = media.get('LANGUAGE') or 'und'
2240                 subtitles.setdefault(lang, []).append(sub_info)
2241             if media_type not in ('VIDEO', 'AUDIO'):
2242                 return
2243             media_url = media.get('URI')
2244             if media_url:
2245                 manifest_url = format_url(media_url)
2246                 formats.extend({
2247                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2248                     'format_note': name,
2249                     'format_index': idx,
2250                     'url': manifest_url,
2251                     'manifest_url': m3u8_url,
2252                     'language': media.get('LANGUAGE'),
2253                     'ext': ext,
2254                     'protocol': entry_protocol,
2255                     'preference': preference,
2256                     'quality': quality,
2257                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2258                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2259
2260         def build_stream_name():
2261             # Despite specification does not mention NAME attribute for
2262             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2263             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2264             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2265             stream_name = last_stream_inf.get('NAME')
2266             if stream_name:
2267                 return stream_name
2268             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2269             # from corresponding rendition group
2270             stream_group_id = last_stream_inf.get('VIDEO')
2271             if not stream_group_id:
2272                 return
2273             stream_group = groups.get(stream_group_id)
2274             if not stream_group:
2275                 return stream_group_id
2276             rendition = stream_group[0]
2277             return rendition.get('NAME') or stream_group_id
2278
2279         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2280         # chance to detect video only formats when EXT-X-STREAM-INF tags
2281         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2282         for line in m3u8_doc.splitlines():
2283             if line.startswith('#EXT-X-MEDIA:'):
2284                 extract_media(line)
2285
2286         for line in m3u8_doc.splitlines():
2287             if line.startswith('#EXT-X-STREAM-INF:'):
2288                 last_stream_inf = parse_m3u8_attributes(line)
2289             elif line.startswith('#') or not line.strip():
2290                 continue
2291             else:
2292                 tbr = float_or_none(
2293                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2294                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2295                 manifest_url = format_url(line.strip())
2296
2297                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2298                     format_id = [m3u8_id, None, idx]
2299                     # Bandwidth of live streams may differ over time thus making
2300                     # format_id unpredictable. So it's better to keep provided
2301                     # format_id intact.
2302                     if not live:
2303                         stream_name = build_stream_name()
2304                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2305                     f = {
2306                         'format_id': join_nonempty(*format_id),
2307                         'format_index': idx,
2308                         'url': manifest_url,
2309                         'manifest_url': m3u8_url,
2310                         'tbr': tbr,
2311                         'ext': ext,
2312                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2313                         'protocol': entry_protocol,
2314                         'preference': preference,
2315                         'quality': quality,
2316                     }
2317                     resolution = last_stream_inf.get('RESOLUTION')
2318                     if resolution:
2319                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2320                         if mobj:
2321                             f['width'] = int(mobj.group('width'))
2322                             f['height'] = int(mobj.group('height'))
2323                     # Unified Streaming Platform
2324                     mobj = re.search(
2325                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2326                     if mobj:
2327                         abr, vbr = mobj.groups()
2328                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2329                         f.update({
2330                             'vbr': vbr,
2331                             'abr': abr,
2332                         })
2333                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2334                     f.update(codecs)
2335                     audio_group_id = last_stream_inf.get('AUDIO')
2336                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2337                     # references a rendition group MUST have a CODECS attribute.
2338                     # However, this is not always respected, for example, [2]
2339                     # contains EXT-X-STREAM-INF tag which references AUDIO
2340                     # rendition group but does not have CODECS and despite
2341                     # referencing an audio group it represents a complete
2342                     # (with audio and video) format. So, for such cases we will
2343                     # ignore references to rendition groups and treat them
2344                     # as complete formats.
2345                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2346                         audio_group = groups.get(audio_group_id)
2347                         if audio_group and audio_group[0].get('URI'):
2348                             # TODO: update acodec for audio only formats with
2349                             # the same GROUP-ID
2350                             f['acodec'] = 'none'
2351                     if not f.get('ext'):
2352                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2353                     formats.append(f)
2354
2355                     # for DailyMotion
2356                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2357                     if progressive_uri:
2358                         http_f = f.copy()
2359                         del http_f['manifest_url']
2360                         http_f.update({
2361                             'format_id': f['format_id'].replace('hls-', 'http-'),
2362                             'protocol': 'http',
2363                             'url': progressive_uri,
2364                         })
2365                         formats.append(http_f)
2366
2367                 last_stream_inf = {}
2368         return formats, subtitles
2369
2370     def _extract_m3u8_vod_duration(
2371             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2372
2373         m3u8_vod = self._download_webpage(
2374             m3u8_vod_url, video_id,
2375             note='Downloading m3u8 VOD manifest' if note is None else note,
2376             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2377             fatal=False, data=data, headers=headers, query=query)
2378
2379         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2380
2381     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2382         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2383             return None
2384
2385         return int(sum(
2386             float(line[len('#EXTINF:'):].split(',')[0])
2387             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2388
2389     @staticmethod
2390     def _xpath_ns(path, namespace=None):
2391         if not namespace:
2392             return path
2393         out = []
2394         for c in path.split('/'):
2395             if not c or c == '.':
2396                 out.append(c)
2397             else:
2398                 out.append('{%s}%s' % (namespace, c))
2399         return '/'.join(out)
2400
2401     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2402         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2403         if res is False:
2404             assert not fatal
2405             return [], {}
2406
2407         smil, urlh = res
2408         smil_url = urlh.geturl()
2409
2410         namespace = self._parse_smil_namespace(smil)
2411
2412         fmts = self._parse_smil_formats(
2413             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2414         subs = self._parse_smil_subtitles(
2415             smil, namespace=namespace)
2416
2417         return fmts, subs
2418
2419     def _extract_smil_formats(self, *args, **kwargs):
2420         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2421         if subs:
2422             self._report_ignoring_subs('SMIL')
2423         return fmts
2424
2425     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2426         res = self._download_smil(smil_url, video_id, fatal=fatal)
2427         if res is False:
2428             return {}
2429
2430         smil, urlh = res
2431         smil_url = urlh.geturl()
2432
2433         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2434
2435     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2436         return self._download_xml_handle(
2437             smil_url, video_id, 'Downloading SMIL file',
2438             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2439
2440     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2441         namespace = self._parse_smil_namespace(smil)
2442
2443         formats = self._parse_smil_formats(
2444             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2445         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2446
2447         video_id = os.path.splitext(url_basename(smil_url))[0]
2448         title = None
2449         description = None
2450         upload_date = None
2451         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2452             name = meta.attrib.get('name')
2453             content = meta.attrib.get('content')
2454             if not name or not content:
2455                 continue
2456             if not title and name == 'title':
2457                 title = content
2458             elif not description and name in ('description', 'abstract'):
2459                 description = content
2460             elif not upload_date and name == 'date':
2461                 upload_date = unified_strdate(content)
2462
2463         thumbnails = [{
2464             'id': image.get('type'),
2465             'url': image.get('src'),
2466             'width': int_or_none(image.get('width')),
2467             'height': int_or_none(image.get('height')),
2468         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2469
2470         return {
2471             'id': video_id,
2472             'title': title or video_id,
2473             'description': description,
2474             'upload_date': upload_date,
2475             'thumbnails': thumbnails,
2476             'formats': formats,
2477             'subtitles': subtitles,
2478         }
2479
2480     def _parse_smil_namespace(self, smil):
2481         return self._search_regex(
2482             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2483
2484     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2485         base = smil_url
2486         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2487             b = meta.get('base') or meta.get('httpBase')
2488             if b:
2489                 base = b
2490                 break
2491
2492         formats = []
2493         rtmp_count = 0
2494         http_count = 0
2495         m3u8_count = 0
2496         imgs_count = 0
2497
2498         srcs = set()
2499         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2500         for medium in media:
2501             src = medium.get('src')
2502             if not src or src in srcs:
2503                 continue
2504             srcs.add(src)
2505
2506             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2507             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2508             width = int_or_none(medium.get('width'))
2509             height = int_or_none(medium.get('height'))
2510             proto = medium.get('proto')
2511             ext = medium.get('ext')
2512             src_ext = determine_ext(src)
2513             streamer = medium.get('streamer') or base
2514
2515             if proto == 'rtmp' or streamer.startswith('rtmp'):
2516                 rtmp_count += 1
2517                 formats.append({
2518                     'url': streamer,
2519                     'play_path': src,
2520                     'ext': 'flv',
2521                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2522                     'tbr': bitrate,
2523                     'filesize': filesize,
2524                     'width': width,
2525                     'height': height,
2526                 })
2527                 if transform_rtmp_url:
2528                     streamer, src = transform_rtmp_url(streamer, src)
2529                     formats[-1].update({
2530                         'url': streamer,
2531                         'play_path': src,
2532                     })
2533                 continue
2534
2535             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2536             src_url = src_url.strip()
2537
2538             if proto == 'm3u8' or src_ext == 'm3u8':
2539                 m3u8_formats = self._extract_m3u8_formats(
2540                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2541                 if len(m3u8_formats) == 1:
2542                     m3u8_count += 1
2543                     m3u8_formats[0].update({
2544                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2545                         'tbr': bitrate,
2546                         'width': width,
2547                         'height': height,
2548                     })
2549                 formats.extend(m3u8_formats)
2550             elif src_ext == 'f4m':
2551                 f4m_url = src_url
2552                 if not f4m_params:
2553                     f4m_params = {
2554                         'hdcore': '3.2.0',
2555                         'plugin': 'flowplayer-3.2.0.1',
2556                     }
2557                 f4m_url += '&' if '?' in f4m_url else '?'
2558                 f4m_url += urllib.parse.urlencode(f4m_params)
2559                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2560             elif src_ext == 'mpd':
2561                 formats.extend(self._extract_mpd_formats(
2562                     src_url, video_id, mpd_id='dash', fatal=False))
2563             elif re.search(r'\.ism/[Mm]anifest', src_url):
2564                 formats.extend(self._extract_ism_formats(
2565                     src_url, video_id, ism_id='mss', fatal=False))
2566             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2567                 http_count += 1
2568                 formats.append({
2569                     'url': src_url,
2570                     'ext': ext or src_ext or 'flv',
2571                     'format_id': 'http-%d' % (bitrate or http_count),
2572                     'tbr': bitrate,
2573                     'filesize': filesize,
2574                     'width': width,
2575                     'height': height,
2576                 })
2577
2578         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2579             src = medium.get('src')
2580             if not src or src in srcs:
2581                 continue
2582             srcs.add(src)
2583
2584             imgs_count += 1
2585             formats.append({
2586                 'format_id': 'imagestream-%d' % (imgs_count),
2587                 'url': src,
2588                 'ext': mimetype2ext(medium.get('type')),
2589                 'acodec': 'none',
2590                 'vcodec': 'none',
2591                 'width': int_or_none(medium.get('width')),
2592                 'height': int_or_none(medium.get('height')),
2593                 'format_note': 'SMIL storyboards',
2594             })
2595
2596         return formats
2597
2598     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2599         urls = []
2600         subtitles = {}
2601         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2602             src = textstream.get('src')
2603             if not src or src in urls:
2604                 continue
2605             urls.append(src)
2606             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2607             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2608             subtitles.setdefault(lang, []).append({
2609                 'url': src,
2610                 'ext': ext,
2611             })
2612         return subtitles
2613
2614     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2615         res = self._download_xml_handle(
2616             xspf_url, playlist_id, 'Downloading xpsf playlist',
2617             'Unable to download xspf manifest', fatal=fatal)
2618         if res is False:
2619             return []
2620
2621         xspf, urlh = res
2622         xspf_url = urlh.geturl()
2623
2624         return self._parse_xspf(
2625             xspf, playlist_id, xspf_url=xspf_url,
2626             xspf_base_url=base_url(xspf_url))
2627
2628     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2629         NS_MAP = {
2630             'xspf': 'http://xspf.org/ns/0/',
2631             's1': 'http://static.streamone.nl/player/ns/0',
2632         }
2633
2634         entries = []
2635         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2636             title = xpath_text(
2637                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2638             description = xpath_text(
2639                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2640             thumbnail = xpath_text(
2641                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2642             duration = float_or_none(
2643                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2644
2645             formats = []
2646             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2647                 format_url = urljoin(xspf_base_url, location.text)
2648                 if not format_url:
2649                     continue
2650                 formats.append({
2651                     'url': format_url,
2652                     'manifest_url': xspf_url,
2653                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2654                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2655                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2656                 })
2657             self._sort_formats(formats)
2658
2659             entries.append({
2660                 'id': playlist_id,
2661                 'title': title,
2662                 'description': description,
2663                 'thumbnail': thumbnail,
2664                 'duration': duration,
2665                 'formats': formats,
2666             })
2667         return entries
2668
2669     def _extract_mpd_formats(self, *args, **kwargs):
2670         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2671         if subs:
2672             self._report_ignoring_subs('DASH')
2673         return fmts
2674
2675     def _extract_mpd_formats_and_subtitles(
2676             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2677             fatal=True, data=None, headers={}, query={}):
2678         res = self._download_xml_handle(
2679             mpd_url, video_id,
2680             note='Downloading MPD manifest' if note is None else note,
2681             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2682             fatal=fatal, data=data, headers=headers, query=query)
2683         if res is False:
2684             return [], {}
2685         mpd_doc, urlh = res
2686         if mpd_doc is None:
2687             return [], {}
2688
2689         # We could have been redirected to a new url when we retrieved our mpd file.
2690         mpd_url = urlh.geturl()
2691         mpd_base_url = base_url(mpd_url)
2692
2693         return self._parse_mpd_formats_and_subtitles(
2694             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2695
2696     def _parse_mpd_formats(self, *args, **kwargs):
2697         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2698         if subs:
2699             self._report_ignoring_subs('DASH')
2700         return fmts
2701
2702     def _parse_mpd_formats_and_subtitles(
2703             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2704         """
2705         Parse formats from MPD manifest.
2706         References:
2707          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2708             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2709          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2710         """
2711         if not self.get_param('dynamic_mpd', True):
2712             if mpd_doc.get('type') == 'dynamic':
2713                 return [], {}
2714
2715         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2716
2717         def _add_ns(path):
2718             return self._xpath_ns(path, namespace)
2719
2720         def is_drm_protected(element):
2721             return element.find(_add_ns('ContentProtection')) is not None
2722
2723         def extract_multisegment_info(element, ms_parent_info):
2724             ms_info = ms_parent_info.copy()
2725
2726             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2727             # common attributes and elements.  We will only extract relevant
2728             # for us.
2729             def extract_common(source):
2730                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2731                 if segment_timeline is not None:
2732                     s_e = segment_timeline.findall(_add_ns('S'))
2733                     if s_e:
2734                         ms_info['total_number'] = 0
2735                         ms_info['s'] = []
2736                         for s in s_e:
2737                             r = int(s.get('r', 0))
2738                             ms_info['total_number'] += 1 + r
2739                             ms_info['s'].append({
2740                                 't': int(s.get('t', 0)),
2741                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2742                                 'd': int(s.attrib['d']),
2743                                 'r': r,
2744                             })
2745                 start_number = source.get('startNumber')
2746                 if start_number:
2747                     ms_info['start_number'] = int(start_number)
2748                 timescale = source.get('timescale')
2749                 if timescale:
2750                     ms_info['timescale'] = int(timescale)
2751                 segment_duration = source.get('duration')
2752                 if segment_duration:
2753                     ms_info['segment_duration'] = float(segment_duration)
2754
2755             def extract_Initialization(source):
2756                 initialization = source.find(_add_ns('Initialization'))
2757                 if initialization is not None:
2758                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2759
2760             segment_list = element.find(_add_ns('SegmentList'))
2761             if segment_list is not None:
2762                 extract_common(segment_list)
2763                 extract_Initialization(segment_list)
2764                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2765                 if segment_urls_e:
2766                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2767             else:
2768                 segment_template = element.find(_add_ns('SegmentTemplate'))
2769                 if segment_template is not None:
2770                     extract_common(segment_template)
2771                     media = segment_template.get('media')
2772                     if media:
2773                         ms_info['media'] = media
2774                     initialization = segment_template.get('initialization')
2775                     if initialization:
2776                         ms_info['initialization'] = initialization
2777                     else:
2778                         extract_Initialization(segment_template)
2779             return ms_info
2780
2781         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2782         formats, subtitles = [], {}
2783         stream_numbers = collections.defaultdict(int)
2784         for period in mpd_doc.findall(_add_ns('Period')):
2785             period_duration = parse_duration(period.get('duration')) or mpd_duration
2786             period_ms_info = extract_multisegment_info(period, {
2787                 'start_number': 1,
2788                 'timescale': 1,
2789             })
2790             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2791                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2792                 for representation in adaptation_set.findall(_add_ns('Representation')):
2793                     representation_attrib = adaptation_set.attrib.copy()
2794                     representation_attrib.update(representation.attrib)
2795                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2796                     mime_type = representation_attrib['mimeType']
2797                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2798
2799                     codec_str = representation_attrib.get('codecs', '')
2800                     # Some kind of binary subtitle found in some youtube livestreams
2801                     if mime_type == 'application/x-rawcc':
2802                         codecs = {'scodec': codec_str}
2803                     else:
2804                         codecs = parse_codecs(codec_str)
2805                     if content_type not in ('video', 'audio', 'text'):
2806                         if mime_type == 'image/jpeg':
2807                             content_type = mime_type
2808                         elif codecs.get('vcodec', 'none') != 'none':
2809                             content_type = 'video'
2810                         elif codecs.get('acodec', 'none') != 'none':
2811                             content_type = 'audio'
2812                         elif codecs.get('scodec', 'none') != 'none':
2813                             content_type = 'text'
2814                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2815                             content_type = 'text'
2816                         else:
2817                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2818                             continue
2819
2820                     base_url = ''
2821                     for element in (representation, adaptation_set, period, mpd_doc):
2822                         base_url_e = element.find(_add_ns('BaseURL'))
2823                         if base_url_e is not None:
2824                             base_url = base_url_e.text + base_url
2825                             if re.match(r'^https?://', base_url):
2826                                 break
2827                     if mpd_base_url and base_url.startswith('/'):
2828                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2829                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2830                         if not mpd_base_url.endswith('/'):
2831                             mpd_base_url += '/'
2832                         base_url = mpd_base_url + base_url
2833                     representation_id = representation_attrib.get('id')
2834                     lang = representation_attrib.get('lang')
2835                     url_el = representation.find(_add_ns('BaseURL'))
2836                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2837                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2838                     if representation_id is not None:
2839                         format_id = representation_id
2840                     else:
2841                         format_id = content_type
2842                     if mpd_id:
2843                         format_id = mpd_id + '-' + format_id
2844                     if content_type in ('video', 'audio'):
2845                         f = {
2846                             'format_id': format_id,
2847                             'manifest_url': mpd_url,
2848                             'ext': mimetype2ext(mime_type),
2849                             'width': int_or_none(representation_attrib.get('width')),
2850                             'height': int_or_none(representation_attrib.get('height')),
2851                             'tbr': float_or_none(bandwidth, 1000),
2852                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2853                             'fps': int_or_none(representation_attrib.get('frameRate')),
2854                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2855                             'format_note': 'DASH %s' % content_type,
2856                             'filesize': filesize,
2857                             'container': mimetype2ext(mime_type) + '_dash',
2858                             **codecs
2859                         }
2860                     elif content_type == 'text':
2861                         f = {
2862                             'ext': mimetype2ext(mime_type),
2863                             'manifest_url': mpd_url,
2864                             'filesize': filesize,
2865                         }
2866                     elif content_type == 'image/jpeg':
2867                         # See test case in VikiIE
2868                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2869                         f = {
2870                             'format_id': format_id,
2871                             'ext': 'mhtml',
2872                             'manifest_url': mpd_url,
2873                             'format_note': 'DASH storyboards (jpeg)',
2874                             'acodec': 'none',
2875                             'vcodec': 'none',
2876                         }
2877                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2878                         f['has_drm'] = True
2879                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2880
2881                     def prepare_template(template_name, identifiers):
2882                         tmpl = representation_ms_info[template_name]
2883                         # First of, % characters outside $...$ templates
2884                         # must be escaped by doubling for proper processing
2885                         # by % operator string formatting used further (see
2886                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2887                         t = ''
2888                         in_template = False
2889                         for c in tmpl:
2890                             t += c
2891                             if c == '$':
2892                                 in_template = not in_template
2893                             elif c == '%' and not in_template:
2894                                 t += c
2895                         # Next, $...$ templates are translated to their
2896                         # %(...) counterparts to be used with % operator
2897                         if representation_id is not None:
2898                             t = t.replace('$RepresentationID$', representation_id)
2899                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2900                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2901                         t.replace('$$', '$')
2902                         return t
2903
2904                     # @initialization is a regular template like @media one
2905                     # so it should be handled just the same way (see
2906                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2907                     if 'initialization' in representation_ms_info:
2908                         initialization_template = prepare_template(
2909                             'initialization',
2910                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2911                             # $Time$ shall not be included for @initialization thus
2912                             # only $Bandwidth$ remains
2913                             ('Bandwidth', ))
2914                         representation_ms_info['initialization_url'] = initialization_template % {
2915                             'Bandwidth': bandwidth,
2916                         }
2917
2918                     def location_key(location):
2919                         return 'url' if re.match(r'^https?://', location) else 'path'
2920
2921                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2922
2923                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2924                         media_location_key = location_key(media_template)
2925
2926                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2927                         # can't be used at the same time
2928                         if '%(Number' in media_template and 's' not in representation_ms_info:
2929                             segment_duration = None
2930                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2931                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2932                                 representation_ms_info['total_number'] = int(math.ceil(
2933                                     float_or_none(period_duration, segment_duration, default=0)))
2934                             representation_ms_info['fragments'] = [{
2935                                 media_location_key: media_template % {
2936                                     'Number': segment_number,
2937                                     'Bandwidth': bandwidth,
2938                                 },
2939                                 'duration': segment_duration,
2940                             } for segment_number in range(
2941                                 representation_ms_info['start_number'],
2942                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2943                         else:
2944                             # $Number*$ or $Time$ in media template with S list available
2945                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2946                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2947                             representation_ms_info['fragments'] = []
2948                             segment_time = 0
2949                             segment_d = None
2950                             segment_number = representation_ms_info['start_number']
2951
2952                             def add_segment_url():
2953                                 segment_url = media_template % {
2954                                     'Time': segment_time,
2955                                     'Bandwidth': bandwidth,
2956                                     'Number': segment_number,
2957                                 }
2958                                 representation_ms_info['fragments'].append({
2959                                     media_location_key: segment_url,
2960                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2961                                 })
2962
2963                             for num, s in enumerate(representation_ms_info['s']):
2964                                 segment_time = s.get('t') or segment_time
2965                                 segment_d = s['d']
2966                                 add_segment_url()
2967                                 segment_number += 1
2968                                 for r in range(s.get('r', 0)):
2969                                     segment_time += segment_d
2970                                     add_segment_url()
2971                                     segment_number += 1
2972                                 segment_time += segment_d
2973                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2974                         # No media template
2975                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2976                         # or any YouTube dashsegments video
2977                         fragments = []
2978                         segment_index = 0
2979                         timescale = representation_ms_info['timescale']
2980                         for s in representation_ms_info['s']:
2981                             duration = float_or_none(s['d'], timescale)
2982                             for r in range(s.get('r', 0) + 1):
2983                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2984                                 fragments.append({
2985                                     location_key(segment_uri): segment_uri,
2986                                     'duration': duration,
2987                                 })
2988                                 segment_index += 1
2989                         representation_ms_info['fragments'] = fragments
2990                     elif 'segment_urls' in representation_ms_info:
2991                         # Segment URLs with no SegmentTimeline
2992                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2993                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2994                         fragments = []
2995                         segment_duration = float_or_none(
2996                             representation_ms_info['segment_duration'],
2997                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2998                         for segment_url in representation_ms_info['segment_urls']:
2999                             fragment = {
3000                                 location_key(segment_url): segment_url,
3001                             }
3002                             if segment_duration:
3003                                 fragment['duration'] = segment_duration
3004                             fragments.append(fragment)
3005                         representation_ms_info['fragments'] = fragments
3006                     # If there is a fragments key available then we correctly recognized fragmented media.
3007                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3008                     # assumption is not necessarily correct since we may simply have no support for
3009                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3010                     if 'fragments' in representation_ms_info:
3011                         f.update({
3012                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3013                             'url': mpd_url or base_url,
3014                             'fragment_base_url': base_url,
3015                             'fragments': [],
3016                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3017                         })
3018                         if 'initialization_url' in representation_ms_info:
3019                             initialization_url = representation_ms_info['initialization_url']
3020                             if not f.get('url'):
3021                                 f['url'] = initialization_url
3022                             f['fragments'].append({location_key(initialization_url): initialization_url})
3023                         f['fragments'].extend(representation_ms_info['fragments'])
3024                         if not period_duration:
3025                             period_duration = try_get(
3026                                 representation_ms_info,
3027                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3028                     else:
3029                         # Assuming direct URL to unfragmented media.
3030                         f['url'] = base_url
3031                     if content_type in ('video', 'audio', 'image/jpeg'):
3032                         f['manifest_stream_number'] = stream_numbers[f['url']]
3033                         stream_numbers[f['url']] += 1
3034                         formats.append(f)
3035                     elif content_type == 'text':
3036                         subtitles.setdefault(lang or 'und', []).append(f)
3037
3038         return formats, subtitles
3039
3040     def _extract_ism_formats(self, *args, **kwargs):
3041         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3042         if subs:
3043             self._report_ignoring_subs('ISM')
3044         return fmts
3045
3046     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3047         res = self._download_xml_handle(
3048             ism_url, video_id,
3049             note='Downloading ISM manifest' if note is None else note,
3050             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3051             fatal=fatal, data=data, headers=headers, query=query)
3052         if res is False:
3053             return [], {}
3054         ism_doc, urlh = res
3055         if ism_doc is None:
3056             return [], {}
3057
3058         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3059
3060     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3061         """
3062         Parse formats from ISM manifest.
3063         References:
3064          1. [MS-SSTR]: Smooth Streaming Protocol,
3065             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3066         """
3067         if ism_doc.get('IsLive') == 'TRUE':
3068             return [], {}
3069
3070         duration = int(ism_doc.attrib['Duration'])
3071         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3072
3073         formats = []
3074         subtitles = {}
3075         for stream in ism_doc.findall('StreamIndex'):
3076             stream_type = stream.get('Type')
3077             if stream_type not in ('video', 'audio', 'text'):
3078                 continue
3079             url_pattern = stream.attrib['Url']
3080             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3081             stream_name = stream.get('Name')
3082             stream_language = stream.get('Language', 'und')
3083             for track in stream.findall('QualityLevel'):
3084                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3085                 # TODO: add support for WVC1 and WMAP
3086                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3087                     self.report_warning('%s is not a supported codec' % fourcc)
3088                     continue
3089                 tbr = int(track.attrib['Bitrate']) // 1000
3090                 # [1] does not mention Width and Height attributes. However,
3091                 # they're often present while MaxWidth and MaxHeight are
3092                 # missing, so should be used as fallbacks
3093                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3094                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3095                 sampling_rate = int_or_none(track.get('SamplingRate'))
3096
3097                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3098                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3099
3100                 fragments = []
3101                 fragment_ctx = {
3102                     'time': 0,
3103                 }
3104                 stream_fragments = stream.findall('c')
3105                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3106                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3107                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3108                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3109                     if not fragment_ctx['duration']:
3110                         try:
3111                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3112                         except IndexError:
3113                             next_fragment_time = duration
3114                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3115                     for _ in range(fragment_repeat):
3116                         fragments.append({
3117                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3118                             'duration': fragment_ctx['duration'] / stream_timescale,
3119                         })
3120                         fragment_ctx['time'] += fragment_ctx['duration']
3121
3122                 if stream_type == 'text':
3123                     subtitles.setdefault(stream_language, []).append({
3124                         'ext': 'ismt',
3125                         'protocol': 'ism',
3126                         'url': ism_url,
3127                         'manifest_url': ism_url,
3128                         'fragments': fragments,
3129                         '_download_params': {
3130                             'stream_type': stream_type,
3131                             'duration': duration,
3132                             'timescale': stream_timescale,
3133                             'fourcc': fourcc,
3134                             'language': stream_language,
3135                             'codec_private_data': track.get('CodecPrivateData'),
3136                         }
3137                     })
3138                 elif stream_type in ('video', 'audio'):
3139                     formats.append({
3140                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3141                         'url': ism_url,
3142                         'manifest_url': ism_url,
3143                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3144                         'width': width,
3145                         'height': height,
3146                         'tbr': tbr,
3147                         'asr': sampling_rate,
3148                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3149                         'acodec': 'none' if stream_type == 'video' else fourcc,
3150                         'protocol': 'ism',
3151                         'fragments': fragments,
3152                         'has_drm': ism_doc.find('Protection') is not None,
3153                         '_download_params': {
3154                             'stream_type': stream_type,
3155                             'duration': duration,
3156                             'timescale': stream_timescale,
3157                             'width': width or 0,
3158                             'height': height or 0,
3159                             'fourcc': fourcc,
3160                             'language': stream_language,
3161                             'codec_private_data': track.get('CodecPrivateData'),
3162                             'sampling_rate': sampling_rate,
3163                             'channels': int_or_none(track.get('Channels', 2)),
3164                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3165                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3166                         },
3167                     })
3168         return formats, subtitles
3169
3170     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3171         def absolute_url(item_url):
3172             return urljoin(base_url, item_url)
3173
3174         def parse_content_type(content_type):
3175             if not content_type:
3176                 return {}
3177             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3178             if ctr:
3179                 mimetype, codecs = ctr.groups()
3180                 f = parse_codecs(codecs)
3181                 f['ext'] = mimetype2ext(mimetype)
3182                 return f
3183             return {}
3184
3185         def _media_formats(src, cur_media_type, type_info=None):
3186             type_info = type_info or {}
3187             full_url = absolute_url(src)
3188             ext = type_info.get('ext') or determine_ext(full_url)
3189             if ext == 'm3u8':
3190                 is_plain_url = False
3191                 formats = self._extract_m3u8_formats(
3192                     full_url, video_id, ext='mp4',
3193                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3194                     preference=preference, quality=quality, fatal=False)
3195             elif ext == 'mpd':
3196                 is_plain_url = False
3197                 formats = self._extract_mpd_formats(
3198                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3199             else:
3200                 is_plain_url = True
3201                 formats = [{
3202                     'url': full_url,
3203                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3204                     'ext': ext,
3205                 }]
3206             return is_plain_url, formats
3207
3208         entries = []
3209         # amp-video and amp-audio are very similar to their HTML5 counterparts
3210         # so we wll include them right here (see
3211         # https://www.ampproject.org/docs/reference/components/amp-video)
3212         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3213         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3214         media_tags = [(media_tag, media_tag_name, media_type, '')
3215                       for media_tag, media_tag_name, media_type
3216                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3217         media_tags.extend(re.findall(
3218             # We only allow video|audio followed by a whitespace or '>'.
3219             # Allowing more characters may end up in significant slow down (see
3220             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3221             # http://www.porntrex.com/maps/videositemap.xml).
3222             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3223         for media_tag, _, media_type, media_content in media_tags:
3224             media_info = {
3225                 'formats': [],
3226                 'subtitles': {},
3227             }
3228             media_attributes = extract_attributes(media_tag)
3229             src = strip_or_none(media_attributes.get('src'))
3230             if src:
3231                 f = parse_content_type(media_attributes.get('type'))
3232                 _, formats = _media_formats(src, media_type, f)
3233                 media_info['formats'].extend(formats)
3234             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3235             if media_content:
3236                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3237                     s_attr = extract_attributes(source_tag)
3238                     # data-video-src and data-src are non standard but seen
3239                     # several times in the wild
3240                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3241                     if not src:
3242                         continue
3243                     f = parse_content_type(s_attr.get('type'))
3244                     is_plain_url, formats = _media_formats(src, media_type, f)
3245                     if is_plain_url:
3246                         # width, height, res, label and title attributes are
3247                         # all not standard but seen several times in the wild
3248                         labels = [
3249                             s_attr.get(lbl)
3250                             for lbl in ('label', 'title')
3251                             if str_or_none(s_attr.get(lbl))
3252                         ]
3253                         width = int_or_none(s_attr.get('width'))
3254                         height = (int_or_none(s_attr.get('height'))
3255                                   or int_or_none(s_attr.get('res')))
3256                         if not width or not height:
3257                             for lbl in labels:
3258                                 resolution = parse_resolution(lbl)
3259                                 if not resolution:
3260                                     continue
3261                                 width = width or resolution.get('width')
3262                                 height = height or resolution.get('height')
3263                         for lbl in labels:
3264                             tbr = parse_bitrate(lbl)
3265                             if tbr:
3266                                 break
3267                         else:
3268                             tbr = None
3269                         f.update({
3270                             'width': width,
3271                             'height': height,
3272                             'tbr': tbr,
3273                             'format_id': s_attr.get('label') or s_attr.get('title'),
3274                         })
3275                         f.update(formats[0])
3276                         media_info['formats'].append(f)
3277                     else:
3278                         media_info['formats'].extend(formats)
3279                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3280                     track_attributes = extract_attributes(track_tag)
3281                     kind = track_attributes.get('kind')
3282                     if not kind or kind in ('subtitles', 'captions'):
3283                         src = strip_or_none(track_attributes.get('src'))
3284                         if not src:
3285                             continue
3286                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3287                         media_info['subtitles'].setdefault(lang, []).append({
3288                             'url': absolute_url(src),
3289                         })
3290             for f in media_info['formats']:
3291                 f.setdefault('http_headers', {})['Referer'] = base_url
3292             if media_info['formats'] or media_info['subtitles']:
3293                 entries.append(media_info)
3294         return entries
3295
3296     def _extract_akamai_formats(self, *args, **kwargs):
3297         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3298         if subs:
3299             self._report_ignoring_subs('akamai')
3300         return fmts
3301
3302     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3303         signed = 'hdnea=' in manifest_url
3304         if not signed:
3305             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3306             manifest_url = re.sub(
3307                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3308                 '', manifest_url).strip('?')
3309
3310         formats = []
3311         subtitles = {}
3312
3313         hdcore_sign = 'hdcore=3.7.0'
3314         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3315         hds_host = hosts.get('hds')
3316         if hds_host:
3317             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3318         if 'hdcore=' not in f4m_url:
3319             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3320         f4m_formats = self._extract_f4m_formats(
3321             f4m_url, video_id, f4m_id='hds', fatal=False)
3322         for entry in f4m_formats:
3323             entry.update({'extra_param_to_segment_url': hdcore_sign})
3324         formats.extend(f4m_formats)
3325
3326         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3327         hls_host = hosts.get('hls')
3328         if hls_host:
3329             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3330         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3331             m3u8_url, video_id, 'mp4', 'm3u8_native',
3332             m3u8_id='hls', fatal=False)
3333         formats.extend(m3u8_formats)
3334         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3335
3336         http_host = hosts.get('http')
3337         if http_host and m3u8_formats and not signed:
3338             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3339             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3340             qualities_length = len(qualities)
3341             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3342                 i = 0
3343                 for f in m3u8_formats:
3344                     if f['vcodec'] != 'none':
3345                         for protocol in ('http', 'https'):
3346                             http_f = f.copy()
3347                             del http_f['manifest_url']
3348                             http_url = re.sub(
3349                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3350                             http_f.update({
3351                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3352                                 'url': http_url,
3353                                 'protocol': protocol,
3354                             })
3355                             formats.append(http_f)
3356                         i += 1
3357
3358         return formats, subtitles
3359
3360     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3361         query = urllib.parse.urlparse(url).query
3362         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3363         mobj = re.search(
3364             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3365         url_base = mobj.group('url')
3366         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3367         formats = []
3368
3369         def manifest_url(manifest):
3370             m_url = f'{http_base_url}/{manifest}'
3371             if query:
3372                 m_url += '?%s' % query
3373             return m_url
3374
3375         if 'm3u8' not in skip_protocols:
3376             formats.extend(self._extract_m3u8_formats(
3377                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3378                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3379         if 'f4m' not in skip_protocols:
3380             formats.extend(self._extract_f4m_formats(
3381                 manifest_url('manifest.f4m'),
3382                 video_id, f4m_id='hds', fatal=False))
3383         if 'dash' not in skip_protocols:
3384             formats.extend(self._extract_mpd_formats(
3385                 manifest_url('manifest.mpd'),
3386                 video_id, mpd_id='dash', fatal=False))
3387         if re.search(r'(?:/smil:|\.smil)', url_base):
3388             if 'smil' not in skip_protocols:
3389                 rtmp_formats = self._extract_smil_formats(
3390                     manifest_url('jwplayer.smil'),
3391                     video_id, fatal=False)
3392                 for rtmp_format in rtmp_formats:
3393                     rtsp_format = rtmp_format.copy()
3394                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3395                     del rtsp_format['play_path']
3396                     del rtsp_format['ext']
3397                     rtsp_format.update({
3398                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3399                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3400                         'protocol': 'rtsp',
3401                     })
3402                     formats.extend([rtmp_format, rtsp_format])
3403         else:
3404             for protocol in ('rtmp', 'rtsp'):
3405                 if protocol not in skip_protocols:
3406                     formats.append({
3407                         'url': f'{protocol}:{url_base}',
3408                         'format_id': protocol,
3409                         'protocol': protocol,
3410                     })
3411         return formats
3412
3413     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3414         mobj = re.search(
3415             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3416             webpage)
3417         if mobj:
3418             try:
3419                 jwplayer_data = self._parse_json(mobj.group('options'),
3420                                                  video_id=video_id,
3421                                                  transform_source=transform_source)
3422             except ExtractorError:
3423                 pass
3424             else:
3425                 if isinstance(jwplayer_data, dict):
3426                     return jwplayer_data
3427
3428     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3429         jwplayer_data = self._find_jwplayer_data(
3430             webpage, video_id, transform_source=js_to_json)
3431         return self._parse_jwplayer_data(
3432             jwplayer_data, video_id, *args, **kwargs)
3433
3434     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3435                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3436         # JWPlayer backward compatibility: flattened playlists
3437         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3438         if 'playlist' not in jwplayer_data:
3439             jwplayer_data = {'playlist': [jwplayer_data]}
3440
3441         entries = []
3442
3443         # JWPlayer backward compatibility: single playlist item
3444         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3445         if not isinstance(jwplayer_data['playlist'], list):
3446             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3447
3448         for video_data in jwplayer_data['playlist']:
3449             # JWPlayer backward compatibility: flattened sources
3450             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3451             if 'sources' not in video_data:
3452                 video_data['sources'] = [video_data]
3453
3454             this_video_id = video_id or video_data['mediaid']
3455
3456             formats = self._parse_jwplayer_formats(
3457                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3458                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3459
3460             subtitles = {}
3461             tracks = video_data.get('tracks')
3462             if tracks and isinstance(tracks, list):
3463                 for track in tracks:
3464                     if not isinstance(track, dict):
3465                         continue
3466                     track_kind = track.get('kind')
3467                     if not track_kind or not isinstance(track_kind, str):
3468                         continue
3469                     if track_kind.lower() not in ('captions', 'subtitles'):
3470                         continue
3471                     track_url = urljoin(base_url, track.get('file'))
3472                     if not track_url:
3473                         continue
3474                     subtitles.setdefault(track.get('label') or 'en', []).append({
3475                         'url': self._proto_relative_url(track_url)
3476                     })
3477
3478             entry = {
3479                 'id': this_video_id,
3480                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3481                 'description': clean_html(video_data.get('description')),
3482                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3483                 'timestamp': int_or_none(video_data.get('pubdate')),
3484                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3485                 'subtitles': subtitles,
3486             }
3487             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3488             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3489                 entry.update({
3490                     '_type': 'url_transparent',
3491                     'url': formats[0]['url'],
3492                 })
3493             else:
3494                 self._sort_formats(formats)
3495                 entry['formats'] = formats
3496             entries.append(entry)
3497         if len(entries) == 1:
3498             return entries[0]
3499         else:
3500             return self.playlist_result(entries)
3501
3502     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3503                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3504         urls = []
3505         formats = []
3506         for source in jwplayer_sources_data:
3507             if not isinstance(source, dict):
3508                 continue
3509             source_url = urljoin(
3510                 base_url, self._proto_relative_url(source.get('file')))
3511             if not source_url or source_url in urls:
3512                 continue
3513             urls.append(source_url)
3514             source_type = source.get('type') or ''
3515             ext = mimetype2ext(source_type) or determine_ext(source_url)
3516             if source_type == 'hls' or ext == 'm3u8':
3517                 formats.extend(self._extract_m3u8_formats(
3518                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3519                     m3u8_id=m3u8_id, fatal=False))
3520             elif source_type == 'dash' or ext == 'mpd':
3521                 formats.extend(self._extract_mpd_formats(
3522                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3523             elif ext == 'smil':
3524                 formats.extend(self._extract_smil_formats(
3525                     source_url, video_id, fatal=False))
3526             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3527             elif source_type.startswith('audio') or ext in (
3528                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3529                 formats.append({
3530                     'url': source_url,
3531                     'vcodec': 'none',
3532                     'ext': ext,
3533                 })
3534             else:
3535                 height = int_or_none(source.get('height'))
3536                 if height is None:
3537                     # Often no height is provided but there is a label in
3538                     # format like "1080p", "720p SD", or 1080.
3539                     height = int_or_none(self._search_regex(
3540                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3541                         'height', default=None))
3542                 a_format = {
3543                     'url': source_url,
3544                     'width': int_or_none(source.get('width')),
3545                     'height': height,
3546                     'tbr': int_or_none(source.get('bitrate')),
3547                     'ext': ext,
3548                 }
3549                 if source_url.startswith('rtmp'):
3550                     a_format['ext'] = 'flv'
3551                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3552                     # of jwplayer.flash.swf
3553                     rtmp_url_parts = re.split(
3554                         r'((?:mp4|mp3|flv):)', source_url, 1)
3555                     if len(rtmp_url_parts) == 3:
3556                         rtmp_url, prefix, play_path = rtmp_url_parts
3557                         a_format.update({
3558                             'url': rtmp_url,
3559                             'play_path': prefix + play_path,
3560                         })
3561                     if rtmp_params:
3562                         a_format.update(rtmp_params)
3563                 formats.append(a_format)
3564         return formats
3565
3566     def _live_title(self, name):
3567         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3568         return name
3569
3570     def _int(self, v, name, fatal=False, **kwargs):
3571         res = int_or_none(v, **kwargs)
3572         if res is None:
3573             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3574             if fatal:
3575                 raise ExtractorError(msg)
3576             else:
3577                 self.report_warning(msg)
3578         return res
3579
3580     def _float(self, v, name, fatal=False, **kwargs):
3581         res = float_or_none(v, **kwargs)
3582         if res is None:
3583             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3584             if fatal:
3585                 raise ExtractorError(msg)
3586             else:
3587                 self.report_warning(msg)
3588         return res
3589
3590     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3591                     path='/', secure=False, discard=False, rest={}, **kwargs):
3592         cookie = http.cookiejar.Cookie(
3593             0, name, value, port, port is not None, domain, True,
3594             domain.startswith('.'), path, True, secure, expire_time,
3595             discard, None, None, rest)
3596         self.cookiejar.set_cookie(cookie)
3597
3598     def _get_cookies(self, url):
3599         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3600         return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
3601
3602     def _apply_first_set_cookie_header(self, url_handle, cookie):
3603         """
3604         Apply first Set-Cookie header instead of the last. Experimental.
3605
3606         Some sites (e.g. [1-3]) may serve two cookies under the same name
3607         in Set-Cookie header and expect the first (old) one to be set rather
3608         than second (new). However, as of RFC6265 the newer one cookie
3609         should be set into cookie store what actually happens.
3610         We will workaround this issue by resetting the cookie to
3611         the first one manually.
3612         1. https://new.vk.com/
3613         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3614         3. https://learning.oreilly.com/
3615         """
3616         for header, cookies in url_handle.headers.items():
3617             if header.lower() != 'set-cookie':
3618                 continue
3619             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3620             cookie_value = re.search(
3621                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3622             if cookie_value:
3623                 value, domain = cookie_value.groups()
3624                 self._set_cookie(domain, cookie, value)
3625                 break
3626
3627     @classmethod
3628     def get_testcases(cls, include_onlymatching=False):
3629         t = getattr(cls, '_TEST', None)
3630         if t:
3631             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3632             tests = [t]
3633         else:
3634             tests = getattr(cls, '_TESTS', [])
3635         for t in tests:
3636             if not include_onlymatching and t.get('only_matching', False):
3637                 continue
3638             t['name'] = cls.ie_key()
3639             yield t
3640
3641     @classproperty
3642     def age_limit(cls):
3643         """Get age limit from the testcases"""
3644         return max(traverse_obj(
3645             tuple(cls.get_testcases(include_onlymatching=False)),
3646             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3647
3648     @classmethod
3649     def is_suitable(cls, age_limit):
3650         """Test whether the extractor is generally suitable for the given age limit"""
3651         return not age_restricted(cls.age_limit, age_limit)
3652
3653     @classmethod
3654     def description(cls, *, markdown=True, search_examples=None):
3655         """Description of the extractor"""
3656         desc = ''
3657         if cls._NETRC_MACHINE:
3658             if markdown:
3659                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3660             else:
3661                 desc += f' [{cls._NETRC_MACHINE}]'
3662         if cls.IE_DESC is False:
3663             desc += ' [HIDDEN]'
3664         elif cls.IE_DESC:
3665             desc += f' {cls.IE_DESC}'
3666         if cls.SEARCH_KEY:
3667             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3668             if search_examples:
3669                 _COUNTS = ('', '5', '10', 'all')
3670                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3671         if not cls.working():
3672             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3673
3674         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3675         return f'{name}:{desc}' if desc else name
3676
3677     def extract_subtitles(self, *args, **kwargs):
3678         if (self.get_param('writesubtitles', False)
3679                 or self.get_param('listsubtitles')):
3680             return self._get_subtitles(*args, **kwargs)
3681         return {}
3682
3683     def _get_subtitles(self, *args, **kwargs):
3684         raise NotImplementedError('This method must be implemented by subclasses')
3685
3686     def extract_comments(self, *args, **kwargs):
3687         if not self.get_param('getcomments'):
3688             return None
3689         generator = self._get_comments(*args, **kwargs)
3690
3691         def extractor():
3692             comments = []
3693             interrupted = True
3694             try:
3695                 while True:
3696                     comments.append(next(generator))
3697             except StopIteration:
3698                 interrupted = False
3699             except KeyboardInterrupt:
3700                 self.to_screen('Interrupted by user')
3701             except Exception as e:
3702                 if self.get_param('ignoreerrors') is not True:
3703                     raise
3704                 self._downloader.report_error(e)
3705             comment_count = len(comments)
3706             self.to_screen(f'Extracted {comment_count} comments')
3707             return {
3708                 'comments': comments,
3709                 'comment_count': None if interrupted else comment_count
3710             }
3711         return extractor
3712
3713     def _get_comments(self, *args, **kwargs):
3714         raise NotImplementedError('This method must be implemented by subclasses')
3715
3716     @staticmethod
3717     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3718         """ Merge subtitle items for one language. Items with duplicated URLs/data
3719         will be dropped. """
3720         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3721         ret = list(subtitle_list1)
3722         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3723         return ret
3724
3725     @classmethod
3726     def _merge_subtitles(cls, *dicts, target=None):
3727         """ Merge subtitle dictionaries, language by language. """
3728         if target is None:
3729             target = {}
3730         for d in dicts:
3731             for lang, subs in d.items():
3732                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3733         return target
3734
3735     def extract_automatic_captions(self, *args, **kwargs):
3736         if (self.get_param('writeautomaticsub', False)
3737                 or self.get_param('listsubtitles')):
3738             return self._get_automatic_captions(*args, **kwargs)
3739         return {}
3740
3741     def _get_automatic_captions(self, *args, **kwargs):
3742         raise NotImplementedError('This method must be implemented by subclasses')
3743
3744     @functools.cached_property
3745     def _cookies_passed(self):
3746         """Whether cookies have been passed to YoutubeDL"""
3747         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3748
3749     def mark_watched(self, *args, **kwargs):
3750         if not self.get_param('mark_watched', False):
3751             return
3752         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3753             self._mark_watched(*args, **kwargs)
3754
3755     def _mark_watched(self, *args, **kwargs):
3756         raise NotImplementedError('This method must be implemented by subclasses')
3757
3758     def geo_verification_headers(self):
3759         headers = {}
3760         geo_verification_proxy = self.get_param('geo_verification_proxy')
3761         if geo_verification_proxy:
3762             headers['Ytdl-request-proxy'] = geo_verification_proxy
3763         return headers
3764
3765     def _generic_id(self, url):
3766         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3767
3768     def _generic_title(self, url):
3769         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3770
3771     @staticmethod
3772     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3773         all_known = all(map(
3774             lambda x: x is not None,
3775             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3776         return (
3777             'private' if is_private
3778             else 'premium_only' if needs_premium
3779             else 'subscriber_only' if needs_subscription
3780             else 'needs_auth' if needs_auth
3781             else 'unlisted' if is_unlisted
3782             else 'public' if all_known
3783             else None)
3784
3785     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3786         '''
3787         @returns            A list of values for the extractor argument given by "key"
3788                             or "default" if no such key is present
3789         @param default      The default value to return when the key is not present (default: [])
3790         @param casesense    When false, the values are converted to lower case
3791         '''
3792         val = traverse_obj(
3793             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3794         if val is None:
3795             return [] if default is NO_DEFAULT else default
3796         return list(val) if casesense else [x.lower() for x in val]
3797
3798     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3799         if not playlist_id or not video_id:
3800             return not video_id
3801
3802         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3803         if no_playlist is not None:
3804             return not no_playlist
3805
3806         video_id = '' if video_id is True else f' {video_id}'
3807         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3808         if self.get_param('noplaylist'):
3809             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3810             return False
3811         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3812         return True
3813
3814
3815 class SearchInfoExtractor(InfoExtractor):
3816     """
3817     Base class for paged search queries extractors.
3818     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3819     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3820     """
3821
3822     _MAX_RESULTS = float('inf')
3823
3824     @classmethod
3825     def _make_valid_url(cls):
3826         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3827
3828     def _real_extract(self, query):
3829         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3830         if prefix == '':
3831             return self._get_n_results(query, 1)
3832         elif prefix == 'all':
3833             return self._get_n_results(query, self._MAX_RESULTS)
3834         else:
3835             n = int(prefix)
3836             if n <= 0:
3837                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3838             elif n > self._MAX_RESULTS:
3839                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3840                 n = self._MAX_RESULTS
3841             return self._get_n_results(query, n)
3842
3843     def _get_n_results(self, query, n):
3844         """Get a specified number of results for a query.
3845         Either this function or _search_results must be overridden by subclasses """
3846         return self.playlist_result(
3847             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3848             query, query)
3849
3850     def _search_results(self, query):
3851         """Returns an iterator of search results"""
3852         raise NotImplementedError('This method must be implemented by subclasses')
3853
3854     @classproperty
3855     def SEARCH_KEY(cls):
3856         return cls._SEARCH_KEY