yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import itertools
   9 import json
  10 import math
  11 import netrc
  12 import os
  13 import random
  14 import re
  15 import sys
  16 import time
  17 import urllib.parse
  18 import urllib.request
  19 import xml.etree.ElementTree
  20
  21 from ..compat import functools  # isort: split
  22 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  23 from ..downloader import FileDownloader
  24 from ..downloader.f4m import get_base_url, remove_encrypted_media
  25 from ..utils import (
  26     JSON_LD_RE,
  27     NO_DEFAULT,
  28     ExtractorError,
  29     GeoRestrictedError,
  30     GeoUtils,
  31     LenientJSONDecoder,
  32     RegexNotFoundError,
  33     UnsupportedError,
  34     age_restricted,
  35     base_url,
  36     bug_reports_message,
  37     classproperty,
  38     clean_html,
  39     determine_ext,
  40     determine_protocol,
  41     dict_get,
  42     encode_data_uri,
  43     error_to_compat_str,
  44     extract_attributes,
  45     filter_dict,
  46     fix_xml_ampersands,
  47     float_or_none,
  48     format_field,
  49     int_or_none,
  50     join_nonempty,
  51     js_to_json,
  52     mimetype2ext,
  53     network_exceptions,
  54     orderedSet,
  55     parse_bitrate,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     parse_resolution,
  61     sanitize_filename,
  62     sanitized_Request,
  63     str_or_none,
  64     str_to_int,
  65     strip_or_none,
  66     traverse_obj,
  67     try_call,
  68     try_get,
  69     unescapeHTML,
  70     unified_strdate,
  71     unified_timestamp,
  72     update_Request,
  73     update_url_query,
  74     url_basename,
  75     url_or_none,
  76     urljoin,
  77     variadic,
  78     xpath_element,
  79     xpath_text,
  80     xpath_with_ns,
  81 )
  82
  83
  84 class InfoExtractor:
  85     """Information Extractor class.
  86
  87     Information extractors are the classes that, given a URL, extract
  88     information about the video (or videos) the URL refers to. This
  89     information includes the real video URL, the video title, author and
  90     others. The information is stored in a dictionary which is then
  91     passed to the YoutubeDL. The YoutubeDL processes this
  92     information possibly downloading the video to the file system, among
  93     other possible outcomes.
  94
  95     The type field determines the type of the result.
  96     By far the most common value (and the default if _type is missing) is
  97     "video", which indicates a single video.
  98
  99     For a video, the dictionaries must include the following fields:
 100
 101     id:             Video identifier.
 102     title:          Video title, unescaped. Set to an empty string if video has
 103                     no title as opposed to "None" which signifies that the
 104                     extractor failed to obtain a title
 105
 106     Additionally, it must contain either a formats entry or a url one:
 107
 108     formats:        A list of dictionaries for each format available, ordered
 109                     from worst to best quality.
 110
 111                     Potential fields:
 112                     * url        The mandatory URL representing the media:
 113                                    for plain file media - HTTP URL of this file,
 114                                    for RTMP - RTMP URL,
 115                                    for HLS - URL of the M3U8 media playlist,
 116                                    for HDS - URL of the F4M manifest,
 117                                    for DASH
 118                                      - HTTP URL to plain file media (in case of
 119                                        unfragmented media)
 120                                      - URL of the MPD manifest or base URL
 121                                        representing the media if MPD manifest
 122                                        is parsed from a string (in case of
 123                                        fragmented media)
 124                                    for MSS - URL of the ISM manifest.
 125                     * manifest_url
 126                                  The URL of the manifest file in case of
 127                                  fragmented media:
 128                                    for HLS - URL of the M3U8 master playlist,
 129                                    for HDS - URL of the F4M manifest,
 130                                    for DASH - URL of the MPD manifest,
 131                                    for MSS - URL of the ISM manifest.
 132                     * manifest_stream_number  (For internal use only)
 133                                  The index of the stream in the manifest file
 134                     * ext        Will be calculated from URL if missing
 135                     * format     A human-readable description of the format
 136                                  ("mp4 container with h264/opus").
 137                                  Calculated from the format_id, width, height,
 138                                  and format_note fields if missing.
 139                     * format_id  A short description of the format
 140                                  ("mp4_h264_opus" or "19").
 141                                 Technically optional, but strongly recommended.
 142                     * format_note Additional info about the format
 143                                  ("3D" or "DASH video")
 144                     * width      Width of the video, if known
 145                     * height     Height of the video, if known
 146                     * resolution Textual description of width and height
 147                     * dynamic_range The dynamic range of the video. One of:
 148                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 149                     * tbr        Average bitrate of audio and video in KBit/s
 150                     * abr        Average audio bitrate in KBit/s
 151                     * acodec     Name of the audio codec in use
 152                     * asr        Audio sampling rate in Hertz
 153                     * vbr        Average video bitrate in KBit/s
 154                     * fps        Frame rate
 155                     * vcodec     Name of the video codec in use
 156                     * container  Name of the container format
 157                     * filesize   The number of bytes, if known in advance
 158                     * filesize_approx  An estimate for the number of bytes
 159                     * player_url SWF Player URL (used for rtmpdump).
 160                     * protocol   The protocol that will be used for the actual
 161                                  download, lower-case. One of "http", "https" or
 162                                  one of the protocols defined in downloader.PROTOCOL_MAP
 163                     * fragment_base_url
 164                                  Base URL for fragments. Each fragment's path
 165                                  value (if present) will be relative to
 166                                  this URL.
 167                     * fragments  A list of fragments of a fragmented media.
 168                                  Each fragment entry must contain either an url
 169                                  or a path. If an url is present it should be
 170                                  considered by a client. Otherwise both path and
 171                                  fragment_base_url must be present. Here is
 172                                  the list of all potential fields:
 173                                  * "url" - fragment's URL
 174                                  * "path" - fragment's path relative to
 175                                             fragment_base_url
 176                                  * "duration" (optional, int or float)
 177                                  * "filesize" (optional, int)
 178                     * is_from_start  Is a live format that can be downloaded
 179                                 from the start. Boolean
 180                     * preference Order number of this format. If this field is
 181                                  present and not None, the formats get sorted
 182                                  by this field, regardless of all other values.
 183                                  -1 for default (order by other properties),
 184                                  -2 or smaller for less than default.
 185                                  < -1000 to hide the format (if there is
 186                                     another one which is strictly better)
 187                     * language   Language code, e.g. "de" or "en-US".
 188                     * language_preference  Is this in the language mentioned in
 189                                  the URL?
 190                                  10 if it's what the URL is about,
 191                                  -1 for default (don't know),
 192                                  -10 otherwise, other values reserved for now.
 193                     * quality    Order number of the video quality of this
 194                                  format, irrespective of the file format.
 195                                  -1 for default (order by other properties),
 196                                  -2 or smaller for less than default.
 197                     * source_preference  Order number for this video source
 198                                   (quality takes higher priority)
 199                                  -1 for default (order by other properties),
 200                                  -2 or smaller for less than default.
 201                     * http_headers  A dictionary of additional HTTP headers
 202                                  to add to the request.
 203                     * stretched_ratio  If given and not 1, indicates that the
 204                                  video's pixels are not square.
 205                                  width : height ratio as float.
 206                     * no_resume  The server does not support resuming the
 207                                  (HTTP or RTMP) download. Boolean.
 208                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 209                     * downloader_options  A dictionary of downloader options
 210                                  (For internal use only)
 211                                  * http_chunk_size Chunk size for HTTP downloads
 212                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 213                     RTMP formats can also have the additional fields: page_url,
 214                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 215                     rtmp_protocol, rtmp_real_time
 216
 217     url:            Final video URL.
 218     ext:            Video filename extension.
 219     format:         The video format, defaults to ext (used for --get-format)
 220     player_url:     SWF Player URL (used for rtmpdump).
 221
 222     The following fields are optional:
 223
 224     direct:         True if a direct video file was given (must only be set by GenericIE)
 225     alt_title:      A secondary title of the video.
 226     display_id:     An alternative identifier for the video, not necessarily
 227                     unique, but available before title. Typically, id is
 228                     something like "4234987", title "Dancing naked mole rats",
 229                     and display_id "dancing-naked-mole-rats"
 230     thumbnails:     A list of dictionaries, with the following entries:
 231                         * "id" (optional, string) - Thumbnail format ID
 232                         * "url"
 233                         * "preference" (optional, int) - quality of the image
 234                         * "width" (optional, int)
 235                         * "height" (optional, int)
 236                         * "resolution" (optional, string "{width}x{height}",
 237                                         deprecated)
 238                         * "filesize" (optional, int)
 239                         * "http_headers" (dict) - HTTP headers for the request
 240     thumbnail:      Full URL to a video thumbnail image.
 241     description:    Full video description.
 242     uploader:       Full name of the video uploader.
 243     license:        License name the video is licensed under.
 244     creator:        The creator of the video.
 245     timestamp:      UNIX timestamp of the moment the video was uploaded
 246     upload_date:    Video upload date in UTC (YYYYMMDD).
 247                     If not explicitly set, calculated from timestamp
 248     release_timestamp: UNIX timestamp of the moment the video was released.
 249                     If it is not clear whether to use timestamp or this, use the former
 250     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 251                     If not explicitly set, calculated from release_timestamp
 252     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 253     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 254                     If not explicitly set, calculated from modified_timestamp
 255     uploader_id:    Nickname or id of the video uploader.
 256     uploader_url:   Full URL to a personal webpage of the video uploader.
 257     channel:        Full name of the channel the video is uploaded on.
 258                     Note that channel fields may or may not repeat uploader
 259                     fields. This depends on a particular extractor.
 260     channel_id:     Id of the channel.
 261     channel_url:    Full URL to a channel webpage.
 262     channel_follower_count: Number of followers of the channel.
 263     location:       Physical location where the video was filmed.
 264     subtitles:      The available subtitles as a dictionary in the format
 265                     {tag: subformats}. "tag" is usually a language code, and
 266                     "subformats" is a list sorted from lower to higher
 267                     preference, each element is a dictionary with the "ext"
 268                     entry and one of:
 269                         * "data": The subtitles file contents
 270                         * "url": A URL pointing to the subtitles file
 271                     It can optionally also have:
 272                         * "name": Name or description of the subtitles
 273                         * "http_headers": A dictionary of additional HTTP headers
 274                                   to add to the request.
 275                     "ext" will be calculated from URL if missing
 276     automatic_captions: Like 'subtitles'; contains automatically generated
 277                     captions instead of normal subtitles
 278     duration:       Length of the video in seconds, as an integer or float.
 279     view_count:     How many users have watched the video on the platform.
 280     like_count:     Number of positive ratings of the video
 281     dislike_count:  Number of negative ratings of the video
 282     repost_count:   Number of reposts of the video
 283     average_rating: Average rating given by users, the scale used depends on the webpage
 284     comment_count:  Number of comments on the video
 285     comments:       A list of comments, each with one or more of the following
 286                     properties (all but one of text or html optional):
 287                         * "author" - human-readable name of the comment author
 288                         * "author_id" - user ID of the comment author
 289                         * "author_thumbnail" - The thumbnail of the comment author
 290                         * "id" - Comment ID
 291                         * "html" - Comment as HTML
 292                         * "text" - Plain text of the comment
 293                         * "timestamp" - UNIX timestamp of comment
 294                         * "parent" - ID of the comment this one is replying to.
 295                                      Set to "root" to indicate that this is a
 296                                      comment to the original video.
 297                         * "like_count" - Number of positive ratings of the comment
 298                         * "dislike_count" - Number of negative ratings of the comment
 299                         * "is_favorited" - Whether the comment is marked as
 300                                            favorite by the video uploader
 301                         * "author_is_uploader" - Whether the comment is made by
 302                                                  the video uploader
 303     age_limit:      Age restriction for the video, as an integer (years)
 304     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 305                     should allow to get the same result again. (It will be set
 306                     by YoutubeDL if it's missing)
 307     categories:     A list of categories that the video falls in, for example
 308                     ["Sports", "Berlin"]
 309     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 310     cast:           A list of the video cast
 311     is_live:        True, False, or None (=unknown). Whether this video is a
 312                     live stream that goes on instead of a fixed-length video.
 313     was_live:       True, False, or None (=unknown). Whether this video was
 314                     originally a live stream.
 315     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live'
 316                     or 'post_live' (was live, but VOD is not yet processed)
 317                     If absent, automatically set from is_live, was_live
 318     start_time:     Time in seconds where the reproduction should start, as
 319                     specified in the URL.
 320     end_time:       Time in seconds where the reproduction should end, as
 321                     specified in the URL.
 322     chapters:       A list of dictionaries, with the following entries:
 323                         * "start_time" - The start time of the chapter in seconds
 324                         * "end_time" - The end time of the chapter in seconds
 325                         * "title" (optional, string)
 326     playable_in_embed: Whether this video is allowed to play in embedded
 327                     players on other sites. Can be True (=always allowed),
 328                     False (=never allowed), None (=unknown), or a string
 329                     specifying the criteria for embeddability (e.g. 'whitelist')
 330     availability:   Under what condition the video is available. One of
 331                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 332                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 333                     to set it
 334     __post_extractor: A function to be called just before the metadata is
 335                     written to either disk, logger or console. The function
 336                     must return a dict which will be added to the info_dict.
 337                     This is useful for additional information that is
 338                     time-consuming to extract. Note that the fields thus
 339                     extracted will not be available to output template and
 340                     match_filter. So, only "comments" and "comment_count" are
 341                     currently allowed to be extracted via this method.
 342
 343     The following fields should only be used when the video belongs to some logical
 344     chapter or section:
 345
 346     chapter:        Name or title of the chapter the video belongs to.
 347     chapter_number: Number of the chapter the video belongs to, as an integer.
 348     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 349
 350     The following fields should only be used when the video is an episode of some
 351     series, programme or podcast:
 352
 353     series:         Title of the series or programme the video episode belongs to.
 354     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 355     season:         Title of the season the video episode belongs to.
 356     season_number:  Number of the season the video episode belongs to, as an integer.
 357     season_id:      Id of the season the video episode belongs to, as a unicode string.
 358     episode:        Title of the video episode. Unlike mandatory video title field,
 359                     this field should denote the exact title of the video episode
 360                     without any kind of decoration.
 361     episode_number: Number of the video episode within a season, as an integer.
 362     episode_id:     Id of the video episode, as a unicode string.
 363
 364     The following fields should only be used when the media is a track or a part of
 365     a music album:
 366
 367     track:          Title of the track.
 368     track_number:   Number of the track within an album or a disc, as an integer.
 369     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 370                     as a unicode string.
 371     artist:         Artist(s) of the track.
 372     genre:          Genre(s) of the track.
 373     album:          Title of the album the track belongs to.
 374     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 375     album_artist:   List of all artists that appeared on the album (e.g.
 376                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 377                     and compilations).
 378     disc_number:    Number of the disc or other physical medium the track belongs to,
 379                     as an integer.
 380     release_year:   Year (YYYY) when the album was released.
 381     composer:       Composer of the piece
 382
 383     The following fields should only be set for clips that should be cut from the original video:
 384
 385     section_start:  Start time of the section in seconds
 386     section_end:    End time of the section in seconds
 387
 388     The following fields should only be set for storyboards:
 389     rows:           Number of rows in each storyboard fragment, as an integer
 390     columns:        Number of columns in each storyboard fragment, as an integer
 391
 392     Unless mentioned otherwise, the fields should be Unicode strings.
 393
 394     Unless mentioned otherwise, None is equivalent to absence of information.
 395
 396
 397     _type "playlist" indicates multiple videos.
 398     There must be a key "entries", which is a list, an iterable, or a PagedList
 399     object, each element of which is a valid dictionary by this specification.
 400
 401     Additionally, playlists can have "id", "title", and any other relevant
 402     attributes with the same semantics as videos (see above).
 403
 404     It can also have the following optional fields:
 405
 406     playlist_count: The total number of videos in a playlist. If not given,
 407                     YoutubeDL tries to calculate it from "entries"
 408
 409
 410     _type "multi_video" indicates that there are multiple videos that
 411     form a single show, for example multiple acts of an opera or TV episode.
 412     It must have an entries key like a playlist and contain all the keys
 413     required for a video at the same time.
 414
 415
 416     _type "url" indicates that the video must be extracted from another
 417     location, possibly by a different extractor. Its only required key is:
 418     "url" - the next URL to extract.
 419     The key "ie_key" can be set to the class name (minus the trailing "IE",
 420     e.g. "Youtube") if the extractor class is known in advance.
 421     Additionally, the dictionary may have any properties of the resolved entity
 422     known in advance, for example "title" if the title of the referred video is
 423     known ahead of time.
 424
 425
 426     _type "url_transparent" entities have the same specification as "url", but
 427     indicate that the given additional information is more precise than the one
 428     associated with the resolved URL.
 429     This is useful when a site employs a video service that hosts the video and
 430     its technical metadata, but that video service does not embed a useful
 431     title, description etc.
 432
 433
 434     Subclasses of this should define a _VALID_URL regexp and re-define the
 435     _real_extract() and (optionally) _real_initialize() methods.
 436     Probably, they should also be added to the list of extractors.
 437
 438     Subclasses may also override suitable() if necessary, but ensure the function
 439     signature is preserved and that this function imports everything it needs
 440     (except other extractors), so that lazy_extractors works correctly.
 441
 442     To support username + password (or netrc) login, the extractor must define a
 443     _NETRC_MACHINE and re-define _perform_login(username, password) and
 444     (optionally) _initialize_pre_login() methods. The _perform_login method will
 445     be called between _initialize_pre_login and _real_initialize if credentials
 446     are passed by the user. In cases where it is necessary to have the login
 447     process as part of the extraction rather than initialization, _perform_login
 448     can be left undefined.
 449
 450     _GEO_BYPASS attribute may be set to False in order to disable
 451     geo restriction bypass mechanisms for a particular extractor.
 452     Though it won't disable explicit geo restriction bypass based on
 453     country code provided with geo_bypass_country.
 454
 455     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 456     countries for this extractor. One of these countries will be used by
 457     geo restriction bypass mechanism right away in order to bypass
 458     geo restriction, of course, if the mechanism is not disabled.
 459
 460     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 461     IP blocks in CIDR notation for this extractor. One of these IP blocks
 462     will be used by geo restriction bypass mechanism similarly
 463     to _GEO_COUNTRIES.
 464
 465     The _WORKING attribute should be set to False for broken IEs
 466     in order to warn the users and skip the tests.
 467     """
 468
    # Set to True by initialize() once the login and _real_initialize() steps have run
    _ready = False
    # presumably the YoutubeDL instance assigned through set_downloader() - TODO confirm
    _downloader = None
    # Fake IP used as X-Forwarded-For for geo bypass; chosen in _initialize_geo_bypass()
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms for an extractor
    _GEO_BYPASS = True
    # List of presumably geo unrestricted countries for this extractor
    _GEO_COUNTRIES = None
    # List of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
    # Must be defined to support username + password (or netrc) login; see supports_login()
    _NETRC_MACHINE = None
    # NOTE(review): meaning not visible here; initialize() skips its login warning
    # when this (or _NETRC_MACHINE) is False - confirm intended use
    IE_DESC = None
    # NOTE(review): purpose not visible in this part of the file - presumably
    # used by search extractors, verify
    SEARCH_KEY = None
 479
 480     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 481         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 482         return {
 483             None: '',
 484             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 485             'password': f'Use {password_hint}',
 486             'cookies': (
 487                 'Use --cookies-from-browser or --cookies for the authentication. '
 488                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 489         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 490
 491     def __init__(self, downloader=None):
 492         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 493         If a downloader is not passed during initialization,
 494         it must be set using "set_downloader()" before "extract()" is called"""
 495         self._ready = False
 496         self._x_forwarded_for_ip = None
 497         self._printed_messages = set()
 498         self.set_downloader(downloader)
 499
 500     @classmethod
 501     def _match_valid_url(cls, url):
 502         # This does not use has/getattr intentionally - we want to know whether
 503         # we have cached the regexp for *this* class, whereas getattr would also
 504         # match the superclass
 505         if '_VALID_URL_RE' not in cls.__dict__:
 506             if '_VALID_URL' not in cls.__dict__:
 507                 cls._VALID_URL = cls._make_valid_url()
 508             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 509         return cls._VALID_URL_RE.match(url)
 510
 511     @classmethod
 512     def suitable(cls, url):
 513         """Receives a URL and returns True if suitable for this IE."""
 514         # This function must import everything it needs (except other extractors),
 515         # so that lazy_extractors works correctly
 516         return cls._match_valid_url(url) is not None
 517
 518     @classmethod
 519     def _match_id(cls, url):
 520         return cls._match_valid_url(url).group('id')
 521
 522     @classmethod
 523     def get_temp_id(cls, url):
 524         try:
 525             return cls._match_id(url)
 526         except (IndexError, AttributeError):
 527             return None
 528
 529     @classmethod
 530     def working(cls):
 531         """Getter method for _WORKING."""
 532         return cls._WORKING
 533
 534     @classmethod
 535     def supports_login(cls):
 536         return bool(cls._NETRC_MACHINE)
 537
 538     def initialize(self):
 539         """Initializes an instance (authentication, etc)."""
 540         self._printed_messages = set()
 541         self._initialize_geo_bypass({
 542             'countries': self._GEO_COUNTRIES,
 543             'ip_blocks': self._GEO_IP_BLOCKS,
 544         })
 545         if not self._ready:
 546             self._initialize_pre_login()
 547             if self.supports_login():
 548                 username, password = self._get_login_info()
 549                 if username:
 550                     self._perform_login(username, password)
 551             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 552                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 553             self._real_initialize()
 554             self._ready = True
 555
 556     def _initialize_geo_bypass(self, geo_bypass_context):
 557         """
 558         Initialize geo restriction bypass mechanism.
 559
 560         This method is used to initialize geo bypass mechanism based on faking
 561         X-Forwarded-For HTTP header. A random country from provided country list
 562         is selected and a random IP belonging to this country is generated. This
 563         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 564         HTTP requests.
 565
 566         This method will be used for initial geo bypass mechanism initialization
 567         during the instance initialization with _GEO_COUNTRIES and
 568         _GEO_IP_BLOCKS.
 569
 570         You may also manually call it from extractor's code if geo bypass
 571         information is not available beforehand (e.g. obtained during
 572         extraction) or due to some other reason. In this case you should pass
 573         this information in geo bypass context passed as first argument. It may
 574         contain following fields:
 575
 576         countries:  List of geo unrestricted countries (similar
 577                     to _GEO_COUNTRIES)
 578         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 579                     (similar to _GEO_IP_BLOCKS)
 580
 581         """
 582         if not self._x_forwarded_for_ip:
 583
 584             # Geo bypass mechanism is explicitly disabled by user
 585             if not self.get_param('geo_bypass', True):
 586                 return
 587
 588             if not geo_bypass_context:
 589                 geo_bypass_context = {}
 590
 591             # Backward compatibility: previously _initialize_geo_bypass
 592             # expected a list of countries, some 3rd party code may still use
 593             # it this way
 594             if isinstance(geo_bypass_context, (list, tuple)):
 595                 geo_bypass_context = {
 596                     'countries': geo_bypass_context,
 597                 }
 598
 599             # The whole point of geo bypass mechanism is to fake IP
 600             # as X-Forwarded-For HTTP header based on some IP block or
 601             # country code.
 602
 603             # Path 1: bypassing based on IP block in CIDR notation
 604
 605             # Explicit IP block specified by user, use it right away
 606             # regardless of whether extractor is geo bypassable or not
 607             ip_block = self.get_param('geo_bypass_ip_block', None)
 608
 609             # Otherwise use random IP block from geo bypass context but only
 610             # if extractor is known as geo bypassable
 611             if not ip_block:
 612                 ip_blocks = geo_bypass_context.get('ip_blocks')
 613                 if self._GEO_BYPASS and ip_blocks:
 614                     ip_block = random.choice(ip_blocks)
 615
 616             if ip_block:
 617                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 618                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 619                 return
 620
 621             # Path 2: bypassing based on country code
 622
 623             # Explicit country code specified by user, use it right away
 624             # regardless of whether extractor is geo bypassable or not
 625             country = self.get_param('geo_bypass_country', None)
 626
 627             # Otherwise use random country code from geo bypass context but
 628             # only if extractor is known as geo bypassable
 629             if not country:
 630                 countries = geo_bypass_context.get('countries')
 631                 if self._GEO_BYPASS and countries:
 632                     country = random.choice(countries)
 633
 634             if country:
 635                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 636                 self._downloader.write_debug(
 637                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 638
 639     def extract(self, url):
 640         """Extracts URL information and returns it in list of dicts."""
 641         try:
 642             for _ in range(2):
 643                 try:
 644                     self.initialize()
 645                     self.write_debug('Extracting URL: %s' % url)
 646                     ie_result = self._real_extract(url)
 647                     if ie_result is None:
 648                         return None
 649                     if self._x_forwarded_for_ip:
 650                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 651                     subtitles = ie_result.get('subtitles') or {}
 652                     if 'no-live-chat' in self.get_param('compat_opts'):
 653                         for lang in ('live_chat', 'comments', 'danmaku'):
 654                             subtitles.pop(lang, None)
 655                     return ie_result
 656                 except GeoRestrictedError as e:
 657                     if self.__maybe_fake_ip_and_retry(e.countries):
 658                         continue
 659                     raise
 660         except UnsupportedError:
 661             raise
 662         except ExtractorError as e:
 663             kwargs = {
 664                 'video_id': e.video_id or self.get_temp_id(url),
 665                 'ie': self.IE_NAME,
 666                 'tb': e.traceback or sys.exc_info()[2],
 667                 'expected': e.expected,
 668                 'cause': e.cause
 669             }
 670             if hasattr(e, 'countries'):
 671                 kwargs['countries'] = e.countries
 672             raise type(e)(e.orig_msg, **kwargs)
 673         except http.client.IncompleteRead as e:
 674             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 675         except (KeyError, StopIteration) as e:
 676             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 677
 678     def __maybe_fake_ip_and_retry(self, countries):
 679         if (not self.get_param('geo_bypass_country', None)
 680                 and self._GEO_BYPASS
 681                 and self.get_param('geo_bypass', True)
 682                 and not self._x_forwarded_for_ip
 683                 and countries):
 684             country_code = random.choice(countries)
 685             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 686             if self._x_forwarded_for_ip:
 687                 self.report_warning(
 688                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 689                     % (self._x_forwarded_for_ip, country_code.upper()))
 690                 return True
 691         return False
 692
 693     def set_downloader(self, downloader):
 694         """Sets a YoutubeDL instance as the downloader for this IE."""
 695         self._downloader = downloader
 696
 697     @property
 698     def cache(self):
 699         return self._downloader.cache
 700
 701     @property
 702     def cookiejar(self):
 703         return self._downloader.cookiejar
 704
 705     def _initialize_pre_login(self):
 706         """ Initialization before login. Redefine in subclasses."""
 707         pass
 708
 709     def _perform_login(self, username, password):
 710         """ Login with username and password. Redefine in subclasses."""
 711         pass
 712
 713     def _real_initialize(self):
 714         """Real initialization process. Redefine in subclasses."""
 715         pass
 716
 717     def _real_extract(self, url):
 718         """Real extraction process. Redefine in subclasses."""
 719         raise NotImplementedError('This method must be implemented by subclasses')
 720
 721     @classmethod
 722     def ie_key(cls):
 723         """A string for getting the InfoExtractor with get_info_extractor"""
 724         return cls.__name__[:-2]
 725
 726     @classproperty
 727     def IE_NAME(cls):
 728         return cls.__name__[:-2]
 729
 730     @staticmethod
 731     def __can_accept_status_code(err, expected_status):
 732         assert isinstance(err, urllib.error.HTTPError)
 733         if expected_status is None:
 734             return False
 735         elif callable(expected_status):
 736             return expected_status(err.code) is True
 737         else:
 738             return err.code in variadic(expected_status)
 739
 740     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 741         if isinstance(url_or_request, urllib.request.Request):
 742             return update_Request(url_or_request, data=data, headers=headers, query=query)
 743         if query:
 744             url_or_request = update_url_query(url_or_request, query)
 745         return sanitized_Request(url_or_request, data, headers or {})
 746
 747     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 748         """
 749         Return the response handle.
 750
 751         See _download_webpage docstring for arguments specification.
 752         """
 753         if not self._downloader._first_webpage_request:
 754             sleep_interval = self.get_param('sleep_interval_requests') or 0
 755             if sleep_interval > 0:
 756                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 757                 time.sleep(sleep_interval)
 758         else:
 759             self._downloader._first_webpage_request = False
 760
 761         if note is None:
 762             self.report_download_webpage(video_id)
 763         elif note is not False:
 764             if video_id is None:
 765                 self.to_screen(str(note))
 766             else:
 767                 self.to_screen(f'{video_id}: {note}')
 768
 769         # Some sites check X-Forwarded-For HTTP header in order to figure out
 770         # the origin of the client behind proxy. This allows bypassing geo
 771         # restriction by faking this header's value to IP that belongs to some
 772         # geo unrestricted country. We will do so once we encounter any
 773         # geo restriction error.
 774         if self._x_forwarded_for_ip:
 775             headers = (headers or {}).copy()
 776             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 777
 778         try:
 779             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 780         except network_exceptions as err:
 781             if isinstance(err, urllib.error.HTTPError):
 782                 if self.__can_accept_status_code(err, expected_status):
 783                     # Retain reference to error to prevent file object from
 784                     # being closed before it can be read. Works around the
 785                     # effects of <https://bugs.python.org/issue15002>
 786                     # introduced in Python 3.4.1.
 787                     err.fp._error = err
 788                     return err.fp
 789
 790             if errnote is False:
 791                 return False
 792             if errnote is None:
 793                 errnote = 'Unable to download webpage'
 794
 795             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 796             if fatal:
 797                 raise ExtractorError(errmsg, cause=err)
 798             else:
 799                 self.report_warning(errmsg)
 800                 return False
 801
 802     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 803                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 804         """
 805         Return a tuple (page content as string, URL handle).
 806
 807         Arguments:
 808         url_or_request -- plain text URL as a string or
 809             a urllib.request.Request object
 810         video_id -- Video/playlist/item identifier (string)
 811
 812         Keyword arguments:
 813         note -- note printed before downloading (string)
 814         errnote -- note printed in case of an error (string)
 815         fatal -- flag denoting whether error should be considered fatal,
 816             i.e. whether it should cause ExtractionError to be raised,
 817             otherwise a warning will be reported and extraction continued
 818         encoding -- encoding for a page content decoding, guessed automatically
 819             when not explicitly specified
 820         data -- POST data (bytes)
 821         headers -- HTTP headers (dict)
 822         query -- URL query (dict)
 823         expected_status -- allows to accept failed HTTP requests (non 2xx
 824             status code) by explicitly specifying a set of accepted status
 825             codes. Can be any of the following entities:
 826                 - an integer type specifying an exact failed status code to
 827                   accept
 828                 - a list or a tuple of integer types specifying a list of
 829                   failed status codes to accept
 830                 - a callable accepting an actual failed status code and
 831                   returning True if it should be accepted
 832             Note that this argument does not affect success status codes (2xx)
 833             which are always accepted.
 834         """
 835
 836         # Strip hashes from the URL (#1038)
 837         if isinstance(url_or_request, str):
 838             url_or_request = url_or_request.partition('#')[0]
 839
 840         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 841         if urlh is False:
 842             assert not fatal
 843             return False
 844         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 845         return (content, urlh)
 846
 847     @staticmethod
 848     def _guess_encoding_from_content(content_type, webpage_bytes):
 849         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 850         if m:
 851             encoding = m.group(1)
 852         else:
 853             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 854                           webpage_bytes[:1024])
 855             if m:
 856                 encoding = m.group(1).decode('ascii')
 857             elif webpage_bytes.startswith(b'\xff\xfe'):
 858                 encoding = 'utf-16'
 859             else:
 860                 encoding = 'utf-8'
 861
 862         return encoding
 863
 864     def __check_blocked(self, content):
 865         first_block = content[:512]
 866         if ('<title>Access to this site is blocked</title>' in content
 867                 and 'Websense' in first_block):
 868             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 869             blocked_iframe = self._html_search_regex(
 870                 r'<iframe src="([^"]+)"', content,
 871                 'Websense information URL', default=None)
 872             if blocked_iframe:
 873                 msg += ' Visit %s for more details' % blocked_iframe
 874             raise ExtractorError(msg, expected=True)
 875         if '<title>The URL you requested has been blocked</title>' in first_block:
 876             msg = (
 877                 'Access to this webpage has been blocked by Indian censorship. '
 878                 'Use a VPN or proxy server (with --proxy) to route around it.')
 879             block_msg = self._html_search_regex(
 880                 r'</h1><p>(.*?)</p>',
 881                 content, 'block message', default=None)
 882             if block_msg:
 883                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 884             raise ExtractorError(msg, expected=True)
 885         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 886                 and 'blocklist.rkn.gov.ru' in content):
 887             raise ExtractorError(
 888                 'Access to this webpage has been blocked by decision of the Russian government. '
 889                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 890                 expected=True)
 891
 892     def _request_dump_filename(self, url, video_id):
 893         basen = f'{video_id}_{url}'
 894         trim_length = self.get_param('trim_file_name') or 240
 895         if len(basen) > trim_length:
 896             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 897             basen = basen[:trim_length - len(h)] + h
 898         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 899         # Working around MAX_PATH limitation on Windows (see
 900         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 901         if compat_os_name == 'nt':
 902             absfilepath = os.path.abspath(filename)
 903             if len(absfilepath) > 259:
 904                 filename = fR'\\?\{absfilepath}'
 905         return filename
 906
 907     def __decode_webpage(self, webpage_bytes, encoding, headers):
 908         if not encoding:
 909             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 910         try:
 911             return webpage_bytes.decode(encoding, 'replace')
 912         except LookupError:
 913             return webpage_bytes.decode('utf-8', 'replace')
 914
 915     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 916         webpage_bytes = urlh.read()
 917         if prefix is not None:
 918             webpage_bytes = prefix + webpage_bytes
 919         if self.get_param('dump_intermediate_pages', False):
 920             self.to_screen('Dumping request to ' + urlh.geturl())
 921             dump = base64.b64encode(webpage_bytes).decode('ascii')
 922             self._downloader.to_screen(dump)
 923         if self.get_param('write_pages'):
 924             filename = self._request_dump_filename(urlh.geturl(), video_id)
 925             self.to_screen(f'Saving request to {filename}')
 926             with open(filename, 'wb') as outf:
 927                 outf.write(webpage_bytes)
 928
 929         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 930         self.__check_blocked(content)
 931
 932         return content
 933
 934     def __print_error(self, errnote, fatal, video_id, err):
 935         if fatal:
 936             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 937         elif errnote:
 938             self.report_warning(f'{video_id}: {errnote}: {err}')
 939
 940     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 941         if transform_source:
 942             xml_string = transform_source(xml_string)
 943         try:
 944             return compat_etree_fromstring(xml_string.encode('utf-8'))
 945         except xml.etree.ElementTree.ParseError as ve:
 946             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 947
 948     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 949         try:
 950             return json.loads(
 951                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 952         except ValueError as ve:
 953             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 954
 955     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 956         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 957
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build the pair of methods (_download_<name>_handle, _download_<name>).

        name -- name fragment used in the names of the generated methods
        parser -- name of the extractor method used to parse the downloaded
            content, or None to return the content unparsed
        note, errnote -- default note/errnote of the generated methods
        return_value -- description of the return value, inserted into the
            docstrings of the generated methods

        Returns the tuple (download_handle, download_content).
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is forwarded to the parser only when it is False;
            # otherwise the parser is called without it
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Downloads the page and returns (parsed content, URL handle),
        # or False if the download itself returned False
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Same as download_handle, but returns only the parsed content
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, first try to read the page from its
            # dump file; if that fails, warn and download it as usual
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method looked up below is
            # _download_webpage_handle, which takes no transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Gives func its final name, qualified name and docstring
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1021
    # Generated download methods: each call yields the "handle" variant
    # (returning a tuple with the URL handle) and the content-only variant
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # No parser: only the content-only variant is kept, under a private name;
    # it is what _download_webpage calls
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1029
1030     def _download_webpage(
1031             self, url_or_request, video_id, note=None, errnote=None,
1032             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1033         """
1034         Return the data of the page as a string.
1035
1036         Keyword arguments:
1037         tries -- number of tries
1038         timeout -- sleep interval between tries
1039
1040         See _download_webpage_handle docstring for other arguments specification.
1041         """
1042
1043         R''' # NB: These are unused; should they be deprecated?
1044         if tries != 1:
1045             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1046         if timeout is NO_DEFAULT:
1047             timeout = 5
1048         else:
1049             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1050         '''
1051
1052         try_count = 0
1053         while True:
1054             try:
1055                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1056             except http.client.IncompleteRead as e:
1057                 try_count += 1
1058                 if try_count >= tries:
1059                     raise e
1060                 self._sleep(timeout, video_id)
1061
1062     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1063         idstr = format_field(video_id, None, '%s: ')
1064         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1065         if only_once:
1066             if f'WARNING: {msg}' in self._printed_messages:
1067                 return
1068             self._printed_messages.add(f'WARNING: {msg}')
1069         self._downloader.report_warning(msg, *args, **kwargs)
1070
1071     def to_screen(self, msg, *args, **kwargs):
1072         """Print msg to screen, prefixing it with '[ie_name]'"""
1073         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1074
1075     def write_debug(self, msg, *args, **kwargs):
1076         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1077
1078     def get_param(self, name, default=None, *args, **kwargs):
1079         if self._downloader:
1080             return self._downloader.params.get(name, default, *args, **kwargs)
1081         return default
1082
1083     def report_drm(self, video_id, partial=False):
1084         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1085
1086     def report_extraction(self, id_or_name):
1087         """Report information extraction."""
1088         self.to_screen('%s: Extracting information' % id_or_name)
1089
1090     def report_download_webpage(self, video_id):
1091         """Report webpage download."""
1092         self.to_screen('%s: Downloading webpage' % video_id)
1093
1094     def report_age_confirmation(self):
1095         """Report attempt to confirm age."""
1096         self.to_screen('Confirming age')
1097
1098     def report_login(self):
1099         """Report attempt to log in."""
1100         self.to_screen('Logging in')
1101
1102     def raise_login_required(
1103             self, msg='This video is only available for registered users',
1104             metadata_available=False, method=NO_DEFAULT):
1105         if metadata_available and (
1106                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1107             self.report_warning(msg)
1108             return
1109         msg += format_field(self._login_hint(method), None, '. %s')
1110         raise ExtractorError(msg, expected=True)
1111
1112     def raise_geo_restricted(
1113             self, msg='This video is not available from your location due to geo restriction',
1114             countries=None, metadata_available=False):
1115         if metadata_available and (
1116                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1117             self.report_warning(msg)
1118         else:
1119             raise GeoRestrictedError(msg, countries=countries)
1120
1121     def raise_no_formats(self, msg, expected=False, video_id=None):
1122         if expected and (
1123                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1124             self.report_warning(msg, video_id)
1125         elif isinstance(msg, ExtractorError):
1126             raise msg
1127         else:
1128             raise ExtractorError(msg, expected=expected, video_id=video_id)
1129
1130     # Methods for following #608
1131     @staticmethod
1132     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1133         """Returns a URL that points to a page that should be processed"""
1134         if ie is not None:
1135             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1136         if video_id is not None:
1137             kwargs['id'] = video_id
1138         if video_title is not None:
1139             kwargs['title'] = video_title
1140         return {
1141             **kwargs,
1142             '_type': 'url_transparent' if url_transparent else 'url',
1143             'url': url,
1144         }
1145
1146     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1147         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1148                 for m in orderedSet(map(getter, matches) if getter else matches))
1149         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1150
1151     @staticmethod
1152     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1153         """Returns a playlist"""
1154         if playlist_id:
1155             kwargs['id'] = playlist_id
1156         if playlist_title:
1157             kwargs['title'] = playlist_title
1158         if playlist_description is not None:
1159             kwargs['description'] = playlist_description
1160         return {
1161             **kwargs,
1162             '_type': 'multi_video' if multi_video else 'playlist',
1163             'entries': entries,
1164         }
1165
1166     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1167         """
1168         Perform a regex search on the given string, using a single or a list of
1169         patterns returning the first matching group.
1170         In case of failure return a default value or raise a WARNING or a
1171         RegexNotFoundError, depending on fatal, specifying the field name.
1172         """
1173         if string is None:
1174             mobj = None
1175         elif isinstance(pattern, (str, re.Pattern)):
1176             mobj = re.search(pattern, string, flags)
1177         else:
1178             for p in pattern:
1179                 mobj = re.search(p, string, flags)
1180                 if mobj:
1181                     break
1182
1183         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1184
1185         if mobj:
1186             if group is None:
1187                 # return the first matching group
1188                 return next(g for g in mobj.groups() if g is not None)
1189             elif isinstance(group, (list, tuple)):
1190                 return tuple(mobj.group(g) for g in group)
1191             else:
1192                 return mobj.group(group)
1193         elif default is not NO_DEFAULT:
1194             return default
1195         elif fatal:
1196             raise RegexNotFoundError('Unable to extract %s' % _name)
1197         else:
1198             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1199             return None
1200
1201     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1202                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1203         """Searches string for the JSON object specified by start_pattern"""
1204         # NB: end_pattern is only used to reduce the size of the initial match
1205         if default is NO_DEFAULT:
1206             default, has_default = {}, False
1207         else:
1208             fatal, has_default = False, True
1209
1210         json_string = self._search_regex(
1211             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1212             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1213         if not json_string:
1214             return default
1215
1216         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1217         try:
1218             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1219         except ExtractorError as e:
1220             if fatal:
1221                 raise ExtractorError(
1222                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1223             elif not has_default:
1224                 self.report_warning(
1225                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1226         return default
1227
1228     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1229         """
1230         Like _search_regex, but strips HTML tags and unescapes entities.
1231         """
1232         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1233         if res:
1234             return clean_html(res).strip()
1235         else:
1236             return res
1237
1238     def _get_netrc_login_info(self, netrc_machine=None):
1239         username = None
1240         password = None
1241         netrc_machine = netrc_machine or self._NETRC_MACHINE
1242
1243         if self.get_param('usenetrc', False):
1244             try:
1245                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1246                 if os.path.isdir(netrc_file):
1247                     netrc_file = os.path.join(netrc_file, '.netrc')
1248                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1249                 if info is not None:
1250                     username = info[0]
1251                     password = info[2]
1252                 else:
1253                     raise netrc.NetrcParseError(
1254                         'No authenticators for %s' % netrc_machine)
1255             except (OSError, netrc.NetrcParseError) as err:
1256                 self.report_warning(
1257                     'parsing .netrc: %s' % error_to_compat_str(err))
1258
1259         return username, password
1260
1261     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1262         """
1263         Get the login info as (username, password)
1264         First look for the manually specified credentials using username_option
1265         and password_option as keys in params dictionary. If no such credentials
1266         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1267         value.
1268         If there's no info available, return (None, None)
1269         """
1270
1271         # Attempt to use provided username and password or .netrc data
1272         username = self.get_param(username_option)
1273         if username is not None:
1274             password = self.get_param(password_option)
1275         else:
1276             username, password = self._get_netrc_login_info(netrc_machine)
1277
1278         return username, password
1279
1280     def _get_tfa_info(self, note='two-factor verification code'):
1281         """
1282         Get the two-factor authentication info
1283         TODO - asking the user will be required for sms/phone verify
1284         currently just uses the command line option
1285         If there's no info available, return None
1286         """
1287
1288         tfa = self.get_param('twofactor')
1289         if tfa is not None:
1290             return tfa
1291
1292         return getpass.getpass('Type %s and press [Return]: ' % note)
1293
1294     # Helper functions for extracting OpenGraph info
1295     @staticmethod
1296     def _og_regexes(prop):
1297         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1298         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1299                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1300         template = r'<meta[^>]+?%s[^>]+?%s'
1301         return [
1302             template % (property_re, content_re),
1303             template % (content_re, property_re),
1304         ]
1305
1306     @staticmethod
1307     def _meta_regex(prop):
1308         return r'''(?isx)<meta
1309                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1310                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1311
1312     def _og_search_property(self, prop, html, name=None, **kargs):
1313         prop = variadic(prop)
1314         if name is None:
1315             name = 'OpenGraph %s' % prop[0]
1316         og_regexes = []
1317         for p in prop:
1318             og_regexes.extend(self._og_regexes(p))
1319         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1320         if escaped is None:
1321             return None
1322         return unescapeHTML(escaped)
1323
1324     def _og_search_thumbnail(self, html, **kargs):
1325         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1326
1327     def _og_search_description(self, html, **kargs):
1328         return self._og_search_property('description', html, fatal=False, **kargs)
1329
1330     def _og_search_title(self, html, *, fatal=False, **kargs):
1331         return self._og_search_property('title', html, fatal=fatal, **kargs)
1332
1333     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1334         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1335         if secure:
1336             regexes = self._og_regexes('video:secure_url') + regexes
1337         return self._html_search_regex(regexes, html, name, **kargs)
1338
1339     def _og_search_url(self, html, **kargs):
1340         return self._og_search_property('url', html, **kargs)
1341
1342     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1343         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1344
1345     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1346         name = variadic(name)
1347         if display_name is None:
1348             display_name = name[0]
1349         return self._html_search_regex(
1350             [self._meta_regex(n) for n in name],
1351             html, display_name, fatal=fatal, group='content', **kwargs)
1352
1353     def _dc_search_uploader(self, html):
1354         return self._html_search_meta('dc.creator', html, 'uploader')
1355
1356     def _rta_search(self, html):
1357         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1358         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1359                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1360                      html):
1361             return 18
1362         return 0
1363
1364     def _media_rating_search(self, html):
1365         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1366         rating = self._html_search_meta('rating', html)
1367
1368         if not rating:
1369             return None
1370
1371         RATING_TABLE = {
1372             'safe for kids': 0,
1373             'general': 8,
1374             '14 years': 14,
1375             'mature': 17,
1376             'restricted': 19,
1377         }
1378         return RATING_TABLE.get(rating.lower())
1379
1380     def _family_friendly_search(self, html):
1381         # See http://schema.org/VideoObject
1382         family_friendly = self._html_search_meta(
1383             'isFamilyFriendly', html, default=None)
1384
1385         if not family_friendly:
1386             return None
1387
1388         RATING_TABLE = {
1389             '1': 0,
1390             'true': 0,
1391             '0': 18,
1392             'false': 18,
1393         }
1394         return RATING_TABLE.get(family_friendly.lower())
1395
1396     def _twitter_search_player(self, html):
1397         return self._html_search_meta('twitter:player', html,
1398                                       'twitter card player')
1399
1400     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1401         """Yield all json ld objects in the html"""
1402         if default is not NO_DEFAULT:
1403             fatal = False
1404         for mobj in re.finditer(JSON_LD_RE, html):
1405             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1406             for json_ld in variadic(json_ld_item):
1407                 if isinstance(json_ld, dict):
1408                     yield json_ld
1409
1410     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1411         """Search for a video in any json ld in the html"""
1412         if default is not NO_DEFAULT:
1413             fatal = False
1414         info = self._json_ld(
1415             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1416             video_id, fatal=fatal, expected_type=expected_type)
1417         if info:
1418             return info
1419         if default is not NO_DEFAULT:
1420             return default
1421         elif fatal:
1422             raise RegexNotFoundError('Unable to extract JSON-LD')
1423         else:
1424             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1425             return {}
1426
1427     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1428         if isinstance(json_ld, str):
1429             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1430         if not json_ld:
1431             return {}
1432         info = {}
1433         if not isinstance(json_ld, (list, tuple, dict)):
1434             return info
1435         if isinstance(json_ld, dict):
1436             json_ld = [json_ld]
1437
1438         INTERACTION_TYPE_MAP = {
1439             'CommentAction': 'comment',
1440             'AgreeAction': 'like',
1441             'DisagreeAction': 'dislike',
1442             'LikeAction': 'like',
1443             'DislikeAction': 'dislike',
1444             'ListenAction': 'view',
1445             'WatchAction': 'view',
1446             'ViewAction': 'view',
1447         }
1448
1449         def is_type(e, *expected_types):
1450             type = variadic(traverse_obj(e, '@type'))
1451             return any(x in type for x in expected_types)
1452
1453         def extract_interaction_type(e):
1454             interaction_type = e.get('interactionType')
1455             if isinstance(interaction_type, dict):
1456                 interaction_type = interaction_type.get('@type')
1457             return str_or_none(interaction_type)
1458
1459         def extract_interaction_statistic(e):
1460             interaction_statistic = e.get('interactionStatistic')
1461             if isinstance(interaction_statistic, dict):
1462                 interaction_statistic = [interaction_statistic]
1463             if not isinstance(interaction_statistic, list):
1464                 return
1465             for is_e in interaction_statistic:
1466                 if not is_type(is_e, 'InteractionCounter'):
1467                     continue
1468                 interaction_type = extract_interaction_type(is_e)
1469                 if not interaction_type:
1470                     continue
1471                 # For interaction count some sites provide string instead of
1472                 # an integer (as per spec) with non digit characters (e.g. ",")
1473                 # so extracting count with more relaxed str_to_int
1474                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1475                 if interaction_count is None:
1476                     continue
1477                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1478                 if not count_kind:
1479                     continue
1480                 count_key = '%s_count' % count_kind
1481                 if info.get(count_key) is not None:
1482                     continue
1483                 info[count_key] = interaction_count
1484
1485         def extract_chapter_information(e):
1486             chapters = [{
1487                 'title': part.get('name'),
1488                 'start_time': part.get('startOffset'),
1489                 'end_time': part.get('endOffset'),
1490             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1491             for idx, (last_c, current_c, next_c) in enumerate(zip(
1492                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1493                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1494                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1495                 if None in current_c.values():
1496                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1497                     return
1498             if chapters:
1499                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1500                 info['chapters'] = chapters
1501
1502         def extract_video_object(e):
1503             assert is_type(e, 'VideoObject')
1504             author = e.get('author')
1505             info.update({
1506                 'url': url_or_none(e.get('contentUrl')),
1507                 'title': unescapeHTML(e.get('name')),
1508                 'description': unescapeHTML(e.get('description')),
1509                 'thumbnails': [{'url': unescapeHTML(url)}
1510                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1511                                if url_or_none(url)],
1512                 'duration': parse_duration(e.get('duration')),
1513                 'timestamp': unified_timestamp(e.get('uploadDate')),
1514                 # author can be an instance of 'Organization' or 'Person' types.
1515                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1516                 # however some websites are using 'Text' type instead.
1517                 # 1. https://schema.org/VideoObject
1518                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1519                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1520                 'tbr': int_or_none(e.get('bitrate')),
1521                 'width': int_or_none(e.get('width')),
1522                 'height': int_or_none(e.get('height')),
1523                 'view_count': int_or_none(e.get('interactionCount')),
1524             })
1525             extract_interaction_statistic(e)
1526             extract_chapter_information(e)
1527
1528         def traverse_json_ld(json_ld, at_top_level=True):
1529             for e in json_ld:
1530                 if at_top_level and '@context' not in e:
1531                     continue
1532                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1533                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1534                     break
1535                 if expected_type is not None and not is_type(e, expected_type):
1536                     continue
1537                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1538                 if rating is not None:
1539                     info['average_rating'] = rating
1540                 if is_type(e, 'TVEpisode', 'Episode'):
1541                     episode_name = unescapeHTML(e.get('name'))
1542                     info.update({
1543                         'episode': episode_name,
1544                         'episode_number': int_or_none(e.get('episodeNumber')),
1545                         'description': unescapeHTML(e.get('description')),
1546                     })
1547                     if not info.get('title') and episode_name:
1548                         info['title'] = episode_name
1549                     part_of_season = e.get('partOfSeason')
1550                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1551                         info.update({
1552                             'season': unescapeHTML(part_of_season.get('name')),
1553                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1554                         })
1555                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1556                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1557                         info['series'] = unescapeHTML(part_of_series.get('name'))
1558                 elif is_type(e, 'Movie'):
1559                     info.update({
1560                         'title': unescapeHTML(e.get('name')),
1561                         'description': unescapeHTML(e.get('description')),
1562                         'duration': parse_duration(e.get('duration')),
1563                         'timestamp': unified_timestamp(e.get('dateCreated')),
1564                     })
1565                 elif is_type(e, 'Article', 'NewsArticle'):
1566                     info.update({
1567                         'timestamp': parse_iso8601(e.get('datePublished')),
1568                         'title': unescapeHTML(e.get('headline')),
1569                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1570                     })
1571                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1572                         extract_video_object(e['video'][0])
1573                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1574                         extract_video_object(e['subjectOf'][0])
1575                 elif is_type(e, 'VideoObject'):
1576                     extract_video_object(e)
1577                     if expected_type is None:
1578                         continue
1579                     else:
1580                         break
1581                 video = e.get('video')
1582                 if is_type(video, 'VideoObject'):
1583                     extract_video_object(video)
1584                 if expected_type is None:
1585                     continue
1586                 else:
1587                     break
1588         traverse_json_ld(json_ld)
1589
1590         return filter_dict(info)
1591
1592     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1593         return self._parse_json(
1594             self._search_regex(
1595                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1596                 webpage, 'next.js data', fatal=fatal, **kw),
1597             video_id, transform_source=transform_source, fatal=fatal)
1598
1599     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1600         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1601         rectx = re.escape(context_name)
1602         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1603         js, arg_keys, arg_vals = self._search_regex(
1604             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1605             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1606
1607         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1608
1609         for key, val in args.items():
1610             if val in ('undefined', 'void 0'):
1611                 args[key] = 'null'
1612
1613         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1614         return traverse_obj(ret, traverse) or {}
1615
1616     @staticmethod
1617     def _hidden_inputs(html):
1618         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1619         hidden_inputs = {}
1620         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1621             attrs = extract_attributes(input)
1622             if not input:
1623                 continue
1624             if attrs.get('type') not in ('hidden', 'submit'):
1625                 continue
1626             name = attrs.get('name') or attrs.get('id')
1627             value = attrs.get('value')
1628             if name and value is not None:
1629                 hidden_inputs[name] = value
1630         return hidden_inputs
1631
1632     def _form_hidden_inputs(self, form_id, html):
1633         form = self._search_regex(
1634             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1635             html, '%s form' % form_id, group='form')
1636         return self._hidden_inputs(form)
1637
1638     class FormatSort:
1639         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1640
1641         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1642                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1643                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1644         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1645                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1646                         'fps', 'fs_approx', 'source', 'id')
1647
1648         settings = {
1649             'vcodec': {'type': 'ordered', 'regex': True,
1650                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1651             'acodec': {'type': 'ordered', 'regex': True,
1652                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1653             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1654                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1655             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1656                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1657             'vext': {'type': 'ordered', 'field': 'video_ext',
1658                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1659                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1660             'aext': {'type': 'ordered', 'field': 'audio_ext',
1661                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1662                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1663             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1664             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1665                            'field': ('vcodec', 'acodec'),
1666                            'function': lambda it: int(any(v != 'none' for v in it))},
1667             'ie_pref': {'priority': True, 'type': 'extractor'},
1668             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1669             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1670             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1671             'quality': {'convert': 'float', 'default': -1},
1672             'filesize': {'convert': 'bytes'},
1673             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1674             'id': {'convert': 'string', 'field': 'format_id'},
1675             'height': {'convert': 'float_none'},
1676             'width': {'convert': 'float_none'},
1677             'fps': {'convert': 'float_none'},
1678             'tbr': {'convert': 'float_none'},
1679             'vbr': {'convert': 'float_none'},
1680             'abr': {'convert': 'float_none'},
1681             'asr': {'convert': 'float_none'},
1682             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1683
1684             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1685             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1686             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1687             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1688             'res': {'type': 'multiple', 'field': ('height', 'width'),
1689                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1690
1691             # For compatibility with youtube-dl
1692             'format_id': {'type': 'alias', 'field': 'id'},
1693             'preference': {'type': 'alias', 'field': 'ie_pref'},
1694             'language_preference': {'type': 'alias', 'field': 'lang'},
1695             'source_preference': {'type': 'alias', 'field': 'source'},
1696             'protocol': {'type': 'alias', 'field': 'proto'},
1697             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1698
1699             # Deprecated
1700             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1701             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1702             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1703             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1704             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1705             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1706             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1707             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1708             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1709             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1710             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1711             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1712             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1713             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1714             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1715             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1716             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1717             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1718             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1719             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1720         }
1721
1722         def __init__(self, ie, field_preference):
1723             self._order = []
1724             self.ydl = ie._downloader
1725             self.evaluate_params(self.ydl.params, field_preference)
1726             if ie.get_param('verbose'):
1727                 self.print_verbose_info(self.ydl.write_debug)
1728
1729         def _get_field_setting(self, field, key):
1730             if field not in self.settings:
1731                 if key in ('forced', 'priority'):
1732                     return False
1733                 self.ydl.deprecation_warning(
1734                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1735                     'and may be removed in a future version')
1736                 self.settings[field] = {}
1737             propObj = self.settings[field]
1738             if key not in propObj:
1739                 type = propObj.get('type')
1740                 if key == 'field':
1741                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1742                 elif key == 'convert':
1743                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1744                 else:
1745                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1746                 propObj[key] = default
1747             return propObj[key]
1748
1749         def _resolve_field_value(self, field, value, convertNone=False):
1750             if value is None:
1751                 if not convertNone:
1752                     return None
1753             else:
1754                 value = value.lower()
1755             conversion = self._get_field_setting(field, 'convert')
1756             if conversion == 'ignore':
1757                 return None
1758             if conversion == 'string':
1759                 return value
1760             elif conversion == 'float_none':
1761                 return float_or_none(value)
1762             elif conversion == 'bytes':
1763                 return FileDownloader.parse_bytes(value)
1764             elif conversion == 'order':
1765                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1766                 use_regex = self._get_field_setting(field, 'regex')
1767                 list_length = len(order_list)
1768                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1769                 if use_regex and value is not None:
1770                     for i, regex in enumerate(order_list):
1771                         if regex and re.match(regex, value):
1772                             return list_length - i
1773                     return list_length - empty_pos  # not in list
1774                 else:  # not regex or  value = None
1775                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1776             else:
1777                 if value.isnumeric():
1778                     return float(value)
1779                 else:
1780                     self.settings[field]['convert'] = 'string'
1781                     return value
1782
1783         def evaluate_params(self, params, sort_extractor):
1784             self._use_free_order = params.get('prefer_free_formats', False)
1785             self._sort_user = params.get('format_sort', [])
1786             self._sort_extractor = sort_extractor
1787
1788             def add_item(field, reverse, closest, limit_text):
1789                 field = field.lower()
1790                 if field in self._order:
1791                     return
1792                 self._order.append(field)
1793                 limit = self._resolve_field_value(field, limit_text)
1794                 data = {
1795                     'reverse': reverse,
1796                     'closest': False if limit is None else closest,
1797                     'limit_text': limit_text,
1798                     'limit': limit}
1799                 if field in self.settings:
1800                     self.settings[field].update(data)
1801                 else:
1802                     self.settings[field] = data
1803
1804             sort_list = (
1805                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1806                 + (tuple() if params.get('format_sort_force', False)
1807                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1808                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1809
1810             for item in sort_list:
1811                 match = re.match(self.regex, item)
1812                 if match is None:
1813                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1814                 field = match.group('field')
1815                 if field is None:
1816                     continue
1817                 if self._get_field_setting(field, 'type') == 'alias':
1818                     alias, field = field, self._get_field_setting(field, 'field')
1819                     if self._get_field_setting(alias, 'deprecated'):
1820                         self.ydl.deprecation_warning(
1821                             f'Format sorting alias {alias} is deprecated '
1822                             f'and may be removed in a future version. Please use {field} instead')
1823                 reverse = match.group('reverse') is not None
1824                 closest = match.group('separator') == '~'
1825                 limit_text = match.group('limit')
1826
1827                 has_limit = limit_text is not None
1828                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1829                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1830
1831                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1832                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1833                 limit_count = len(limits)
1834                 for (i, f) in enumerate(fields):
1835                     add_item(f, reverse, closest,
1836                              limits[i] if i < limit_count
1837                              else limits[0] if has_limit and not has_multiple_limits
1838                              else None)
1839
1840         def print_verbose_info(self, write_debug):
1841             if self._sort_user:
1842                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1843             if self._sort_extractor:
1844                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1845             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1846                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1847                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1848                               self._get_field_setting(field, 'limit_text'),
1849                               self._get_field_setting(field, 'limit'))
1850                 if self._get_field_setting(field, 'limit_text') is not None else '')
1851                 for field in self._order if self._get_field_setting(field, 'visible')]))
1852
1853         def _calculate_field_preference_from_value(self, format, field, type, value):
1854             reverse = self._get_field_setting(field, 'reverse')
1855             closest = self._get_field_setting(field, 'closest')
1856             limit = self._get_field_setting(field, 'limit')
1857
1858             if type == 'extractor':
1859                 maximum = self._get_field_setting(field, 'max')
1860                 if value is None or (maximum is not None and value >= maximum):
1861                     value = -1
1862             elif type == 'boolean':
1863                 in_list = self._get_field_setting(field, 'in_list')
1864                 not_in_list = self._get_field_setting(field, 'not_in_list')
1865                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1866             elif type == 'ordered':
1867                 value = self._resolve_field_value(field, value, True)
1868
1869             # try to convert to number
1870             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1871             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1872             if is_num:
1873                 value = val_num
1874
1875             return ((-10, 0) if value is None
1876                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1877                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1878                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1879                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1880                     else (-1, value, 0))
1881
1882         def _calculate_field_preference(self, format, field):
1883             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1884             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1885             if type == 'multiple':
1886                 type = 'field'  # Only 'field' is allowed in multiple for now
1887                 actual_fields = self._get_field_setting(field, 'field')
1888
1889                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1890             else:
1891                 value = get_value(field)
1892             return self._calculate_field_preference_from_value(format, field, type, value)
1893
1894         def calculate_preference(self, format):
1895             # Determine missing protocol
1896             if not format.get('protocol'):
1897                 format['protocol'] = determine_protocol(format)
1898
1899             # Determine missing ext
1900             if not format.get('ext') and 'url' in format:
1901                 format['ext'] = determine_ext(format['url'])
1902             if format.get('vcodec') == 'none':
1903                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1904                 format['video_ext'] = 'none'
1905             else:
1906                 format['video_ext'] = format['ext']
1907                 format['audio_ext'] = 'none'
1908             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1909             #    format['preference'] = -1000
1910
1911             # Determine missing bitrates
1912             if format.get('tbr') is None:
1913                 if format.get('vbr') is not None and format.get('abr') is not None:
1914                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1915             else:
1916                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1917                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1918                 if format.get('acodec') != 'none' and format.get('abr') is None:
1919                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1920
1921             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1922
1923     def _sort_formats(self, formats, field_preference=[]):
1924         if not formats:
1925             return
1926         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1927
1928     def _check_formats(self, formats, video_id):
1929         if formats:
1930             formats[:] = filter(
1931                 lambda f: self._is_valid_url(
1932                     f['url'], video_id,
1933                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1934                 formats)
1935
1936     @staticmethod
1937     def _remove_duplicate_formats(formats):
1938         format_urls = set()
1939         unique_formats = []
1940         for f in formats:
1941             if f['url'] not in format_urls:
1942                 format_urls.add(f['url'])
1943                 unique_formats.append(f)
1944         formats[:] = unique_formats
1945
1946     def _is_valid_url(self, url, video_id, item='video', headers={}):
1947         url = self._proto_relative_url(url, scheme='http:')
1948         # For now assume non HTTP(S) URLs always valid
1949         if not (url.startswith('http://') or url.startswith('https://')):
1950             return True
1951         try:
1952             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1953             return True
1954         except ExtractorError as e:
1955             self.to_screen(
1956                 '%s: %s URL is invalid, skipping: %s'
1957                 % (video_id, item, error_to_compat_str(e.cause)))
1958             return False
1959
1960     def http_scheme(self):
1961         """ Either "http:" or "https:", depending on the user's preferences """
1962         return (
1963             'http:'
1964             if self.get_param('prefer_insecure', False)
1965             else 'https:')
1966
1967     def _proto_relative_url(self, url, scheme=None):
1968         if url is None:
1969             return url
1970         if url.startswith('//'):
1971             if scheme is None:
1972                 scheme = self.http_scheme()
1973             return scheme + url
1974         else:
1975             return url
1976
1977     def _sleep(self, timeout, video_id, msg_template=None):
1978         if msg_template is None:
1979             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1980         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1981         self.to_screen(msg)
1982         time.sleep(timeout)
1983
1984     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1985                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1986                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1987         res = self._download_xml_handle(
1988             manifest_url, video_id, 'Downloading f4m manifest',
1989             'Unable to download f4m manifest',
1990             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1991             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1992             transform_source=transform_source,
1993             fatal=fatal, data=data, headers=headers, query=query)
1994         if res is False:
1995             return []
1996
1997         manifest, urlh = res
1998         manifest_url = urlh.geturl()
1999
2000         return self._parse_f4m_formats(
2001             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2002             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2003
2004     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2005                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2006                            fatal=True, m3u8_id=None):
2007         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2008             return []
2009
2010         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2011         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2012         if akamai_pv is not None and ';' in akamai_pv.text:
2013             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2014             if playerVerificationChallenge.strip() != '':
2015                 return []
2016
2017         formats = []
2018         manifest_version = '1.0'
2019         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2020         if not media_nodes:
2021             manifest_version = '2.0'
2022             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2023         # Remove unsupported DRM protected media from final formats
2024         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2025         media_nodes = remove_encrypted_media(media_nodes)
2026         if not media_nodes:
2027             return formats
2028
2029         manifest_base_url = get_base_url(manifest)
2030
2031         bootstrap_info = xpath_element(
2032             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2033             'bootstrap info', default=None)
2034
2035         vcodec = None
2036         mime_type = xpath_text(
2037             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2038             'base URL', default=None)
2039         if mime_type and mime_type.startswith('audio/'):
2040             vcodec = 'none'
2041
2042         for i, media_el in enumerate(media_nodes):
2043             tbr = int_or_none(media_el.attrib.get('bitrate'))
2044             width = int_or_none(media_el.attrib.get('width'))
2045             height = int_or_none(media_el.attrib.get('height'))
2046             format_id = join_nonempty(f4m_id, tbr or i)
2047             # If <bootstrapInfo> is present, the specified f4m is a
2048             # stream-level manifest, and only set-level manifests may refer to
2049             # external resources.  See section 11.4 and section 4 of F4M spec
2050             if bootstrap_info is None:
2051                 media_url = None
2052                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2053                 if manifest_version == '2.0':
2054                     media_url = media_el.attrib.get('href')
2055                 if media_url is None:
2056                     media_url = media_el.attrib.get('url')
2057                 if not media_url:
2058                     continue
2059                 manifest_url = (
2060                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2061                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2062                 # If media_url is itself a f4m manifest do the recursive extraction
2063                 # since bitrates in parent manifest (this one) and media_url manifest
2064                 # may differ leading to inability to resolve the format by requested
2065                 # bitrate in f4m downloader
2066                 ext = determine_ext(manifest_url)
2067                 if ext == 'f4m':
2068                     f4m_formats = self._extract_f4m_formats(
2069                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2070                         transform_source=transform_source, fatal=fatal)
2071                     # Sometimes stream-level manifest contains single media entry that
2072                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2073                     # At the same time parent's media entry in set-level manifest may
2074                     # contain it. We will copy it from parent in such cases.
2075                     if len(f4m_formats) == 1:
2076                         f = f4m_formats[0]
2077                         f.update({
2078                             'tbr': f.get('tbr') or tbr,
2079                             'width': f.get('width') or width,
2080                             'height': f.get('height') or height,
2081                             'format_id': f.get('format_id') if not tbr else format_id,
2082                             'vcodec': vcodec,
2083                         })
2084                     formats.extend(f4m_formats)
2085                     continue
2086                 elif ext == 'm3u8':
2087                     formats.extend(self._extract_m3u8_formats(
2088                         manifest_url, video_id, 'mp4', preference=preference,
2089                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2090                     continue
2091             formats.append({
2092                 'format_id': format_id,
2093                 'url': manifest_url,
2094                 'manifest_url': manifest_url,
2095                 'ext': 'flv' if bootstrap_info is not None else None,
2096                 'protocol': 'f4m',
2097                 'tbr': tbr,
2098                 'width': width,
2099                 'height': height,
2100                 'vcodec': vcodec,
2101                 'preference': preference,
2102                 'quality': quality,
2103             })
2104         return formats
2105
2106     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2107         return {
2108             'format_id': join_nonempty(m3u8_id, 'meta'),
2109             'url': m3u8_url,
2110             'ext': ext,
2111             'protocol': 'm3u8',
2112             'preference': preference - 100 if preference else -100,
2113             'quality': quality,
2114             'resolution': 'multiple',
2115             'format_note': 'Quality selection URL',
2116         }
2117
2118     def _report_ignoring_subs(self, name):
2119         self.report_warning(bug_reports_message(
2120             f'Ignoring subtitle tracks found in the {name} manifest; '
2121             'if any subtitle tracks are missing,'
2122         ), only_once=True)
2123
2124     def _extract_m3u8_formats(self, *args, **kwargs):
2125         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2126         if subs:
2127             self._report_ignoring_subs('HLS')
2128         return fmts
2129
2130     def _extract_m3u8_formats_and_subtitles(
2131             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2132             preference=None, quality=None, m3u8_id=None, note=None,
2133             errnote=None, fatal=True, live=False, data=None, headers={},
2134             query={}):
2135
2136         res = self._download_webpage_handle(
2137             m3u8_url, video_id,
2138             note='Downloading m3u8 information' if note is None else note,
2139             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2140             fatal=fatal, data=data, headers=headers, query=query)
2141
2142         if res is False:
2143             return [], {}
2144
2145         m3u8_doc, urlh = res
2146         m3u8_url = urlh.geturl()
2147
2148         return self._parse_m3u8_formats_and_subtitles(
2149             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2150             preference=preference, quality=quality, m3u8_id=m3u8_id,
2151             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2152             headers=headers, query=query, video_id=video_id)
2153
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse the text of an HLS playlist into formats and subtitles.

        Returns a `(formats, subtitles)` tuple.

        A document containing `#EXT-X-TARGETDURATION` is treated as a media
        playlist and returned as is: one format per playlist index and no
        subtitles. Otherwise it is treated as a master playlist: every
        `#EXT-X-MEDIA` line is processed first (VIDEO/AUDIO renditions with a
        URI become formats, SUBTITLES renditions with a URI are collected in
        `subtitles` keyed by language), then every URI line following an
        `#EXT-X-STREAM-INF` tag becomes a format.

        `m3u8_url` is used to resolve relative URIs; without it a media playlist
        is returned as a data URI of the document. `fatal`, `data`, `headers`
        and `video_id` are only used when the `hls_split_discontinuity` param
        is set and sub-playlists have to be downloaded. `note`, `errnote` and
        `query` are accepted but not referenced in this method's body.
        """
        formats, subtitles = [], {}

        # NOTE(review): this is a `re.Match` object (or None), not a bool, and
        # it is stored as such in the 'has_drm' key of media playlist formats
        has_drm = re.search('|'.join([
            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
        ]), m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are kept; anything else is joined onto the playlist URL
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # One index per section delimited by #EXT-X-DISCONTINUITY tags;
                # the playlist is downloaded when its text was not passed in
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # Without splitting there is a single, unnumbered format per playlist
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # Rendition groups by GROUP-ID, filled from the #EXT-X-MEDIA tags
        groups = {}
        # Attributes of the most recent #EXT-X-STREAM-INF tag; reset after each URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is the URI of a variant stream
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Same metadata as the HLS format, but pointing at the
                        # progressive download URL over plain HTTP
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2374
2375     def _extract_m3u8_vod_duration(
2376             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2377
2378         m3u8_vod = self._download_webpage(
2379             m3u8_vod_url, video_id,
2380             note='Downloading m3u8 VOD manifest' if note is None else note,
2381             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2382             fatal=False, data=data, headers=headers, query=query)
2383
2384         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2385
2386     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2387         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2388             return None
2389
2390         return int(sum(
2391             float(line[len('#EXTINF:'):].split(',')[0])
2392             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2393
2394     @staticmethod
2395     def _xpath_ns(path, namespace=None):
2396         if not namespace:
2397             return path
2398         out = []
2399         for c in path.split('/'):
2400             if not c or c == '.':
2401                 out.append(c)
2402             else:
2403                 out.append('{%s}%s' % (namespace, c))
2404         return '/'.join(out)
2405
2406     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2407         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2408         if res is False:
2409             assert not fatal
2410             return [], {}
2411
2412         smil, urlh = res
2413         smil_url = urlh.geturl()
2414
2415         namespace = self._parse_smil_namespace(smil)
2416
2417         fmts = self._parse_smil_formats(
2418             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2419         subs = self._parse_smil_subtitles(
2420             smil, namespace=namespace)
2421
2422         return fmts, subs
2423
2424     def _extract_smil_formats(self, *args, **kwargs):
2425         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2426         if subs:
2427             self._report_ignoring_subs('SMIL')
2428         return fmts
2429
2430     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2431         res = self._download_smil(smil_url, video_id, fatal=fatal)
2432         if res is False:
2433             return {}
2434
2435         smil, urlh = res
2436         smil_url = urlh.geturl()
2437
2438         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2439
2440     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2441         return self._download_xml_handle(
2442             smil_url, video_id, 'Downloading SMIL file',
2443             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2444
2445     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2446         namespace = self._parse_smil_namespace(smil)
2447
2448         formats = self._parse_smil_formats(
2449             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2450         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2451
2452         video_id = os.path.splitext(url_basename(smil_url))[0]
2453         title = None
2454         description = None
2455         upload_date = None
2456         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2457             name = meta.attrib.get('name')
2458             content = meta.attrib.get('content')
2459             if not name or not content:
2460                 continue
2461             if not title and name == 'title':
2462                 title = content
2463             elif not description and name in ('description', 'abstract'):
2464                 description = content
2465             elif not upload_date and name == 'date':
2466                 upload_date = unified_strdate(content)
2467
2468         thumbnails = [{
2469             'id': image.get('type'),
2470             'url': image.get('src'),
2471             'width': int_or_none(image.get('width')),
2472             'height': int_or_none(image.get('height')),
2473         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2474
2475         return {
2476             'id': video_id,
2477             'title': title or video_id,
2478             'description': description,
2479             'upload_date': upload_date,
2480             'thumbnails': thumbnails,
2481             'formats': formats,
2482             'subtitles': subtitles,
2483         }
2484
2485     def _parse_smil_namespace(self, smil):
2486         return self._search_regex(
2487             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2488
2489     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2490         base = smil_url
2491         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2492             b = meta.get('base') or meta.get('httpBase')
2493             if b:
2494                 base = b
2495                 break
2496
2497         formats = []
2498         rtmp_count = 0
2499         http_count = 0
2500         m3u8_count = 0
2501         imgs_count = 0
2502
2503         srcs = set()
2504         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2505         for medium in media:
2506             src = medium.get('src')
2507             if not src or src in srcs:
2508                 continue
2509             srcs.add(src)
2510
2511             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2512             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2513             width = int_or_none(medium.get('width'))
2514             height = int_or_none(medium.get('height'))
2515             proto = medium.get('proto')
2516             ext = medium.get('ext')
2517             src_ext = determine_ext(src)
2518             streamer = medium.get('streamer') or base
2519
2520             if proto == 'rtmp' or streamer.startswith('rtmp'):
2521                 rtmp_count += 1
2522                 formats.append({
2523                     'url': streamer,
2524                     'play_path': src,
2525                     'ext': 'flv',
2526                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2527                     'tbr': bitrate,
2528                     'filesize': filesize,
2529                     'width': width,
2530                     'height': height,
2531                 })
2532                 if transform_rtmp_url:
2533                     streamer, src = transform_rtmp_url(streamer, src)
2534                     formats[-1].update({
2535                         'url': streamer,
2536                         'play_path': src,
2537                     })
2538                 continue
2539
2540             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2541             src_url = src_url.strip()
2542
2543             if proto == 'm3u8' or src_ext == 'm3u8':
2544                 m3u8_formats = self._extract_m3u8_formats(
2545                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2546                 if len(m3u8_formats) == 1:
2547                     m3u8_count += 1
2548                     m3u8_formats[0].update({
2549                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2550                         'tbr': bitrate,
2551                         'width': width,
2552                         'height': height,
2553                     })
2554                 formats.extend(m3u8_formats)
2555             elif src_ext == 'f4m':
2556                 f4m_url = src_url
2557                 if not f4m_params:
2558                     f4m_params = {
2559                         'hdcore': '3.2.0',
2560                         'plugin': 'flowplayer-3.2.0.1',
2561                     }
2562                 f4m_url += '&' if '?' in f4m_url else '?'
2563                 f4m_url += urllib.parse.urlencode(f4m_params)
2564                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2565             elif src_ext == 'mpd':
2566                 formats.extend(self._extract_mpd_formats(
2567                     src_url, video_id, mpd_id='dash', fatal=False))
2568             elif re.search(r'\.ism/[Mm]anifest', src_url):
2569                 formats.extend(self._extract_ism_formats(
2570                     src_url, video_id, ism_id='mss', fatal=False))
2571             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2572                 http_count += 1
2573                 formats.append({
2574                     'url': src_url,
2575                     'ext': ext or src_ext or 'flv',
2576                     'format_id': 'http-%d' % (bitrate or http_count),
2577                     'tbr': bitrate,
2578                     'filesize': filesize,
2579                     'width': width,
2580                     'height': height,
2581                 })
2582
2583         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2584             src = medium.get('src')
2585             if not src or src in srcs:
2586                 continue
2587             srcs.add(src)
2588
2589             imgs_count += 1
2590             formats.append({
2591                 'format_id': 'imagestream-%d' % (imgs_count),
2592                 'url': src,
2593                 'ext': mimetype2ext(medium.get('type')),
2594                 'acodec': 'none',
2595                 'vcodec': 'none',
2596                 'width': int_or_none(medium.get('width')),
2597                 'height': int_or_none(medium.get('height')),
2598                 'format_note': 'SMIL storyboards',
2599             })
2600
2601         return formats
2602
2603     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2604         urls = []
2605         subtitles = {}
2606         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2607             src = textstream.get('src')
2608             if not src or src in urls:
2609                 continue
2610             urls.append(src)
2611             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2612             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2613             subtitles.setdefault(lang, []).append({
2614                 'url': src,
2615                 'ext': ext,
2616             })
2617         return subtitles
2618
2619     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2620         res = self._download_xml_handle(
2621             xspf_url, playlist_id, 'Downloading xpsf playlist',
2622             'Unable to download xspf manifest', fatal=fatal)
2623         if res is False:
2624             return []
2625
2626         xspf, urlh = res
2627         xspf_url = urlh.geturl()
2628
2629         return self._parse_xspf(
2630             xspf, playlist_id, xspf_url=xspf_url,
2631             xspf_base_url=base_url(xspf_url))
2632
2633     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2634         NS_MAP = {
2635             'xspf': 'http://xspf.org/ns/0/',
2636             's1': 'http://static.streamone.nl/player/ns/0',
2637         }
2638
2639         entries = []
2640         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2641             title = xpath_text(
2642                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2643             description = xpath_text(
2644                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2645             thumbnail = xpath_text(
2646                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2647             duration = float_or_none(
2648                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2649
2650             formats = []
2651             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2652                 format_url = urljoin(xspf_base_url, location.text)
2653                 if not format_url:
2654                     continue
2655                 formats.append({
2656                     'url': format_url,
2657                     'manifest_url': xspf_url,
2658                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2659                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2660                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2661                 })
2662             self._sort_formats(formats)
2663
2664             entries.append({
2665                 'id': playlist_id,
2666                 'title': title,
2667                 'description': description,
2668                 'thumbnail': thumbnail,
2669                 'duration': duration,
2670                 'formats': formats,
2671             })
2672         return entries
2673
2674     def _extract_mpd_formats(self, *args, **kwargs):
2675         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2676         if subs:
2677             self._report_ignoring_subs('DASH')
2678         return fmts
2679
2680     def _extract_mpd_formats_and_subtitles(
2681             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2682             fatal=True, data=None, headers={}, query={}):
2683         res = self._download_xml_handle(
2684             mpd_url, video_id,
2685             note='Downloading MPD manifest' if note is None else note,
2686             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2687             fatal=fatal, data=data, headers=headers, query=query)
2688         if res is False:
2689             return [], {}
2690         mpd_doc, urlh = res
2691         if mpd_doc is None:
2692             return [], {}
2693
2694         # We could have been redirected to a new url when we retrieved our mpd file.
2695         mpd_url = urlh.geturl()
2696         mpd_base_url = base_url(mpd_url)
2697
2698         return self._parse_mpd_formats_and_subtitles(
2699             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2700
2701     def _parse_mpd_formats(self, *args, **kwargs):
2702         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2703         if subs:
2704             self._report_ignoring_subs('DASH')
2705         return fmts
2706
2707     def _parse_mpd_formats_and_subtitles(
2708             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2709         """
2710         Parse formats from MPD manifest.
2711         References:
2712          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2713             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2714          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2715         """
2716         if not self.get_param('dynamic_mpd', True):
2717             if mpd_doc.get('type') == 'dynamic':
2718                 return [], {}
2719
2720         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2721
2722         def _add_ns(path):
2723             return self._xpath_ns(path, namespace)
2724
2725         def is_drm_protected(element):
2726             return element.find(_add_ns('ContentProtection')) is not None
2727
2728         def extract_multisegment_info(element, ms_parent_info):
2729             ms_info = ms_parent_info.copy()
2730
2731             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2732             # common attributes and elements.  We will only extract relevant
2733             # for us.
2734             def extract_common(source):
2735                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2736                 if segment_timeline is not None:
2737                     s_e = segment_timeline.findall(_add_ns('S'))
2738                     if s_e:
2739                         ms_info['total_number'] = 0
2740                         ms_info['s'] = []
2741                         for s in s_e:
2742                             r = int(s.get('r', 0))
2743                             ms_info['total_number'] += 1 + r
2744                             ms_info['s'].append({
2745                                 't': int(s.get('t', 0)),
2746                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2747                                 'd': int(s.attrib['d']),
2748                                 'r': r,
2749                             })
2750                 start_number = source.get('startNumber')
2751                 if start_number:
2752                     ms_info['start_number'] = int(start_number)
2753                 timescale = source.get('timescale')
2754                 if timescale:
2755                     ms_info['timescale'] = int(timescale)
2756                 segment_duration = source.get('duration')
2757                 if segment_duration:
2758                     ms_info['segment_duration'] = float(segment_duration)
2759
2760             def extract_Initialization(source):
2761                 initialization = source.find(_add_ns('Initialization'))
2762                 if initialization is not None:
2763                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2764
2765             segment_list = element.find(_add_ns('SegmentList'))
2766             if segment_list is not None:
2767                 extract_common(segment_list)
2768                 extract_Initialization(segment_list)
2769                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2770                 if segment_urls_e:
2771                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2772             else:
2773                 segment_template = element.find(_add_ns('SegmentTemplate'))
2774                 if segment_template is not None:
2775                     extract_common(segment_template)
2776                     media = segment_template.get('media')
2777                     if media:
2778                         ms_info['media'] = media
2779                     initialization = segment_template.get('initialization')
2780                     if initialization:
2781                         ms_info['initialization'] = initialization
2782                     else:
2783                         extract_Initialization(segment_template)
2784             return ms_info
2785
2786         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2787         formats, subtitles = [], {}
2788         stream_numbers = collections.defaultdict(int)
2789         for period in mpd_doc.findall(_add_ns('Period')):
2790             period_duration = parse_duration(period.get('duration')) or mpd_duration
2791             period_ms_info = extract_multisegment_info(period, {
2792                 'start_number': 1,
2793                 'timescale': 1,
2794             })
2795             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2796                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2797                 for representation in adaptation_set.findall(_add_ns('Representation')):
2798                     representation_attrib = adaptation_set.attrib.copy()
2799                     representation_attrib.update(representation.attrib)
2800                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2801                     mime_type = representation_attrib['mimeType']
2802                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2803
2804                     codec_str = representation_attrib.get('codecs', '')
2805                     # Some kind of binary subtitle found in some youtube livestreams
2806                     if mime_type == 'application/x-rawcc':
2807                         codecs = {'scodec': codec_str}
2808                     else:
2809                         codecs = parse_codecs(codec_str)
2810                     if content_type not in ('video', 'audio', 'text'):
2811                         if mime_type == 'image/jpeg':
2812                             content_type = mime_type
2813                         elif codecs.get('vcodec', 'none') != 'none':
2814                             content_type = 'video'
2815                         elif codecs.get('acodec', 'none') != 'none':
2816                             content_type = 'audio'
2817                         elif codecs.get('scodec', 'none') != 'none':
2818                             content_type = 'text'
2819                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2820                             content_type = 'text'
2821                         else:
2822                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2823                             continue
2824
2825                     base_url = ''
2826                     for element in (representation, adaptation_set, period, mpd_doc):
2827                         base_url_e = element.find(_add_ns('BaseURL'))
2828                         if try_call(lambda: base_url_e.text) is not None:
2829                             base_url = base_url_e.text + base_url
2830                             if re.match(r'^https?://', base_url):
2831                                 break
2832                     if mpd_base_url and base_url.startswith('/'):
2833                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2834                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2835                         if not mpd_base_url.endswith('/'):
2836                             mpd_base_url += '/'
2837                         base_url = mpd_base_url + base_url
2838                     representation_id = representation_attrib.get('id')
2839                     lang = representation_attrib.get('lang')
2840                     url_el = representation.find(_add_ns('BaseURL'))
2841                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2842                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2843                     if representation_id is not None:
2844                         format_id = representation_id
2845                     else:
2846                         format_id = content_type
2847                     if mpd_id:
2848                         format_id = mpd_id + '-' + format_id
2849                     if content_type in ('video', 'audio'):
2850                         f = {
2851                             'format_id': format_id,
2852                             'manifest_url': mpd_url,
2853                             'ext': mimetype2ext(mime_type),
2854                             'width': int_or_none(representation_attrib.get('width')),
2855                             'height': int_or_none(representation_attrib.get('height')),
2856                             'tbr': float_or_none(bandwidth, 1000),
2857                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2858                             'fps': int_or_none(representation_attrib.get('frameRate')),
2859                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2860                             'format_note': 'DASH %s' % content_type,
2861                             'filesize': filesize,
2862                             'container': mimetype2ext(mime_type) + '_dash',
2863                             **codecs
2864                         }
2865                     elif content_type == 'text':
2866                         f = {
2867                             'ext': mimetype2ext(mime_type),
2868                             'manifest_url': mpd_url,
2869                             'filesize': filesize,
2870                         }
2871                     elif content_type == 'image/jpeg':
2872                         # See test case in VikiIE
2873                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2874                         f = {
2875                             'format_id': format_id,
2876                             'ext': 'mhtml',
2877                             'manifest_url': mpd_url,
2878                             'format_note': 'DASH storyboards (jpeg)',
2879                             'acodec': 'none',
2880                             'vcodec': 'none',
2881                         }
2882                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2883                         f['has_drm'] = True
2884                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2885
2886                     def prepare_template(template_name, identifiers):
2887                         tmpl = representation_ms_info[template_name]
2888                         # First of, % characters outside $...$ templates
2889                         # must be escaped by doubling for proper processing
2890                         # by % operator string formatting used further (see
2891                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2892                         t = ''
2893                         in_template = False
2894                         for c in tmpl:
2895                             t += c
2896                             if c == '$':
2897                                 in_template = not in_template
2898                             elif c == '%' and not in_template:
2899                                 t += c
2900                         # Next, $...$ templates are translated to their
2901                         # %(...) counterparts to be used with % operator
2902                         if representation_id is not None:
2903                             t = t.replace('$RepresentationID$', representation_id)
2904                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2905                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2906                         t.replace('$$', '$')
2907                         return t
2908
2909                     # @initialization is a regular template like @media one
2910                     # so it should be handled just the same way (see
2911                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2912                     if 'initialization' in representation_ms_info:
2913                         initialization_template = prepare_template(
2914                             'initialization',
2915                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2916                             # $Time$ shall not be included for @initialization thus
2917                             # only $Bandwidth$ remains
2918                             ('Bandwidth', ))
2919                         representation_ms_info['initialization_url'] = initialization_template % {
2920                             'Bandwidth': bandwidth,
2921                         }
2922
2923                     def location_key(location):
2924                         return 'url' if re.match(r'^https?://', location) else 'path'
2925
2926                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2927
2928                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2929                         media_location_key = location_key(media_template)
2930
2931                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2932                         # can't be used at the same time
2933                         if '%(Number' in media_template and 's' not in representation_ms_info:
2934                             segment_duration = None
2935                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2936                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2937                                 representation_ms_info['total_number'] = int(math.ceil(
2938                                     float_or_none(period_duration, segment_duration, default=0)))
2939                             representation_ms_info['fragments'] = [{
2940                                 media_location_key: media_template % {
2941                                     'Number': segment_number,
2942                                     'Bandwidth': bandwidth,
2943                                 },
2944                                 'duration': segment_duration,
2945                             } for segment_number in range(
2946                                 representation_ms_info['start_number'],
2947                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2948                         else:
2949                             # $Number*$ or $Time$ in media template with S list available
2950                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2951                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2952                             representation_ms_info['fragments'] = []
2953                             segment_time = 0
2954                             segment_d = None
2955                             segment_number = representation_ms_info['start_number']
2956
2957                             def add_segment_url():
2958                                 segment_url = media_template % {
2959                                     'Time': segment_time,
2960                                     'Bandwidth': bandwidth,
2961                                     'Number': segment_number,
2962                                 }
2963                                 representation_ms_info['fragments'].append({
2964                                     media_location_key: segment_url,
2965                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2966                                 })
2967
2968                             for num, s in enumerate(representation_ms_info['s']):
2969                                 segment_time = s.get('t') or segment_time
2970                                 segment_d = s['d']
2971                                 add_segment_url()
2972                                 segment_number += 1
2973                                 for r in range(s.get('r', 0)):
2974                                     segment_time += segment_d
2975                                     add_segment_url()
2976                                     segment_number += 1
2977                                 segment_time += segment_d
2978                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2979                         # No media template
2980                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2981                         # or any YouTube dashsegments video
2982                         fragments = []
2983                         segment_index = 0
2984                         timescale = representation_ms_info['timescale']
2985                         for s in representation_ms_info['s']:
2986                             duration = float_or_none(s['d'], timescale)
2987                             for r in range(s.get('r', 0) + 1):
2988                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2989                                 fragments.append({
2990                                     location_key(segment_uri): segment_uri,
2991                                     'duration': duration,
2992                                 })
2993                                 segment_index += 1
2994                         representation_ms_info['fragments'] = fragments
2995                     elif 'segment_urls' in representation_ms_info:
2996                         # Segment URLs with no SegmentTimeline
2997                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2998                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2999                         fragments = []
3000                         segment_duration = float_or_none(
3001                             representation_ms_info['segment_duration'],
3002                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3003                         for segment_url in representation_ms_info['segment_urls']:
3004                             fragment = {
3005                                 location_key(segment_url): segment_url,
3006                             }
3007                             if segment_duration:
3008                                 fragment['duration'] = segment_duration
3009                             fragments.append(fragment)
3010                         representation_ms_info['fragments'] = fragments
3011                     # If there is a fragments key available then we correctly recognized fragmented media.
3012                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3013                     # assumption is not necessarily correct since we may simply have no support for
3014                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3015                     if 'fragments' in representation_ms_info:
3016                         f.update({
3017                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3018                             'url': mpd_url or base_url,
3019                             'fragment_base_url': base_url,
3020                             'fragments': [],
3021                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3022                         })
3023                         if 'initialization_url' in representation_ms_info:
3024                             initialization_url = representation_ms_info['initialization_url']
3025                             if not f.get('url'):
3026                                 f['url'] = initialization_url
3027                             f['fragments'].append({location_key(initialization_url): initialization_url})
3028                         f['fragments'].extend(representation_ms_info['fragments'])
3029                         if not period_duration:
3030                             period_duration = try_get(
3031                                 representation_ms_info,
3032                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3033                     else:
3034                         # Assuming direct URL to unfragmented media.
3035                         f['url'] = base_url
3036                     if content_type in ('video', 'audio', 'image/jpeg'):
3037                         f['manifest_stream_number'] = stream_numbers[f['url']]
3038                         stream_numbers[f['url']] += 1
3039                         formats.append(f)
3040                     elif content_type == 'text':
3041                         subtitles.setdefault(lang or 'und', []).append(f)
3042
3043         return formats, subtitles
3044
3045     def _extract_ism_formats(self, *args, **kwargs):
3046         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3047         if subs:
3048             self._report_ignoring_subs('ISM')
3049         return fmts
3050
3051     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3052         res = self._download_xml_handle(
3053             ism_url, video_id,
3054             note='Downloading ISM manifest' if note is None else note,
3055             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3056             fatal=fatal, data=data, headers=headers, query=query)
3057         if res is False:
3058             return [], {}
3059         ism_doc, urlh = res
3060         if ism_doc is None:
3061             return [], {}
3062
3063         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3064
3065     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3066         """
3067         Parse formats from ISM manifest.
3068         References:
3069          1. [MS-SSTR]: Smooth Streaming Protocol,
3070             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3071         """
3072         if ism_doc.get('IsLive') == 'TRUE':
3073             return [], {}
3074
3075         duration = int(ism_doc.attrib['Duration'])
3076         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3077
3078         formats = []
3079         subtitles = {}
3080         for stream in ism_doc.findall('StreamIndex'):
3081             stream_type = stream.get('Type')
3082             if stream_type not in ('video', 'audio', 'text'):
3083                 continue
3084             url_pattern = stream.attrib['Url']
3085             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3086             stream_name = stream.get('Name')
3087             stream_language = stream.get('Language', 'und')
3088             for track in stream.findall('QualityLevel'):
3089                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3090                 # TODO: add support for WVC1 and WMAP
3091                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3092                     self.report_warning('%s is not a supported codec' % fourcc)
3093                     continue
3094                 tbr = int(track.attrib['Bitrate']) // 1000
3095                 # [1] does not mention Width and Height attributes. However,
3096                 # they're often present while MaxWidth and MaxHeight are
3097                 # missing, so should be used as fallbacks
3098                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3099                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3100                 sampling_rate = int_or_none(track.get('SamplingRate'))
3101
3102                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3103                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3104
3105                 fragments = []
3106                 fragment_ctx = {
3107                     'time': 0,
3108                 }
3109                 stream_fragments = stream.findall('c')
3110                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3111                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3112                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3113                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3114                     if not fragment_ctx['duration']:
3115                         try:
3116                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3117                         except IndexError:
3118                             next_fragment_time = duration
3119                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3120                     for _ in range(fragment_repeat):
3121                         fragments.append({
3122                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3123                             'duration': fragment_ctx['duration'] / stream_timescale,
3124                         })
3125                         fragment_ctx['time'] += fragment_ctx['duration']
3126
3127                 if stream_type == 'text':
3128                     subtitles.setdefault(stream_language, []).append({
3129                         'ext': 'ismt',
3130                         'protocol': 'ism',
3131                         'url': ism_url,
3132                         'manifest_url': ism_url,
3133                         'fragments': fragments,
3134                         '_download_params': {
3135                             'stream_type': stream_type,
3136                             'duration': duration,
3137                             'timescale': stream_timescale,
3138                             'fourcc': fourcc,
3139                             'language': stream_language,
3140                             'codec_private_data': track.get('CodecPrivateData'),
3141                         }
3142                     })
3143                 elif stream_type in ('video', 'audio'):
3144                     formats.append({
3145                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3146                         'url': ism_url,
3147                         'manifest_url': ism_url,
3148                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3149                         'width': width,
3150                         'height': height,
3151                         'tbr': tbr,
3152                         'asr': sampling_rate,
3153                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3154                         'acodec': 'none' if stream_type == 'video' else fourcc,
3155                         'protocol': 'ism',
3156                         'fragments': fragments,
3157                         'has_drm': ism_doc.find('Protection') is not None,
3158                         '_download_params': {
3159                             'stream_type': stream_type,
3160                             'duration': duration,
3161                             'timescale': stream_timescale,
3162                             'width': width or 0,
3163                             'height': height or 0,
3164                             'fourcc': fourcc,
3165                             'language': stream_language,
3166                             'codec_private_data': track.get('CodecPrivateData'),
3167                             'sampling_rate': sampling_rate,
3168                             'channels': int_or_none(track.get('Channels', 2)),
3169                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3170                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3171                         },
3172                     })
3173         return formats, subtitles
3174
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from HTML5-style <video>/<audio> tags in a webpage.

        Besides plain video/audio tags, their amp-* and dl8-* variants are matched.
        For every tag, formats are gathered from its own src attribute and from
        nested <source> tags, and subtitles from nested <track> tags.

        @param base_url              URL that relative media URLs are resolved against;
                                     also set as the Referer header of every format
        @param webpage               HTML to search
        @param video_id              Video id passed on to the manifest extractors
        @param m3u8_id               m3u8_id passed to _extract_m3u8_formats
        @param m3u8_entry_protocol   entry_protocol passed to _extract_m3u8_formats
        @param mpd_id                mpd_id passed to _extract_mpd_formats
        @param preference            preference passed to _extract_m3u8_formats
        @param quality               quality passed to _extract_m3u8_formats
        @returns                     List of dicts with the keys 'formats', 'subtitles'
                                     and 'thumbnail'; tags that yield neither formats
                                     nor subtitles are left out
        """
        def absolute_url(item_url):
            # Resolve a (possibly relative) URL against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a "type" attribute (mimetype with optional codecs parameter)
            # into a partial format dict; empty dict if nothing can be parsed
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats): manifests (m3u8/mpd) are expanded
            # through the respective extractor, anything else is treated as a
            # direct media URL and yields a single format
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no content; pad them with an empty string so that
        # they have the same 4-tuple shape as the tags matched below
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # Media referenced directly by the tag's own src attribute
            src = strip_or_none(media_attributes.get('src'))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # Media referenced by nested <source> tags
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fill in missing dimensions from the labels
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the first label that contains a parsable bitrate
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The fields of the plain-URL format (url, vcodec, ext)
                        # are applied last and so take precedence
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # Subtitles referenced by nested <track> tags; tracks without a
                # kind are accepted as well
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3300
3301     def _extract_akamai_formats(self, *args, **kwargs):
3302         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3303         if subs:
3304             self._report_ignoring_subs('akamai')
3305         return fmts
3306
3307     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3308         signed = 'hdnea=' in manifest_url
3309         if not signed:
3310             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3311             manifest_url = re.sub(
3312                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3313                 '', manifest_url).strip('?')
3314
3315         formats = []
3316         subtitles = {}
3317
3318         hdcore_sign = 'hdcore=3.7.0'
3319         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3320         hds_host = hosts.get('hds')
3321         if hds_host:
3322             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3323         if 'hdcore=' not in f4m_url:
3324             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3325         f4m_formats = self._extract_f4m_formats(
3326             f4m_url, video_id, f4m_id='hds', fatal=False)
3327         for entry in f4m_formats:
3328             entry.update({'extra_param_to_segment_url': hdcore_sign})
3329         formats.extend(f4m_formats)
3330
3331         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3332         hls_host = hosts.get('hls')
3333         if hls_host:
3334             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3335         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3336             m3u8_url, video_id, 'mp4', 'm3u8_native',
3337             m3u8_id='hls', fatal=False)
3338         formats.extend(m3u8_formats)
3339         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3340
3341         http_host = hosts.get('http')
3342         if http_host and m3u8_formats and not signed:
3343             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3344             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3345             qualities_length = len(qualities)
3346             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3347                 i = 0
3348                 for f in m3u8_formats:
3349                     if f['vcodec'] != 'none':
3350                         for protocol in ('http', 'https'):
3351                             http_f = f.copy()
3352                             del http_f['manifest_url']
3353                             http_url = re.sub(
3354                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3355                             http_f.update({
3356                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3357                                 'url': http_url,
3358                                 'protocol': protocol,
3359                             })
3360                             formats.append(http_f)
3361                         i += 1
3362
3363         return formats, subtitles
3364
3365     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3366         query = urllib.parse.urlparse(url).query
3367         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3368         mobj = re.search(
3369             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3370         url_base = mobj.group('url')
3371         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3372         formats = []
3373
3374         def manifest_url(manifest):
3375             m_url = f'{http_base_url}/{manifest}'
3376             if query:
3377                 m_url += '?%s' % query
3378             return m_url
3379
3380         if 'm3u8' not in skip_protocols:
3381             formats.extend(self._extract_m3u8_formats(
3382                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3383                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3384         if 'f4m' not in skip_protocols:
3385             formats.extend(self._extract_f4m_formats(
3386                 manifest_url('manifest.f4m'),
3387                 video_id, f4m_id='hds', fatal=False))
3388         if 'dash' not in skip_protocols:
3389             formats.extend(self._extract_mpd_formats(
3390                 manifest_url('manifest.mpd'),
3391                 video_id, mpd_id='dash', fatal=False))
3392         if re.search(r'(?:/smil:|\.smil)', url_base):
3393             if 'smil' not in skip_protocols:
3394                 rtmp_formats = self._extract_smil_formats(
3395                     manifest_url('jwplayer.smil'),
3396                     video_id, fatal=False)
3397                 for rtmp_format in rtmp_formats:
3398                     rtsp_format = rtmp_format.copy()
3399                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3400                     del rtsp_format['play_path']
3401                     del rtsp_format['ext']
3402                     rtsp_format.update({
3403                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3404                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3405                         'protocol': 'rtsp',
3406                     })
3407                     formats.extend([rtmp_format, rtsp_format])
3408         else:
3409             for protocol in ('rtmp', 'rtsp'):
3410                 if protocol not in skip_protocols:
3411                     formats.append({
3412                         'url': f'{protocol}:{url_base}',
3413                         'format_id': protocol,
3414                         'protocol': protocol,
3415                     })
3416         return formats
3417
3418     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3419         mobj = re.search(
3420             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3421             webpage)
3422         if mobj:
3423             try:
3424                 jwplayer_data = self._parse_json(mobj.group('options'),
3425                                                  video_id=video_id,
3426                                                  transform_source=transform_source)
3427             except ExtractorError:
3428                 pass
3429             else:
3430                 if isinstance(jwplayer_data, dict):
3431                     return jwplayer_data
3432
3433     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3434         jwplayer_data = self._find_jwplayer_data(
3435             webpage, video_id, transform_source=js_to_json)
3436         return self._parse_jwplayer_data(
3437             jwplayer_data, video_id, *args, **kwargs)
3438
3439     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3440                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3441         # JWPlayer backward compatibility: flattened playlists
3442         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3443         if 'playlist' not in jwplayer_data:
3444             jwplayer_data = {'playlist': [jwplayer_data]}
3445
3446         entries = []
3447
3448         # JWPlayer backward compatibility: single playlist item
3449         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3450         if not isinstance(jwplayer_data['playlist'], list):
3451             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3452
3453         for video_data in jwplayer_data['playlist']:
3454             # JWPlayer backward compatibility: flattened sources
3455             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3456             if 'sources' not in video_data:
3457                 video_data['sources'] = [video_data]
3458
3459             this_video_id = video_id or video_data['mediaid']
3460
3461             formats = self._parse_jwplayer_formats(
3462                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3463                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3464
3465             subtitles = {}
3466             tracks = video_data.get('tracks')
3467             if tracks and isinstance(tracks, list):
3468                 for track in tracks:
3469                     if not isinstance(track, dict):
3470                         continue
3471                     track_kind = track.get('kind')
3472                     if not track_kind or not isinstance(track_kind, str):
3473                         continue
3474                     if track_kind.lower() not in ('captions', 'subtitles'):
3475                         continue
3476                     track_url = urljoin(base_url, track.get('file'))
3477                     if not track_url:
3478                         continue
3479                     subtitles.setdefault(track.get('label') or 'en', []).append({
3480                         'url': self._proto_relative_url(track_url)
3481                     })
3482
3483             entry = {
3484                 'id': this_video_id,
3485                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3486                 'description': clean_html(video_data.get('description')),
3487                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3488                 'timestamp': int_or_none(video_data.get('pubdate')),
3489                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3490                 'subtitles': subtitles,
3491             }
3492             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3493             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3494                 entry.update({
3495                     '_type': 'url_transparent',
3496                     'url': formats[0]['url'],
3497                 })
3498             else:
3499                 self._sort_formats(formats)
3500                 entry['formats'] = formats
3501             entries.append(entry)
3502         if len(entries) == 1:
3503             return entries[0]
3504         else:
3505             return self.playlist_result(entries)
3506
3507     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3508                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3509         urls = []
3510         formats = []
3511         for source in jwplayer_sources_data:
3512             if not isinstance(source, dict):
3513                 continue
3514             source_url = urljoin(
3515                 base_url, self._proto_relative_url(source.get('file')))
3516             if not source_url or source_url in urls:
3517                 continue
3518             urls.append(source_url)
3519             source_type = source.get('type') or ''
3520             ext = mimetype2ext(source_type) or determine_ext(source_url)
3521             if source_type == 'hls' or ext == 'm3u8':
3522                 formats.extend(self._extract_m3u8_formats(
3523                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3524                     m3u8_id=m3u8_id, fatal=False))
3525             elif source_type == 'dash' or ext == 'mpd':
3526                 formats.extend(self._extract_mpd_formats(
3527                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3528             elif ext == 'smil':
3529                 formats.extend(self._extract_smil_formats(
3530                     source_url, video_id, fatal=False))
3531             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3532             elif source_type.startswith('audio') or ext in (
3533                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3534                 formats.append({
3535                     'url': source_url,
3536                     'vcodec': 'none',
3537                     'ext': ext,
3538                 })
3539             else:
3540                 height = int_or_none(source.get('height'))
3541                 if height is None:
3542                     # Often no height is provided but there is a label in
3543                     # format like "1080p", "720p SD", or 1080.
3544                     height = int_or_none(self._search_regex(
3545                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3546                         'height', default=None))
3547                 a_format = {
3548                     'url': source_url,
3549                     'width': int_or_none(source.get('width')),
3550                     'height': height,
3551                     'tbr': int_or_none(source.get('bitrate')),
3552                     'ext': ext,
3553                 }
3554                 if source_url.startswith('rtmp'):
3555                     a_format['ext'] = 'flv'
3556                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3557                     # of jwplayer.flash.swf
3558                     rtmp_url_parts = re.split(
3559                         r'((?:mp4|mp3|flv):)', source_url, 1)
3560                     if len(rtmp_url_parts) == 3:
3561                         rtmp_url, prefix, play_path = rtmp_url_parts
3562                         a_format.update({
3563                             'url': rtmp_url,
3564                             'play_path': prefix + play_path,
3565                         })
3566                     if rtmp_params:
3567                         a_format.update(rtmp_params)
3568                 formats.append(a_format)
3569         return formats
3570
3571     def _live_title(self, name):
3572         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3573         return name
3574
3575     def _int(self, v, name, fatal=False, **kwargs):
3576         res = int_or_none(v, **kwargs)
3577         if res is None:
3578             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3579             if fatal:
3580                 raise ExtractorError(msg)
3581             else:
3582                 self.report_warning(msg)
3583         return res
3584
3585     def _float(self, v, name, fatal=False, **kwargs):
3586         res = float_or_none(v, **kwargs)
3587         if res is None:
3588             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3589             if fatal:
3590                 raise ExtractorError(msg)
3591             else:
3592                 self.report_warning(msg)
3593         return res
3594
3595     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3596                     path='/', secure=False, discard=False, rest={}, **kwargs):
3597         cookie = http.cookiejar.Cookie(
3598             0, name, value, port, port is not None, domain, True,
3599             domain.startswith('.'), path, True, secure, expire_time,
3600             discard, None, None, rest)
3601         self.cookiejar.set_cookie(cookie)
3602
3603     def _get_cookies(self, url):
3604         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3605         return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
3606
3607     def _apply_first_set_cookie_header(self, url_handle, cookie):
3608         """
3609         Apply first Set-Cookie header instead of the last. Experimental.
3610
3611         Some sites (e.g. [1-3]) may serve two cookies under the same name
3612         in Set-Cookie header and expect the first (old) one to be set rather
3613         than second (new). However, as of RFC6265 the newer one cookie
3614         should be set into cookie store what actually happens.
3615         We will workaround this issue by resetting the cookie to
3616         the first one manually.
3617         1. https://new.vk.com/
3618         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3619         3. https://learning.oreilly.com/
3620         """
3621         for header, cookies in url_handle.headers.items():
3622             if header.lower() != 'set-cookie':
3623                 continue
3624             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3625             cookie_value = re.search(
3626                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3627             if cookie_value:
3628                 value, domain = cookie_value.groups()
3629                 self._set_cookie(domain, cookie, value)
3630                 break
3631
3632     @classmethod
3633     def get_testcases(cls, include_onlymatching=False):
3634         t = getattr(cls, '_TEST', None)
3635         if t:
3636             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3637             tests = [t]
3638         else:
3639             tests = getattr(cls, '_TESTS', [])
3640         for t in tests:
3641             if not include_onlymatching and t.get('only_matching', False):
3642                 continue
3643             t['name'] = cls.ie_key()
3644             yield t
3645
3646     @classproperty
3647     def age_limit(cls):
3648         """Get age limit from the testcases"""
3649         return max(traverse_obj(
3650             tuple(cls.get_testcases(include_onlymatching=False)),
3651             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3652
3653     @classmethod
3654     def is_suitable(cls, age_limit):
3655         """Test whether the extractor is generally suitable for the given age limit"""
3656         return not age_restricted(cls.age_limit, age_limit)
3657
3658     @classmethod
3659     def description(cls, *, markdown=True, search_examples=None):
3660         """Description of the extractor"""
3661         desc = ''
3662         if cls._NETRC_MACHINE:
3663             if markdown:
3664                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3665             else:
3666                 desc += f' [{cls._NETRC_MACHINE}]'
3667         if cls.IE_DESC is False:
3668             desc += ' [HIDDEN]'
3669         elif cls.IE_DESC:
3670             desc += f' {cls.IE_DESC}'
3671         if cls.SEARCH_KEY:
3672             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3673             if search_examples:
3674                 _COUNTS = ('', '5', '10', 'all')
3675                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3676         if not cls.working():
3677             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3678
3679         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3680         return f'{name}:{desc}' if desc else name
3681
3682     def extract_subtitles(self, *args, **kwargs):
3683         if (self.get_param('writesubtitles', False)
3684                 or self.get_param('listsubtitles')):
3685             return self._get_subtitles(*args, **kwargs)
3686         return {}
3687
3688     def _get_subtitles(self, *args, **kwargs):
3689         raise NotImplementedError('This method must be implemented by subclasses')
3690
3691     def extract_comments(self, *args, **kwargs):
3692         if not self.get_param('getcomments'):
3693             return None
3694         generator = self._get_comments(*args, **kwargs)
3695
3696         def extractor():
3697             comments = []
3698             interrupted = True
3699             try:
3700                 while True:
3701                     comments.append(next(generator))
3702             except StopIteration:
3703                 interrupted = False
3704             except KeyboardInterrupt:
3705                 self.to_screen('Interrupted by user')
3706             except Exception as e:
3707                 if self.get_param('ignoreerrors') is not True:
3708                     raise
3709                 self._downloader.report_error(e)
3710             comment_count = len(comments)
3711             self.to_screen(f'Extracted {comment_count} comments')
3712             return {
3713                 'comments': comments,
3714                 'comment_count': None if interrupted else comment_count
3715             }
3716         return extractor
3717
3718     def _get_comments(self, *args, **kwargs):
3719         raise NotImplementedError('This method must be implemented by subclasses')
3720
3721     @staticmethod
3722     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3723         """ Merge subtitle items for one language. Items with duplicated URLs/data
3724         will be dropped. """
3725         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3726         ret = list(subtitle_list1)
3727         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3728         return ret
3729
3730     @classmethod
3731     def _merge_subtitles(cls, *dicts, target=None):
3732         """ Merge subtitle dictionaries, language by language. """
3733         if target is None:
3734             target = {}
3735         for d in dicts:
3736             for lang, subs in d.items():
3737                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3738         return target
3739
3740     def extract_automatic_captions(self, *args, **kwargs):
3741         if (self.get_param('writeautomaticsub', False)
3742                 or self.get_param('listsubtitles')):
3743             return self._get_automatic_captions(*args, **kwargs)
3744         return {}
3745
3746     def _get_automatic_captions(self, *args, **kwargs):
3747         raise NotImplementedError('This method must be implemented by subclasses')
3748
3749     @functools.cached_property
3750     def _cookies_passed(self):
3751         """Whether cookies have been passed to YoutubeDL"""
3752         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3753
3754     def mark_watched(self, *args, **kwargs):
3755         if not self.get_param('mark_watched', False):
3756             return
3757         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3758             self._mark_watched(*args, **kwargs)
3759
3760     def _mark_watched(self, *args, **kwargs):
3761         raise NotImplementedError('This method must be implemented by subclasses')
3762
3763     def geo_verification_headers(self):
3764         headers = {}
3765         geo_verification_proxy = self.get_param('geo_verification_proxy')
3766         if geo_verification_proxy:
3767             headers['Ytdl-request-proxy'] = geo_verification_proxy
3768         return headers
3769
3770     def _generic_id(self, url):
3771         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3772
3773     def _generic_title(self, url):
3774         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3775
3776     @staticmethod
3777     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3778         all_known = all(map(
3779             lambda x: x is not None,
3780             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3781         return (
3782             'private' if is_private
3783             else 'premium_only' if needs_premium
3784             else 'subscriber_only' if needs_subscription
3785             else 'needs_auth' if needs_auth
3786             else 'unlisted' if is_unlisted
3787             else 'public' if all_known
3788             else None)
3789
3790     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3791         '''
3792         @returns            A list of values for the extractor argument given by "key"
3793                             or "default" if no such key is present
3794         @param default      The default value to return when the key is not present (default: [])
3795         @param casesense    When false, the values are converted to lower case
3796         '''
3797         val = traverse_obj(
3798             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3799         if val is None:
3800             return [] if default is NO_DEFAULT else default
3801         return list(val) if casesense else [x.lower() for x in val]
3802
3803     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3804         if not playlist_id or not video_id:
3805             return not video_id
3806
3807         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3808         if no_playlist is not None:
3809             return not no_playlist
3810
3811         video_id = '' if video_id is True else f' {video_id}'
3812         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3813         if self.get_param('noplaylist'):
3814             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3815             return False
3816         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3817         return True
3818
3819
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Return the regex for "<_SEARCH_KEY><prefix>:<query>" pseudo-URLs,
        where the prefix is empty, a positive integer or "all\""""
        search_key = cls._SEARCH_KEY
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % search_key

    def _real_extract(self, query):
        """Translate the prefix into a result count and fetch that many results.

        An empty prefix means one result, "all" means _MAX_RESULTS, and a number
        is capped at _MAX_RESULTS (with a warning).
        """
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # An infinite count means "no limit" for islice
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError(
            'This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        """Public read-only alias of _SEARCH_KEY"""
        return cls._SEARCH_KEY