yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import hashlib
   4 import itertools
   5 import json
   6 import math
   7 import netrc
   8 import os
   9 import random
  10 import sys
  11 import time
  12 import xml.etree.ElementTree
  13
  14 from ..compat import functools, re  # isort: split
  15 from ..compat import (
  16     compat_cookiejar_Cookie,
  17     compat_cookies_SimpleCookie,
  18     compat_etree_fromstring,
  19     compat_expanduser,
  20     compat_getpass,
  21     compat_http_client,
  22     compat_os_name,
  23     compat_str,
  24     compat_urllib_error,
  25     compat_urllib_parse_unquote,
  26     compat_urllib_parse_urlencode,
  27     compat_urllib_request,
  28     compat_urlparse,
  29 )
  30 from ..downloader import FileDownloader
  31 from ..downloader.f4m import get_base_url, remove_encrypted_media
  32 from ..utils import (
  33     JSON_LD_RE,
  34     NO_DEFAULT,
  35     ExtractorError,
  36     GeoRestrictedError,
  37     GeoUtils,
  38     LenientJSONDecoder,
  39     RegexNotFoundError,
  40     UnsupportedError,
  41     age_restricted,
  42     base_url,
  43     bug_reports_message,
  44     classproperty,
  45     clean_html,
  46     determine_ext,
  47     determine_protocol,
  48     dict_get,
  49     encode_data_uri,
  50     error_to_compat_str,
  51     extract_attributes,
  52     filter_dict,
  53     fix_xml_ampersands,
  54     float_or_none,
  55     format_field,
  56     int_or_none,
  57     join_nonempty,
  58     js_to_json,
  59     mimetype2ext,
  60     network_exceptions,
  61     orderedSet,
  62     parse_bitrate,
  63     parse_codecs,
  64     parse_duration,
  65     parse_iso8601,
  66     parse_m3u8_attributes,
  67     parse_resolution,
  68     sanitize_filename,
  69     sanitized_Request,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     traverse_obj,
  74     try_get,
  75     unescapeHTML,
  76     unified_strdate,
  77     unified_timestamp,
  78     update_Request,
  79     update_url_query,
  80     url_basename,
  81     url_or_none,
  82     urljoin,
  83     variadic,
  84     xpath_element,
  85     xpath_text,
  86     xpath_with_ns,
  87 )
  88
  89
  90 class InfoExtractor:
  91     """Information Extractor class.
  92
  93     Information extractors are the classes that, given a URL, extract
  94     information about the video (or videos) the URL refers to. This
  95     information includes the real video URL, the video title, author and
  96     others. The information is stored in a dictionary which is then
  97     passed to the YoutubeDL. The YoutubeDL processes this
  98     information possibly downloading the video to the file system, among
  99     other possible outcomes.
 100
 101     The type field determines the type of the result.
 102     By far the most common value (and the default if _type is missing) is
 103     "video", which indicates a single video.
 104
 105     For a video, the dictionaries must include the following fields:
 106
 107     id:             Video identifier.
 108     title:          Video title, unescaped. Set to an empty string if video has
 109                     no title as opposed to "None" which signifies that the
 110                     extractor failed to obtain a title
 111
 112     Additionally, it must contain either a formats entry or a url one:
 113
 114     formats:        A list of dictionaries for each format available, ordered
 115                     from worst to best quality.
 116
 117                     Potential fields:
 118                     * url        The mandatory URL representing the media:
 119                                    for plain file media - HTTP URL of this file,
 120                                    for RTMP - RTMP URL,
 121                                    for HLS - URL of the M3U8 media playlist,
 122                                    for HDS - URL of the F4M manifest,
 123                                    for DASH
 124                                      - HTTP URL to plain file media (in case of
 125                                        unfragmented media)
 126                                      - URL of the MPD manifest or base URL
 127                                        representing the media if MPD manifest
 128                                        is parsed from a string (in case of
 129                                        fragmented media)
 130                                    for MSS - URL of the ISM manifest.
 131                     * manifest_url
 132                                  The URL of the manifest file in case of
 133                                  fragmented media:
 134                                    for HLS - URL of the M3U8 master playlist,
 135                                    for HDS - URL of the F4M manifest,
 136                                    for DASH - URL of the MPD manifest,
 137                                    for MSS - URL of the ISM manifest.
 138                     * manifest_stream_number  (For internal use only)
 139                                  The index of the stream in the manifest file
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
 143                                  Calculated from the format_id, width, height,
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
 153                     * dynamic_range The dynamic range of the video. One of:
 154                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * vbr        Average video bitrate in KBit/s
 160                     * fps        Frame rate
 161                     * vcodec     Name of the video codec in use
 162                     * container  Name of the container format
 163                     * filesize   The number of bytes, if known in advance
 164                     * filesize_approx  An estimate for the number of bytes
 165                     * player_url SWF Player URL (used for rtmpdump).
 166                     * protocol   The protocol that will be used for the actual
 167                                  download, lower-case. One of "http", "https" or
 168                                  one of the protocols defined in downloader.PROTOCOL_MAP
 169                     * fragment_base_url
 170                                  Base URL for fragments. Each fragment's path
 171                                  value (if present) will be relative to
 172                                  this URL.
 173                     * fragments  A list of fragments of a fragmented media.
 174                                  Each fragment entry must contain either an url
 175                                  or a path. If an url is present it should be
 176                                  considered by a client. Otherwise both path and
 177                                  fragment_base_url must be present. Here is
 178                                  the list of all potential fields:
 179                                  * "url" - fragment's URL
 180                                  * "path" - fragment's path relative to
 181                                             fragment_base_url
 182                                  * "duration" (optional, int or float)
 183                                  * "filesize" (optional, int)
 184                     * is_from_start  Is a live format that can be downloaded
 185                                 from the start. Boolean
 186                     * preference Order number of this format. If this field is
 187                                  present and not None, the formats get sorted
 188                                  by this field, regardless of all other values.
 189                                  -1 for default (order by other properties),
 190                                  -2 or smaller for less than default.
 191                                  < -1000 to hide the format (if there is
 192                                     another one which is strictly better)
 193                     * language   Language code, e.g. "de" or "en-US".
 194                     * language_preference  Is this in the language mentioned in
 195                                  the URL?
 196                                  10 if it's what the URL is about,
 197                                  -1 for default (don't know),
 198                                  -10 otherwise, other values reserved for now.
 199                     * quality    Order number of the video quality of this
 200                                  format, irrespective of the file format.
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * source_preference  Order number for this video source
 204                                   (quality takes higher priority)
 205                                  -1 for default (order by other properties),
 206                                  -2 or smaller for less than default.
 207                     * http_headers  A dictionary of additional HTTP headers
 208                                  to add to the request.
 209                     * stretched_ratio  If given and not 1, indicates that the
 210                                  video's pixels are not square.
 211                                  width : height ratio as float.
 212                     * no_resume  The server does not support resuming the
 213                                  (HTTP or RTMP) download. Boolean.
 214                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 215                     * downloader_options  A dictionary of downloader options
 216                                  (For internal use only)
 217                                  * http_chunk_size Chunk size for HTTP downloads
 218                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 219                     RTMP formats can also have the additional fields: page_url,
 220                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 221                     rtmp_protocol, rtmp_real_time
 222
 223     url:            Final video URL.
 224     ext:            Video filename extension.
 225     format:         The video format, defaults to ext (used for --get-format)
 226     player_url:     SWF Player URL (used for rtmpdump).
 227
 228     The following fields are optional:
 229
 230     direct:         True if a direct video file was given (must only be set by GenericIE)
 231     alt_title:      A secondary title of the video.
 232     display_id      An alternative identifier for the video, not necessarily
 233                     unique, but available before title. Typically, id is
 234                     something like "4234987", title "Dancing naked mole rats",
 235                     and display_id "dancing-naked-mole-rats"
 236     thumbnails:     A list of dictionaries, with the following entries:
 237                         * "id" (optional, string) - Thumbnail format ID
 238                         * "url"
 239                         * "preference" (optional, int) - quality of the image
 240                         * "width" (optional, int)
 241                         * "height" (optional, int)
 242                         * "resolution" (optional, string "{width}x{height}",
 243                                         deprecated)
 244                         * "filesize" (optional, int)
 245                         * "http_headers" (dict) - HTTP headers for the request
 246     thumbnail:      Full URL to a video thumbnail image.
 247     description:    Full video description.
 248     uploader:       Full name of the video uploader.
 249     license:        License name the video is licensed under.
 250     creator:        The creator of the video.
 251     timestamp:      UNIX timestamp of the moment the video was uploaded
 252     upload_date:    Video upload date in UTC (YYYYMMDD).
 253                     If not explicitly set, calculated from timestamp
 254     release_timestamp: UNIX timestamp of the moment the video was released.
 255                     If it is not clear whether to use timestamp or this, use the former
 256     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 257                     If not explicitly set, calculated from release_timestamp
 258     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 259     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 260                     If not explicitly set, calculated from modified_timestamp
 261     uploader_id:    Nickname or id of the video uploader.
 262     uploader_url:   Full URL to a personal webpage of the video uploader.
 263     channel:        Full name of the channel the video is uploaded on.
 264                     Note that channel fields may or may not repeat uploader
 265                     fields. This depends on a particular extractor.
 266     channel_id:     Id of the channel.
 267     channel_url:    Full URL to a channel webpage.
 268     channel_follower_count: Number of followers of the channel.
 269     location:       Physical location where the video was filmed.
 270     subtitles:      The available subtitles as a dictionary in the format
 271                     {tag: subformats}. "tag" is usually a language code, and
 272                     "subformats" is a list sorted from lower to higher
 273                     preference, each element is a dictionary with the "ext"
 274                     entry and one of:
 275                         * "data": The subtitles file contents
 276                         * "url": A URL pointing to the subtitles file
 277                     It can optionally also have:
 278                         * "name": Name or description of the subtitles
 279                         * "http_headers": A dictionary of additional HTTP headers
 280                                   to add to the request.
 281                     "ext" will be calculated from URL if missing
 282     automatic_captions: Like 'subtitles'; contains automatically generated
 283                     captions instead of normal subtitles
 284     duration:       Length of the video in seconds, as an integer or float.
 285     view_count:     How many users have watched the video on the platform.
 286     like_count:     Number of positive ratings of the video
 287     dislike_count:  Number of negative ratings of the video
 288     repost_count:   Number of reposts of the video
 289     average_rating: Average rating given by users, the scale used depends on the webpage
 290     comment_count:  Number of comments on the video
 291     comments:       A list of comments, each with one or more of the following
 292                     properties (all but one of text or html optional):
 293                         * "author" - human-readable name of the comment author
 294                         * "author_id" - user ID of the comment author
 295                         * "author_thumbnail" - The thumbnail of the comment author
 296                         * "id" - Comment ID
 297                         * "html" - Comment as HTML
 298                         * "text" - Plain text of the comment
 299                         * "timestamp" - UNIX timestamp of comment
 300                         * "parent" - ID of the comment this one is replying to.
 301                                      Set to "root" to indicate that this is a
 302                                      comment to the original video.
 303                         * "like_count" - Number of positive ratings of the comment
 304                         * "dislike_count" - Number of negative ratings of the comment
 305                         * "is_favorited" - Whether the comment is marked as
 306                                            favorite by the video uploader
 307                         * "author_is_uploader" - Whether the comment is made by
 308                                                  the video uploader
 309     age_limit:      Age restriction for the video, as an integer (years)
 310     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 311                     should allow to get the same result again. (It will be set
 312                     by YoutubeDL if it's missing)
 313     categories:     A list of categories that the video falls in, for example
 314                     ["Sports", "Berlin"]
 315     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 316     cast:           A list of the video cast
 317     is_live:        True, False, or None (=unknown). Whether this video is a
 318                     live stream that goes on instead of a fixed-length video.
 319     was_live:       True, False, or None (=unknown). Whether this video was
 320                     originally a live stream.
 321     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 322                     If absent, automatically set from is_live, was_live
 323     start_time:     Time in seconds where the reproduction should start, as
 324                     specified in the URL.
 325     end_time:       Time in seconds where the reproduction should end, as
 326                     specified in the URL.
 327     chapters:       A list of dictionaries, with the following entries:
 328                         * "start_time" - The start time of the chapter in seconds
 329                         * "end_time" - The end time of the chapter in seconds
 330                         * "title" (optional, string)
 331     playable_in_embed: Whether this video is allowed to play in embedded
 332                     players on other sites. Can be True (=always allowed),
 333                     False (=never allowed), None (=unknown), or a string
 334                     specifying the criteria for embedability (Eg: 'whitelist')
 335     availability:   Under what condition the video is available. One of
 336                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 337                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 338                     to set it
 339     __post_extractor: A function to be called just before the metadata is
 340                     written to either disk, logger or console. The function
 341                     must return a dict which will be added to the info_dict.
 342                     This is useful for additional information that is
 343                     time-consuming to extract. Note that the fields thus
 344                     extracted will not be available to output template and
 345                     match_filter. So, only "comments" and "comment_count" are
 346                     currently allowed to be extracted via this method.
 347
 348     The following fields should only be used when the video belongs to some logical
 349     chapter or section:
 350
 351     chapter:        Name or title of the chapter the video belongs to.
 352     chapter_number: Number of the chapter the video belongs to, as an integer.
 353     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 354
 355     The following fields should only be used when the video is an episode of some
 356     series, programme or podcast:
 357
 358     series:         Title of the series or programme the video episode belongs to.
 359     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 360     season:         Title of the season the video episode belongs to.
 361     season_number:  Number of the season the video episode belongs to, as an integer.
 362     season_id:      Id of the season the video episode belongs to, as a unicode string.
 363     episode:        Title of the video episode. Unlike mandatory video title field,
 364                     this field should denote the exact title of the video episode
 365                     without any kind of decoration.
 366     episode_number: Number of the video episode within a season, as an integer.
 367     episode_id:     Id of the video episode, as a unicode string.
 368
 369     The following fields should only be used when the media is a track or a part of
 370     a music album:
 371
 372     track:          Title of the track.
 373     track_number:   Number of the track within an album or a disc, as an integer.
 374     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 375                     as a unicode string.
 376     artist:         Artist(s) of the track.
 377     genre:          Genre(s) of the track.
 378     album:          Title of the album the track belongs to.
 379     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 380     album_artist:   List of all artists appeared on the album (e.g.
 381                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 382                     and compilations).
 383     disc_number:    Number of the disc or other physical medium the track belongs to,
 384                     as an integer.
 385     release_year:   Year (YYYY) when the album was released.
 386     composer:       Composer of the piece
 387
 388     The following fields should only be set for clips that should be cut from the original video:
 389
 390     section_start:  Start time of the section in seconds
 391     section_end:    End time of the section in seconds
 392
 393     Unless mentioned otherwise, the fields should be Unicode strings.
 394
 395     Unless mentioned otherwise, None is equivalent to absence of information.
 396
 397
 398     _type "playlist" indicates multiple videos.
 399     There must be a key "entries", which is a list, an iterable, or a PagedList
 400     object, each element of which is a valid dictionary by this specification.
 401
 402     Additionally, playlists can have "id", "title", and any other relevant
 403     attributes with the same semantics as videos (see above).
 404
 405     It can also have the following optional fields:
 406
 407     playlist_count: The total number of videos in a playlist. If not given,
 408                     YoutubeDL tries to calculate it from "entries"
 409
 410
 411     _type "multi_video" indicates that there are multiple videos that
 412     form a single show, for example multiple acts of an opera or TV episode.
 413     It must have an entries key like a playlist and contain all the keys
 414     required for a video at the same time.
 415
 416
 417     _type "url" indicates that the video must be extracted from another
 418     location, possibly by a different extractor. Its only required key is:
 419     "url" - the next URL to extract.
 420     The key "ie_key" can be set to the class name (minus the trailing "IE",
 421     e.g. "Youtube") if the extractor class is known in advance.
 422     Additionally, the dictionary may have any properties of the resolved entity
 423     known in advance, for example "title" if the title of the referred video is
 424     known ahead of time.
 425
 426
 427     _type "url_transparent" entities have the same specification as "url", but
 428     indicate that the given additional information is more precise than the one
 429     associated with the resolved URL.
 430     This is useful when a site employs a video service that hosts the video and
 431     its technical metadata, but that video service does not embed a useful
 432     title, description etc.
 433
 434
 435     Subclasses of this should define a _VALID_URL regexp and, re-define the
 436     _real_extract() and (optionally) _real_initialize() methods.
 437     Probably, they should also be added to the list of extractors.
 438
 439     Subclasses may also override suitable() if necessary, but ensure the function
 440     signature is preserved and that this function imports everything it needs
 441     (except other extractors), so that lazy_extractors works correctly.
 442
 443     To support username + password (or netrc) login, the extractor must define a
 444     _NETRC_MACHINE and re-define _perform_login(username, password) and
 445     (optionally) _initialize_pre_login() methods. The _perform_login method will
 446     be called between _initialize_pre_login and _real_initialize if credentials
 447     are passed by the user. In cases where it is necessary to have the login
 448     process as part of the extraction rather than initialization, _perform_login
 449     can be left undefined.
 450
 451     _GEO_BYPASS attribute may be set to False in order to disable
 452     geo restriction bypass mechanisms for a particular extractor.
 453     Though it won't disable explicit geo restriction bypass based on
 454     country code provided with geo_bypass_country.
 455
 456     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 457     countries for this extractor. One of these countries will be used by
 458     geo restriction bypass mechanism right away in order to bypass
 459     geo restriction, of course, if the mechanism is not disabled.
 460
 461     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 462     IP blocks in CIDR notation for this extractor. One of these IP blocks
 463     will be used by geo restriction bypass mechanism similarly
 464     to _GEO_COUNTRIES.
 465
 466     The _WORKING attribute should be set to False for broken IEs
 467     in order to warn the users and skip the tests.
 468     """
 469
    # True once initialize() has finished its one-time setup
    # (pre-login hook, optional login and _real_initialize)
    _ready = False
    # NOTE(review): presumably the YoutubeDL instance assigned through
    # set_downloader() - confirm against that method
    _downloader = None
    # Fake IP reported as X-Forwarded-For for geo bypass;
    # picked by _initialize_geo_bypass()
    _x_forwarded_for_ip = None
    # Set to False to disable geo restriction bypass mechanisms for an extractor
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
    # Must be defined to support username + password (or netrc) login;
    # supports_login() reports its truthiness
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the extractor.
    # initialize() suppresses its "login not supported" warning when this is False
    IE_DESC = None
    # NOTE(review): not used in the visible part of the file - presumably
    # consumed by search extractors; confirm
    SEARCH_KEY = None
 480
 481     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 482         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 483         return {
 484             None: '',
 485             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 486             'password': f'Use {password_hint}',
 487             'cookies': (
 488                 'Use --cookies-from-browser or --cookies for the authentication. '
 489                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 490         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 491
 492     def __init__(self, downloader=None):
 493         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 494         If a downloader is not passed during initialization,
 495         it must be set using "set_downloader()" before "extract()" is called"""
 496         self._ready = False
 497         self._x_forwarded_for_ip = None
 498         self._printed_messages = set()
 499         self.set_downloader(downloader)
 500
 501     @classmethod
 502     def _match_valid_url(cls, url):
 503         # This does not use has/getattr intentionally - we want to know whether
 504         # we have cached the regexp for *this* class, whereas getattr would also
 505         # match the superclass
 506         if '_VALID_URL_RE' not in cls.__dict__:
 507             if '_VALID_URL' not in cls.__dict__:
 508                 cls._VALID_URL = cls._make_valid_url()
 509             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 510         return cls._VALID_URL_RE.match(url)
 511
 512     @classmethod
 513     def suitable(cls, url):
 514         """Receives a URL and returns True if suitable for this IE."""
 515         # This function must import everything it needs (except other extractors),
 516         # so that lazy_extractors works correctly
 517         return cls._match_valid_url(url) is not None
 518
 519     @classmethod
 520     def _match_id(cls, url):
 521         return cls._match_valid_url(url).group('id')
 522
 523     @classmethod
 524     def get_temp_id(cls, url):
 525         try:
 526             return cls._match_id(url)
 527         except (IndexError, AttributeError):
 528             return None
 529
 530     @classmethod
 531     def working(cls):
 532         """Getter method for _WORKING."""
 533         return cls._WORKING
 534
 535     @classmethod
 536     def supports_login(cls):
 537         return bool(cls._NETRC_MACHINE)
 538
 539     def initialize(self):
 540         """Initializes an instance (authentication, etc)."""
 541         self._printed_messages = set()
 542         self._initialize_geo_bypass({
 543             'countries': self._GEO_COUNTRIES,
 544             'ip_blocks': self._GEO_IP_BLOCKS,
 545         })
 546         if not self._ready:
 547             self._initialize_pre_login()
 548             if self.supports_login():
 549                 username, password = self._get_login_info()
 550                 if username:
 551                     self._perform_login(username, password)
 552             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 553                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 554             self._real_initialize()
 555             self._ready = True
 556
 557     def _initialize_geo_bypass(self, geo_bypass_context):
 558         """
 559         Initialize geo restriction bypass mechanism.
 560
 561         This method is used to initialize geo bypass mechanism based on faking
 562         X-Forwarded-For HTTP header. A random country from provided country list
 563         is selected and a random IP belonging to this country is generated. This
 564         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 565         HTTP requests.
 566
 567         This method will be used for initial geo bypass mechanism initialization
 568         during the instance initialization with _GEO_COUNTRIES and
 569         _GEO_IP_BLOCKS.
 570
 571         You may also manually call it from extractor's code if geo bypass
 572         information is not available beforehand (e.g. obtained during
 573         extraction) or due to some other reason. In this case you should pass
 574         this information in geo bypass context passed as first argument. It may
 575         contain following fields:
 576
 577         countries:  List of geo unrestricted countries (similar
 578                     to _GEO_COUNTRIES)
 579         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 580                     (similar to _GEO_IP_BLOCKS)
 581
 582         """
 583         if not self._x_forwarded_for_ip:
 584
 585             # Geo bypass mechanism is explicitly disabled by user
 586             if not self.get_param('geo_bypass', True):
 587                 return
 588
 589             if not geo_bypass_context:
 590                 geo_bypass_context = {}
 591
 592             # Backward compatibility: previously _initialize_geo_bypass
 593             # expected a list of countries, some 3rd party code may still use
 594             # it this way
 595             if isinstance(geo_bypass_context, (list, tuple)):
 596                 geo_bypass_context = {
 597                     'countries': geo_bypass_context,
 598                 }
 599
 600             # The whole point of geo bypass mechanism is to fake IP
 601             # as X-Forwarded-For HTTP header based on some IP block or
 602             # country code.
 603
 604             # Path 1: bypassing based on IP block in CIDR notation
 605
 606             # Explicit IP block specified by user, use it right away
 607             # regardless of whether extractor is geo bypassable or not
 608             ip_block = self.get_param('geo_bypass_ip_block', None)
 609
 610             # Otherwise use random IP block from geo bypass context but only
 611             # if extractor is known as geo bypassable
 612             if not ip_block:
 613                 ip_blocks = geo_bypass_context.get('ip_blocks')
 614                 if self._GEO_BYPASS and ip_blocks:
 615                     ip_block = random.choice(ip_blocks)
 616
 617             if ip_block:
 618                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 619                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 620                 return
 621
 622             # Path 2: bypassing based on country code
 623
 624             # Explicit country code specified by user, use it right away
 625             # regardless of whether extractor is geo bypassable or not
 626             country = self.get_param('geo_bypass_country', None)
 627
 628             # Otherwise use random country code from geo bypass context but
 629             # only if extractor is known as geo bypassable
 630             if not country:
 631                 countries = geo_bypass_context.get('countries')
 632                 if self._GEO_BYPASS and countries:
 633                     country = random.choice(countries)
 634
 635             if country:
 636                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 637                 self._downloader.write_debug(
 638                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 639
 640     def extract(self, url):
 641         """Extracts URL information and returns it in list of dicts."""
 642         try:
 643             for _ in range(2):
 644                 try:
 645                     self.initialize()
 646                     self.write_debug('Extracting URL: %s' % url)
 647                     ie_result = self._real_extract(url)
 648                     if ie_result is None:
 649                         return None
 650                     if self._x_forwarded_for_ip:
 651                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 652                     subtitles = ie_result.get('subtitles')
 653                     if (subtitles and 'live_chat' in subtitles
 654                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 655                         del subtitles['live_chat']
 656                     return ie_result
 657                 except GeoRestrictedError as e:
 658                     if self.__maybe_fake_ip_and_retry(e.countries):
 659                         continue
 660                     raise
 661         except UnsupportedError:
 662             raise
 663         except ExtractorError as e:
 664             kwargs = {
 665                 'video_id': e.video_id or self.get_temp_id(url),
 666                 'ie': self.IE_NAME,
 667                 'tb': e.traceback or sys.exc_info()[2],
 668                 'expected': e.expected,
 669                 'cause': e.cause
 670             }
 671             if hasattr(e, 'countries'):
 672                 kwargs['countries'] = e.countries
 673             raise type(e)(e.orig_msg, **kwargs)
 674         except compat_http_client.IncompleteRead as e:
 675             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 676         except (KeyError, StopIteration) as e:
 677             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 678
 679     def __maybe_fake_ip_and_retry(self, countries):
 680         if (not self.get_param('geo_bypass_country', None)
 681                 and self._GEO_BYPASS
 682                 and self.get_param('geo_bypass', True)
 683                 and not self._x_forwarded_for_ip
 684                 and countries):
 685             country_code = random.choice(countries)
 686             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 687             if self._x_forwarded_for_ip:
 688                 self.report_warning(
 689                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 690                     % (self._x_forwarded_for_ip, country_code.upper()))
 691                 return True
 692         return False
 693
 694     def set_downloader(self, downloader):
 695         """Sets a YoutubeDL instance as the downloader for this IE."""
 696         self._downloader = downloader
 697
 698     def _initialize_pre_login(self):
 699         """ Intialization before login. Redefine in subclasses."""
 700         pass
 701
 702     def _perform_login(self, username, password):
 703         """ Login with username and password. Redefine in subclasses."""
 704         pass
 705
 706     def _real_initialize(self):
 707         """Real initialization process. Redefine in subclasses."""
 708         pass
 709
 710     def _real_extract(self, url):
 711         """Real extraction process. Redefine in subclasses."""
 712         raise NotImplementedError('This method must be implemented by subclasses')
 713
 714     @classmethod
 715     def ie_key(cls):
 716         """A string for getting the InfoExtractor with get_info_extractor"""
 717         return cls.__name__[:-2]
 718
 719     @classproperty
 720     def IE_NAME(cls):
 721         return cls.__name__[:-2]
 722
 723     @staticmethod
 724     def __can_accept_status_code(err, expected_status):
 725         assert isinstance(err, compat_urllib_error.HTTPError)
 726         if expected_status is None:
 727             return False
 728         elif callable(expected_status):
 729             return expected_status(err.code) is True
 730         else:
 731             return err.code in variadic(expected_status)
 732
 733     def _create_request(self, url_or_request, data=None, headers={}, query={}):
 734         if isinstance(url_or_request, compat_urllib_request.Request):
 735             return update_Request(url_or_request, data=data, headers=headers, query=query)
 736         if query:
 737             url_or_request = update_url_query(url_or_request, query)
 738         return sanitized_Request(url_or_request, data, headers)
 739
 740     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 741         """
 742         Return the response handle.
 743
 744         See _download_webpage docstring for arguments specification.
 745         """
 746         if not self._downloader._first_webpage_request:
 747             sleep_interval = self.get_param('sleep_interval_requests') or 0
 748             if sleep_interval > 0:
 749                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 750                 time.sleep(sleep_interval)
 751         else:
 752             self._downloader._first_webpage_request = False
 753
 754         if note is None:
 755             self.report_download_webpage(video_id)
 756         elif note is not False:
 757             if video_id is None:
 758                 self.to_screen(str(note))
 759             else:
 760                 self.to_screen(f'{video_id}: {note}')
 761
 762         # Some sites check X-Forwarded-For HTTP header in order to figure out
 763         # the origin of the client behind proxy. This allows bypassing geo
 764         # restriction by faking this header's value to IP that belongs to some
 765         # geo unrestricted country. We will do so once we encounter any
 766         # geo restriction error.
 767         if self._x_forwarded_for_ip:
 768             if 'X-Forwarded-For' not in headers:
 769                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 770
 771         try:
 772             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 773         except network_exceptions as err:
 774             if isinstance(err, compat_urllib_error.HTTPError):
 775                 if self.__can_accept_status_code(err, expected_status):
 776                     # Retain reference to error to prevent file object from
 777                     # being closed before it can be read. Works around the
 778                     # effects of <https://bugs.python.org/issue15002>
 779                     # introduced in Python 3.4.1.
 780                     err.fp._error = err
 781                     return err.fp
 782
 783             if errnote is False:
 784                 return False
 785             if errnote is None:
 786                 errnote = 'Unable to download webpage'
 787
 788             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 789             if fatal:
 790                 raise ExtractorError(errmsg, cause=err)
 791             else:
 792                 self.report_warning(errmsg)
 793                 return False
 794
 795     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 796                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 797         """
 798         Return a tuple (page content as string, URL handle).
 799
 800         Arguments:
 801         url_or_request -- plain text URL as a string or
 802             a compat_urllib_request.Requestobject
 803         video_id -- Video/playlist/item identifier (string)
 804
 805         Keyword arguments:
 806         note -- note printed before downloading (string)
 807         errnote -- note printed in case of an error (string)
 808         fatal -- flag denoting whether error should be considered fatal,
 809             i.e. whether it should cause ExtractionError to be raised,
 810             otherwise a warning will be reported and extraction continued
 811         encoding -- encoding for a page content decoding, guessed automatically
 812             when not explicitly specified
 813         data -- POST data (bytes)
 814         headers -- HTTP headers (dict)
 815         query -- URL query (dict)
 816         expected_status -- allows to accept failed HTTP requests (non 2xx
 817             status code) by explicitly specifying a set of accepted status
 818             codes. Can be any of the following entities:
 819                 - an integer type specifying an exact failed status code to
 820                   accept
 821                 - a list or a tuple of integer types specifying a list of
 822                   failed status codes to accept
 823                 - a callable accepting an actual failed status code and
 824                   returning True if it should be accepted
 825             Note that this argument does not affect success status codes (2xx)
 826             which are always accepted.
 827         """
 828
 829         # Strip hashes from the URL (#1038)
 830         if isinstance(url_or_request, (compat_str, str)):
 831             url_or_request = url_or_request.partition('#')[0]
 832
 833         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 834         if urlh is False:
 835             assert not fatal
 836             return False
 837         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 838         return (content, urlh)
 839
 840     @staticmethod
 841     def _guess_encoding_from_content(content_type, webpage_bytes):
 842         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 843         if m:
 844             encoding = m.group(1)
 845         else:
 846             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 847                           webpage_bytes[:1024])
 848             if m:
 849                 encoding = m.group(1).decode('ascii')
 850             elif webpage_bytes.startswith(b'\xff\xfe'):
 851                 encoding = 'utf-16'
 852             else:
 853                 encoding = 'utf-8'
 854
 855         return encoding
 856
 857     def __check_blocked(self, content):
 858         first_block = content[:512]
 859         if ('<title>Access to this site is blocked</title>' in content
 860                 and 'Websense' in first_block):
 861             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 862             blocked_iframe = self._html_search_regex(
 863                 r'<iframe src="([^"]+)"', content,
 864                 'Websense information URL', default=None)
 865             if blocked_iframe:
 866                 msg += ' Visit %s for more details' % blocked_iframe
 867             raise ExtractorError(msg, expected=True)
 868         if '<title>The URL you requested has been blocked</title>' in first_block:
 869             msg = (
 870                 'Access to this webpage has been blocked by Indian censorship. '
 871                 'Use a VPN or proxy server (with --proxy) to route around it.')
 872             block_msg = self._html_search_regex(
 873                 r'</h1><p>(.*?)</p>',
 874                 content, 'block message', default=None)
 875             if block_msg:
 876                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 877             raise ExtractorError(msg, expected=True)
 878         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 879                 and 'blocklist.rkn.gov.ru' in content):
 880             raise ExtractorError(
 881                 'Access to this webpage has been blocked by decision of the Russian government. '
 882                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 883                 expected=True)
 884
 885     def _request_dump_filename(self, url, video_id):
 886         basen = f'{video_id}_{url}'
 887         trim_length = self.get_param('trim_file_name') or 240
 888         if len(basen) > trim_length:
 889             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 890             basen = basen[:trim_length - len(h)] + h
 891         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 892         # Working around MAX_PATH limitation on Windows (see
 893         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 894         if compat_os_name == 'nt':
 895             absfilepath = os.path.abspath(filename)
 896             if len(absfilepath) > 259:
 897                 filename = fR'\\?\{absfilepath}'
 898         return filename
 899
 900     def __decode_webpage(self, webpage_bytes, encoding, headers):
 901         if not encoding:
 902             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 903         try:
 904             return webpage_bytes.decode(encoding, 'replace')
 905         except LookupError:
 906             return webpage_bytes.decode('utf-8', 'replace')
 907
 908     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 909         webpage_bytes = urlh.read()
 910         if prefix is not None:
 911             webpage_bytes = prefix + webpage_bytes
 912         if self.get_param('dump_intermediate_pages', False):
 913             self.to_screen('Dumping request to ' + urlh.geturl())
 914             dump = base64.b64encode(webpage_bytes).decode('ascii')
 915             self._downloader.to_screen(dump)
 916         if self.get_param('write_pages'):
 917             filename = self._request_dump_filename(urlh.geturl(), video_id)
 918             self.to_screen(f'Saving request to {filename}')
 919             with open(filename, 'wb') as outf:
 920                 outf.write(webpage_bytes)
 921
 922         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 923         self.__check_blocked(content)
 924
 925         return content
 926
 927     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 928         if transform_source:
 929             xml_string = transform_source(xml_string)
 930         try:
 931             return compat_etree_fromstring(xml_string.encode('utf-8'))
 932         except xml.etree.ElementTree.ParseError as ve:
 933             errmsg = '%s: Failed to parse XML ' % video_id
 934             if fatal:
 935                 raise ExtractorError(errmsg, cause=ve)
 936             else:
 937                 self.report_warning(errmsg + str(ve))
 938
 939     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
 940         try:
 941             return json.loads(
 942                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 943         except ValueError as ve:
 944             errmsg = f'{video_id}: Failed to parse JSON'
 945             if fatal:
 946                 raise ExtractorError(errmsg, cause=ve)
 947             else:
 948                 self.report_warning(f'{errmsg}: {ve}')
 949
 950     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 951         return self._parse_json(
 952             data[data.find('{'):data.rfind('}') + 1],
 953             video_id, transform_source, fatal)
 954
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build and return the pair of functions
        (_download_<name>_handle, _download_<name>).

        name -- suffix used in the names of the generated functions
        parser -- name of the method that parses the downloaded content, or
            None to return the content unparsed
        note, errnote -- defaults for the arguments of the same name of the
            generated functions
        return_value -- description of the returned value, inserted into the
            generated docstrings
        """

        def parse(ie, content, *args, **kwargs):
            # Without a parser the content is handed back as-is
            if parser is None:
                return content
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Returns (parsed content, URL handle), or False when the download
        # itself returned False
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal), urlh

        # Returns only the parsed content, or False when the download failed
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the "load_pages" param a previously dumped page is read
            # from disk; if that fails, a warning is reported and the page is
            # downloaded as usual
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source, fatal)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # transform_source is not forwarded when there is no parser
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Give a generated function its final name, qualified name and docstring
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1016
    # Generate the _download_<type>_handle/_download_<type> method pairs for
    # XML, JSON and socket JSON responses. For plain webpages only the
    # content-returning function is kept, under a private name; it is the one
    # called by _download_webpage.
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1024
1025     def _download_webpage(
1026             self, url_or_request, video_id, note=None, errnote=None,
1027             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1028         """
1029         Return the data of the page as a string.
1030
1031         Keyword arguments:
1032         tries -- number of tries
1033         timeout -- sleep interval between tries
1034
1035         See _download_webpage_handle docstring for other arguments specification.
1036         """
1037
1038         R''' # NB: These are unused; should they be deprecated?
1039         if tries != 1:
1040             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1041         if timeout is NO_DEFAULT:
1042             timeout = 5
1043         else:
1044             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1045         '''
1046
1047         try_count = 0
1048         while True:
1049             try:
1050                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1051             except compat_http_client.IncompleteRead as e:
1052                 try_count += 1
1053                 if try_count >= tries:
1054                     raise e
1055                 self._sleep(timeout, video_id)
1056
1057     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1058         idstr = format_field(video_id, None, '%s: ')
1059         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1060         if only_once:
1061             if f'WARNING: {msg}' in self._printed_messages:
1062                 return
1063             self._printed_messages.add(f'WARNING: {msg}')
1064         self._downloader.report_warning(msg, *args, **kwargs)
1065
1066     def to_screen(self, msg, *args, **kwargs):
1067         """Print msg to screen, prefixing it with '[ie_name]'"""
1068         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1069
1070     def write_debug(self, msg, *args, **kwargs):
1071         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1072
1073     def get_param(self, name, default=None, *args, **kwargs):
1074         if self._downloader:
1075             return self._downloader.params.get(name, default, *args, **kwargs)
1076         return default
1077
1078     def report_drm(self, video_id, partial=False):
1079         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1080
1081     def report_extraction(self, id_or_name):
1082         """Report information extraction."""
1083         self.to_screen('%s: Extracting information' % id_or_name)
1084
1085     def report_download_webpage(self, video_id):
1086         """Report webpage download."""
1087         self.to_screen('%s: Downloading webpage' % video_id)
1088
1089     def report_age_confirmation(self):
1090         """Report attempt to confirm age."""
1091         self.to_screen('Confirming age')
1092
1093     def report_login(self):
1094         """Report attempt to log in."""
1095         self.to_screen('Logging in')
1096
1097     def raise_login_required(
1098             self, msg='This video is only available for registered users',
1099             metadata_available=False, method=NO_DEFAULT):
1100         if metadata_available and (
1101                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1102             self.report_warning(msg)
1103             return
1104         msg += format_field(self._login_hint(method), None, '. %s')
1105         raise ExtractorError(msg, expected=True)
1106
1107     def raise_geo_restricted(
1108             self, msg='This video is not available from your location due to geo restriction',
1109             countries=None, metadata_available=False):
1110         if metadata_available and (
1111                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1112             self.report_warning(msg)
1113         else:
1114             raise GeoRestrictedError(msg, countries=countries)
1115
1116     def raise_no_formats(self, msg, expected=False, video_id=None):
1117         if expected and (
1118                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1119             self.report_warning(msg, video_id)
1120         elif isinstance(msg, ExtractorError):
1121             raise msg
1122         else:
1123             raise ExtractorError(msg, expected=expected, video_id=video_id)
1124
1125     # Methods for following #608
1126     @staticmethod
1127     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1128         """Returns a URL that points to a page that should be processed"""
1129         if ie is not None:
1130             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1131         if video_id is not None:
1132             kwargs['id'] = video_id
1133         if video_title is not None:
1134             kwargs['title'] = video_title
1135         return {
1136             **kwargs,
1137             '_type': 'url_transparent' if url_transparent else 'url',
1138             'url': url,
1139         }
1140
1141     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1142         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1143                 for m in orderedSet(map(getter, matches) if getter else matches))
1144         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1145
1146     @staticmethod
1147     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1148         """Returns a playlist"""
1149         if playlist_id:
1150             kwargs['id'] = playlist_id
1151         if playlist_title:
1152             kwargs['title'] = playlist_title
1153         if playlist_description is not None:
1154             kwargs['description'] = playlist_description
1155         return {
1156             **kwargs,
1157             '_type': 'multi_video' if multi_video else 'playlist',
1158             'entries': entries,
1159         }
1160
1161     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1162         """
1163         Perform a regex search on the given string, using a single or a list of
1164         patterns returning the first matching group.
1165         In case of failure return a default value or raise a WARNING or a
1166         RegexNotFoundError, depending on fatal, specifying the field name.
1167         """
1168         if string is None:
1169             mobj = None
1170         elif isinstance(pattern, (str, re.Pattern)):
1171             mobj = re.search(pattern, string, flags)
1172         else:
1173             for p in pattern:
1174                 mobj = re.search(p, string, flags)
1175                 if mobj:
1176                     break
1177
1178         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1179
1180         if mobj:
1181             if group is None:
1182                 # return the first matching group
1183                 return next(g for g in mobj.groups() if g is not None)
1184             elif isinstance(group, (list, tuple)):
1185                 return tuple(mobj.group(g) for g in group)
1186             else:
1187                 return mobj.group(group)
1188         elif default is not NO_DEFAULT:
1189             return default
1190         elif fatal:
1191             raise RegexNotFoundError('Unable to extract %s' % _name)
1192         else:
1193             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1194             return None
1195
1196     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1197                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1198         """Searches string for the JSON object specified by start_pattern"""
1199         # NB: end_pattern is only used to reduce the size of the initial match
1200         if default is NO_DEFAULT:
1201             default, has_default = {}, False
1202         else:
1203             fatal, has_default = False, True
1204
1205         json_string = self._search_regex(
1206             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1207             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1208         if not json_string:
1209             return default
1210
1211         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1212         try:
1213             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1214         except ExtractorError as e:
1215             if fatal:
1216                 raise ExtractorError(
1217                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1218             elif not has_default:
1219                 self.report_warning(
1220                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1221         return default
1222
1223     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1224         """
1225         Like _search_regex, but strips HTML tags and unescapes entities.
1226         """
1227         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1228         if res:
1229             return clean_html(res).strip()
1230         else:
1231             return res
1232
1233     def _get_netrc_login_info(self, netrc_machine=None):
1234         username = None
1235         password = None
1236         netrc_machine = netrc_machine or self._NETRC_MACHINE
1237
1238         if self.get_param('usenetrc', False):
1239             try:
1240                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1241                 if os.path.isdir(netrc_file):
1242                     netrc_file = os.path.join(netrc_file, '.netrc')
1243                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1244                 if info is not None:
1245                     username = info[0]
1246                     password = info[2]
1247                 else:
1248                     raise netrc.NetrcParseError(
1249                         'No authenticators for %s' % netrc_machine)
1250             except (OSError, netrc.NetrcParseError) as err:
1251                 self.report_warning(
1252                     'parsing .netrc: %s' % error_to_compat_str(err))
1253
1254         return username, password
1255
1256     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1257         """
1258         Get the login info as (username, password)
1259         First look for the manually specified credentials using username_option
1260         and password_option as keys in params dictionary. If no such credentials
1261         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1262         value.
1263         If there's no info available, return (None, None)
1264         """
1265
1266         # Attempt to use provided username and password or .netrc data
1267         username = self.get_param(username_option)
1268         if username is not None:
1269             password = self.get_param(password_option)
1270         else:
1271             username, password = self._get_netrc_login_info(netrc_machine)
1272
1273         return username, password
1274
1275     def _get_tfa_info(self, note='two-factor verification code'):
1276         """
1277         Get the two-factor authentication info
1278         TODO - asking the user will be required for sms/phone verify
1279         currently just uses the command line option
1280         If there's no info available, return None
1281         """
1282
1283         tfa = self.get_param('twofactor')
1284         if tfa is not None:
1285             return tfa
1286
1287         return compat_getpass('Type %s and press [Return]: ' % note)
1288
1289     # Helper functions for extracting OpenGraph info
1290     @staticmethod
1291     def _og_regexes(prop):
1292         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1293         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1294                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1295         template = r'<meta[^>]+?%s[^>]+?%s'
1296         return [
1297             template % (property_re, content_re),
1298             template % (content_re, property_re),
1299         ]
1300
1301     @staticmethod
1302     def _meta_regex(prop):
1303         return r'''(?isx)<meta
1304                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1305                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1306
1307     def _og_search_property(self, prop, html, name=None, **kargs):
1308         prop = variadic(prop)
1309         if name is None:
1310             name = 'OpenGraph %s' % prop[0]
1311         og_regexes = []
1312         for p in prop:
1313             og_regexes.extend(self._og_regexes(p))
1314         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1315         if escaped is None:
1316             return None
1317         return unescapeHTML(escaped)
1318
1319     def _og_search_thumbnail(self, html, **kargs):
1320         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1321
1322     def _og_search_description(self, html, **kargs):
1323         return self._og_search_property('description', html, fatal=False, **kargs)
1324
1325     def _og_search_title(self, html, *, fatal=False, **kargs):
1326         return self._og_search_property('title', html, fatal=fatal, **kargs)
1327
1328     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1329         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1330         if secure:
1331             regexes = self._og_regexes('video:secure_url') + regexes
1332         return self._html_search_regex(regexes, html, name, **kargs)
1333
1334     def _og_search_url(self, html, **kargs):
1335         return self._og_search_property('url', html, **kargs)
1336
1337     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1338         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1339
1340     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1341         name = variadic(name)
1342         if display_name is None:
1343             display_name = name[0]
1344         return self._html_search_regex(
1345             [self._meta_regex(n) for n in name],
1346             html, display_name, fatal=fatal, group='content', **kwargs)
1347
1348     def _dc_search_uploader(self, html):
1349         return self._html_search_meta('dc.creator', html, 'uploader')
1350
1351     def _rta_search(self, html):
1352         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1353         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1354                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1355                      html):
1356             return 18
1357         return 0
1358
1359     def _media_rating_search(self, html):
1360         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1361         rating = self._html_search_meta('rating', html)
1362
1363         if not rating:
1364             return None
1365
1366         RATING_TABLE = {
1367             'safe for kids': 0,
1368             'general': 8,
1369             '14 years': 14,
1370             'mature': 17,
1371             'restricted': 19,
1372         }
1373         return RATING_TABLE.get(rating.lower())
1374
1375     def _family_friendly_search(self, html):
1376         # See http://schema.org/VideoObject
1377         family_friendly = self._html_search_meta(
1378             'isFamilyFriendly', html, default=None)
1379
1380         if not family_friendly:
1381             return None
1382
1383         RATING_TABLE = {
1384             '1': 0,
1385             'true': 0,
1386             '0': 18,
1387             'false': 18,
1388         }
1389         return RATING_TABLE.get(family_friendly.lower())
1390
1391     def _twitter_search_player(self, html):
1392         return self._html_search_meta('twitter:player', html,
1393                                       'twitter card player')
1394
1395     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1396         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1397         default = kwargs.get('default', NO_DEFAULT)
1398         # JSON-LD may be malformed and thus `fatal` should be respected.
1399         # At the same time `default` may be passed that assumes `fatal=False`
1400         # for _search_regex. Let's simulate the same behavior here as well.
1401         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1402         json_ld = []
1403         for mobj in json_ld_list:
1404             json_ld_item = self._parse_json(
1405                 mobj.group('json_ld'), video_id, fatal=fatal)
1406             if not json_ld_item:
1407                 continue
1408             if isinstance(json_ld_item, dict):
1409                 json_ld.append(json_ld_item)
1410             elif isinstance(json_ld_item, (list, tuple)):
1411                 json_ld.extend(json_ld_item)
1412         if json_ld:
1413             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1414         if json_ld:
1415             return json_ld
1416         if default is not NO_DEFAULT:
1417             return default
1418         elif fatal:
1419             raise RegexNotFoundError('Unable to extract JSON-LD')
1420         else:
1421             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1422             return {}
1423
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Convert JSON-LD data into an info dict.

        json_ld may be a JSON string (parsed with _parse_json, honouring
        fatal), a dict, or a list/tuple of dicts; anything else (or empty
        input) yields {}. When expected_type is given, only entries whose
        '@type' contains it are considered and processing stops after the
        first such entry. The collected fields are passed through filter_dict
        before being returned.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize a single entry to a one-element list
            json_ld = [json_ld]

        # Last path component of an interactionType -> prefix of the info dict count key
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # True if the '@type' of e contains any of expected_types
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # interactionType may be given directly or as an object carrying '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill the '<kind>_count' fields of info from the InteractionCounter entries of e
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up in the map
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # A count that is already set is not overwritten
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build info['chapters'] from the 'Clip' entries in hasPart of e
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Fill missing boundaries from the neighbouring chapters. The zip stops
            # at its shortest argument (chapters[1:]), so the last chapter is not
            # visited here; its end_time is handled after the loop
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # Reads info['duration'], which extract_video_object sets before calling this
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Update info with the fields of a VideoObject entry
            assert is_type(e, 'VideoObject')
            author = e.get('author')
            info.update({
                'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': url}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            # Walk the entries and collect metadata into info
            for e in json_ld:
                # Top-level entries without '@context' are ignored
                if at_top_level and '@context' not in e:
                    continue
                # An entry holding nothing but a '@graph': process the graph's
                # entries instead and stop looking at further top-level entries
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name is used as title only if no title was found yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first element of 'video' (or else 'subjectOf') is considered
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject'):
                    extract_video_object(e)
                    # Without an expected type keep going through the remaining entries;
                    # otherwise the first matching entry ends the traversal
                    if expected_type is None:
                        continue
                    else:
                        break
                # A VideoObject nested under the 'video' key of the entry
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        return filter_dict(info)
1588
1589     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1590         return self._parse_json(
1591             self._search_regex(
1592                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1593                 webpage, 'next.js data', fatal=fatal, **kw),
1594             video_id, transform_source=transform_source, fatal=fatal)
1595
1596     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1597         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1598         rectx = re.escape(context_name)
1599         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1600         js, arg_keys, arg_vals = self._search_regex(
1601             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1602             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1603
1604         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1605
1606         for key, val in args.items():
1607             if val in ('undefined', 'void 0'):
1608                 args[key] = 'null'
1609
1610         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1611         return traverse_obj(ret, traverse) or {}
1612
1613     @staticmethod
1614     def _hidden_inputs(html):
1615         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1616         hidden_inputs = {}
1617         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1618             attrs = extract_attributes(input)
1619             if not input:
1620                 continue
1621             if attrs.get('type') not in ('hidden', 'submit'):
1622                 continue
1623             name = attrs.get('name') or attrs.get('id')
1624             value = attrs.get('value')
1625             if name and value is not None:
1626                 hidden_inputs[name] = value
1627         return hidden_inputs
1628
1629     def _form_hidden_inputs(self, form_id, html):
1630         form = self._search_regex(
1631             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1632             html, '%s form' % form_id, group='form')
1633         return self._hidden_inputs(form)
1634
1635     class FormatSort:
        # Syntax of a single sort item: an optional leading "+" (reverse), the
        # field name and an optional limit introduced by ":" or "~"
        # ("~" selects the "closest" mode, see evaluate_params)
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Default sort order; evaluate_params appends it after the user's and
        # the extractor's fields
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): presumably the youtube-dl compatible sort order; it is
        # not referenced in the part of the class reviewed here - confirm usage
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration, read through _get_field_setting (which also
        # caches computed defaults in these dicts) and updated by evaluate_params
        # with the 'reverse', 'closest', 'limit_text' and 'limit' of each sort item.
        # Keys used by the methods reviewed here:
        #   type        'alias' and 'combined' are resolved in evaluate_params;
        #               'ordered' and 'extractor' affect the default conversion/field
        #   field       name(s) of the underlying field(s), or the alias target
        #   convert     how a limit value is converted (see _resolve_field_value)
        #   order / order_free / regex
        #               ranking lists for the 'order' conversion; order_free is
        #               preferred with prefer_free_formats, regex=True matches
        #               the entries as regular expressions
        #   forced / priority
        #               fields that evaluate_params puts before the user's fields
        #               (priority only unless format_sort_force is set)
        #   same_limit  a combined field passes one limit to all of its fields
        #   deprecated  using the alias triggers a deprecation warning
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # Fields that expand to several underlying fields
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1718
1719         def __init__(self, ie, field_preference):
1720             self._order = []
1721             self.ydl = ie._downloader
1722             self.evaluate_params(self.ydl.params, field_preference)
1723             if ie.get_param('verbose'):
1724                 self.print_verbose_info(self.ydl.write_debug)
1725
1726         def _get_field_setting(self, field, key):
1727             if field not in self.settings:
1728                 if key in ('forced', 'priority'):
1729                     return False
1730                 self.ydl.deprecation_warning(
1731                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1732                     'and may be removed in a future version')
1733                 self.settings[field] = {}
1734             propObj = self.settings[field]
1735             if key not in propObj:
1736                 type = propObj.get('type')
1737                 if key == 'field':
1738                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1739                 elif key == 'convert':
1740                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1741                 else:
1742                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1743                 propObj[key] = default
1744             return propObj[key]
1745
1746         def _resolve_field_value(self, field, value, convertNone=False):
1747             if value is None:
1748                 if not convertNone:
1749                     return None
1750             else:
1751                 value = value.lower()
1752             conversion = self._get_field_setting(field, 'convert')
1753             if conversion == 'ignore':
1754                 return None
1755             if conversion == 'string':
1756                 return value
1757             elif conversion == 'float_none':
1758                 return float_or_none(value)
1759             elif conversion == 'bytes':
1760                 return FileDownloader.parse_bytes(value)
1761             elif conversion == 'order':
1762                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1763                 use_regex = self._get_field_setting(field, 'regex')
1764                 list_length = len(order_list)
1765                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1766                 if use_regex and value is not None:
1767                     for i, regex in enumerate(order_list):
1768                         if regex and re.match(regex, value):
1769                             return list_length - i
1770                     return list_length - empty_pos  # not in list
1771                 else:  # not regex or  value = None
1772                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1773             else:
1774                 if value.isnumeric():
1775                     return float(value)
1776                 else:
1777                     self.settings[field]['convert'] = 'string'
1778                     return value
1779
1780         def evaluate_params(self, params, sort_extractor):
1781             self._use_free_order = params.get('prefer_free_formats', False)
1782             self._sort_user = params.get('format_sort', [])
1783             self._sort_extractor = sort_extractor
1784
1785             def add_item(field, reverse, closest, limit_text):
1786                 field = field.lower()
1787                 if field in self._order:
1788                     return
1789                 self._order.append(field)
1790                 limit = self._resolve_field_value(field, limit_text)
1791                 data = {
1792                     'reverse': reverse,
1793                     'closest': False if limit is None else closest,
1794                     'limit_text': limit_text,
1795                     'limit': limit}
1796                 if field in self.settings:
1797                     self.settings[field].update(data)
1798                 else:
1799                     self.settings[field] = data
1800
1801             sort_list = (
1802                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1803                 + (tuple() if params.get('format_sort_force', False)
1804                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1805                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1806
1807             for item in sort_list:
1808                 match = re.match(self.regex, item)
1809                 if match is None:
1810                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1811                 field = match.group('field')
1812                 if field is None:
1813                     continue
1814                 if self._get_field_setting(field, 'type') == 'alias':
1815                     alias, field = field, self._get_field_setting(field, 'field')
1816                     if self._get_field_setting(alias, 'deprecated'):
1817                         self.ydl.deprecation_warning(
1818                             f'Format sorting alias {alias} is deprecated '
1819                             f'and may be removed in a future version. Please use {field} instead')
1820                 reverse = match.group('reverse') is not None
1821                 closest = match.group('separator') == '~'
1822                 limit_text = match.group('limit')
1823
1824                 has_limit = limit_text is not None
1825                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1826                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1827
1828                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1829                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1830                 limit_count = len(limits)
1831                 for (i, f) in enumerate(fields):
1832                     add_item(f, reverse, closest,
1833                              limits[i] if i < limit_count
1834                              else limits[0] if has_limit and not has_multiple_limits
1835                              else None)
1836
1837         def print_verbose_info(self, write_debug):
1838             if self._sort_user:
1839                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1840             if self._sort_extractor:
1841                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1842             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1843                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1844                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1845                               self._get_field_setting(field, 'limit_text'),
1846                               self._get_field_setting(field, 'limit'))
1847                 if self._get_field_setting(field, 'limit_text') is not None else '')
1848                 for field in self._order if self._get_field_setting(field, 'visible')]))
1849
1850         def _calculate_field_preference_from_value(self, format, field, type, value):
1851             reverse = self._get_field_setting(field, 'reverse')
1852             closest = self._get_field_setting(field, 'closest')
1853             limit = self._get_field_setting(field, 'limit')
1854
1855             if type == 'extractor':
1856                 maximum = self._get_field_setting(field, 'max')
1857                 if value is None or (maximum is not None and value >= maximum):
1858                     value = -1
1859             elif type == 'boolean':
1860                 in_list = self._get_field_setting(field, 'in_list')
1861                 not_in_list = self._get_field_setting(field, 'not_in_list')
1862                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1863             elif type == 'ordered':
1864                 value = self._resolve_field_value(field, value, True)
1865
1866             # try to convert to number
1867             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1868             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1869             if is_num:
1870                 value = val_num
1871
1872             return ((-10, 0) if value is None
1873                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1874                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1875                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1876                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1877                     else (-1, value, 0))
1878
1879         def _calculate_field_preference(self, format, field):
1880             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1881             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1882             if type == 'multiple':
1883                 type = 'field'  # Only 'field' is allowed in multiple for now
1884                 actual_fields = self._get_field_setting(field, 'field')
1885
1886                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1887             else:
1888                 value = get_value(field)
1889             return self._calculate_field_preference_from_value(format, field, type, value)
1890
1891         def calculate_preference(self, format):
1892             # Determine missing protocol
1893             if not format.get('protocol'):
1894                 format['protocol'] = determine_protocol(format)
1895
1896             # Determine missing ext
1897             if not format.get('ext') and 'url' in format:
1898                 format['ext'] = determine_ext(format['url'])
1899             if format.get('vcodec') == 'none':
1900                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1901                 format['video_ext'] = 'none'
1902             else:
1903                 format['video_ext'] = format['ext']
1904                 format['audio_ext'] = 'none'
1905             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1906             #    format['preference'] = -1000
1907
1908             # Determine missing bitrates
1909             if format.get('tbr') is None:
1910                 if format.get('vbr') is not None and format.get('abr') is not None:
1911                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1912             else:
1913                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1914                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1915                 if format.get('acodec') != 'none' and format.get('abr') is None:
1916                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1917
1918             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1919
1920     def _sort_formats(self, formats, field_preference=[]):
1921         if not formats:
1922             return
1923         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1924
1925     def _check_formats(self, formats, video_id):
1926         if formats:
1927             formats[:] = filter(
1928                 lambda f: self._is_valid_url(
1929                     f['url'], video_id,
1930                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1931                 formats)
1932
1933     @staticmethod
1934     def _remove_duplicate_formats(formats):
1935         format_urls = set()
1936         unique_formats = []
1937         for f in formats:
1938             if f['url'] not in format_urls:
1939                 format_urls.add(f['url'])
1940                 unique_formats.append(f)
1941         formats[:] = unique_formats
1942
1943     def _is_valid_url(self, url, video_id, item='video', headers={}):
1944         url = self._proto_relative_url(url, scheme='http:')
1945         # For now assume non HTTP(S) URLs always valid
1946         if not (url.startswith('http://') or url.startswith('https://')):
1947             return True
1948         try:
1949             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1950             return True
1951         except ExtractorError as e:
1952             self.to_screen(
1953                 '%s: %s URL is invalid, skipping: %s'
1954                 % (video_id, item, error_to_compat_str(e.cause)))
1955             return False
1956
1957     def http_scheme(self):
1958         """ Either "http:" or "https:", depending on the user's preferences """
1959         return (
1960             'http:'
1961             if self.get_param('prefer_insecure', False)
1962             else 'https:')
1963
1964     def _proto_relative_url(self, url, scheme=None):
1965         if url is None:
1966             return url
1967         if url.startswith('//'):
1968             if scheme is None:
1969                 scheme = self.http_scheme()
1970             return scheme + url
1971         else:
1972             return url
1973
1974     def _sleep(self, timeout, video_id, msg_template=None):
1975         if msg_template is None:
1976             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1977         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1978         self.to_screen(msg)
1979         time.sleep(timeout)
1980
1981     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1982                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1983                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1984         res = self._download_xml_handle(
1985             manifest_url, video_id, 'Downloading f4m manifest',
1986             'Unable to download f4m manifest',
1987             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1988             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1989             transform_source=transform_source,
1990             fatal=fatal, data=data, headers=headers, query=query)
1991         if res is False:
1992             return []
1993
1994         manifest, urlh = res
1995         manifest_url = urlh.geturl()
1996
1997         return self._parse_f4m_formats(
1998             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1999             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2000
2001     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2002                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2003                            fatal=True, m3u8_id=None):
2004         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2005             return []
2006
2007         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2008         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2009         if akamai_pv is not None and ';' in akamai_pv.text:
2010             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2011             if playerVerificationChallenge.strip() != '':
2012                 return []
2013
2014         formats = []
2015         manifest_version = '1.0'
2016         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2017         if not media_nodes:
2018             manifest_version = '2.0'
2019             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2020         # Remove unsupported DRM protected media from final formats
2021         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2022         media_nodes = remove_encrypted_media(media_nodes)
2023         if not media_nodes:
2024             return formats
2025
2026         manifest_base_url = get_base_url(manifest)
2027
2028         bootstrap_info = xpath_element(
2029             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2030             'bootstrap info', default=None)
2031
2032         vcodec = None
2033         mime_type = xpath_text(
2034             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2035             'base URL', default=None)
2036         if mime_type and mime_type.startswith('audio/'):
2037             vcodec = 'none'
2038
2039         for i, media_el in enumerate(media_nodes):
2040             tbr = int_or_none(media_el.attrib.get('bitrate'))
2041             width = int_or_none(media_el.attrib.get('width'))
2042             height = int_or_none(media_el.attrib.get('height'))
2043             format_id = join_nonempty(f4m_id, tbr or i)
2044             # If <bootstrapInfo> is present, the specified f4m is a
2045             # stream-level manifest, and only set-level manifests may refer to
2046             # external resources.  See section 11.4 and section 4 of F4M spec
2047             if bootstrap_info is None:
2048                 media_url = None
2049                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2050                 if manifest_version == '2.0':
2051                     media_url = media_el.attrib.get('href')
2052                 if media_url is None:
2053                     media_url = media_el.attrib.get('url')
2054                 if not media_url:
2055                     continue
2056                 manifest_url = (
2057                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2058                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2059                 # If media_url is itself a f4m manifest do the recursive extraction
2060                 # since bitrates in parent manifest (this one) and media_url manifest
2061                 # may differ leading to inability to resolve the format by requested
2062                 # bitrate in f4m downloader
2063                 ext = determine_ext(manifest_url)
2064                 if ext == 'f4m':
2065                     f4m_formats = self._extract_f4m_formats(
2066                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2067                         transform_source=transform_source, fatal=fatal)
2068                     # Sometimes stream-level manifest contains single media entry that
2069                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2070                     # At the same time parent's media entry in set-level manifest may
2071                     # contain it. We will copy it from parent in such cases.
2072                     if len(f4m_formats) == 1:
2073                         f = f4m_formats[0]
2074                         f.update({
2075                             'tbr': f.get('tbr') or tbr,
2076                             'width': f.get('width') or width,
2077                             'height': f.get('height') or height,
2078                             'format_id': f.get('format_id') if not tbr else format_id,
2079                             'vcodec': vcodec,
2080                         })
2081                     formats.extend(f4m_formats)
2082                     continue
2083                 elif ext == 'm3u8':
2084                     formats.extend(self._extract_m3u8_formats(
2085                         manifest_url, video_id, 'mp4', preference=preference,
2086                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2087                     continue
2088             formats.append({
2089                 'format_id': format_id,
2090                 'url': manifest_url,
2091                 'manifest_url': manifest_url,
2092                 'ext': 'flv' if bootstrap_info is not None else None,
2093                 'protocol': 'f4m',
2094                 'tbr': tbr,
2095                 'width': width,
2096                 'height': height,
2097                 'vcodec': vcodec,
2098                 'preference': preference,
2099                 'quality': quality,
2100             })
2101         return formats
2102
2103     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2104         return {
2105             'format_id': join_nonempty(m3u8_id, 'meta'),
2106             'url': m3u8_url,
2107             'ext': ext,
2108             'protocol': 'm3u8',
2109             'preference': preference - 100 if preference else -100,
2110             'quality': quality,
2111             'resolution': 'multiple',
2112             'format_note': 'Quality selection URL',
2113         }
2114
2115     def _report_ignoring_subs(self, name):
2116         self.report_warning(bug_reports_message(
2117             f'Ignoring subtitle tracks found in the {name} manifest; '
2118             'if any subtitle tracks are missing,'
2119         ), only_once=True)
2120
2121     def _extract_m3u8_formats(self, *args, **kwargs):
2122         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2123         if subs:
2124             self._report_ignoring_subs('HLS')
2125         return fmts
2126
2127     def _extract_m3u8_formats_and_subtitles(
2128             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2129             preference=None, quality=None, m3u8_id=None, note=None,
2130             errnote=None, fatal=True, live=False, data=None, headers={},
2131             query={}):
2132
2133         res = self._download_webpage_handle(
2134             m3u8_url, video_id,
2135             note='Downloading m3u8 information' if note is None else note,
2136             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2137             fatal=fatal, data=data, headers=headers, query=query)
2138
2139         if res is False:
2140             return [], {}
2141
2142         m3u8_doc, urlh = res
2143         m3u8_url = urlh.geturl()
2144
2145         return self._parse_m3u8_formats_and_subtitles(
2146             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2147             preference=preference, quality=quality, m3u8_id=m3u8_id,
2148             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2149             headers=headers, query=query, video_id=video_id)
2150
2151     def _parse_m3u8_formats_and_subtitles(
2152             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2153             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2154             errnote=None, fatal=True, data=None, headers={}, query={},
2155             video_id=None):
2156         formats, subtitles = [], {}
2157
2158         has_drm = re.search('|'.join([
2159             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2160             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2161         ]), m3u8_doc)
2162
2163         def format_url(url):
2164             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2165
2166         if self.get_param('hls_split_discontinuity', False):
2167             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2168                 if not m3u8_doc:
2169                     if not manifest_url:
2170                         return []
2171                     m3u8_doc = self._download_webpage(
2172                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2173                         note=False, errnote='Failed to download m3u8 playlist information')
2174                     if m3u8_doc is False:
2175                         return []
2176                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2177
2178         else:
2179             def _extract_m3u8_playlist_indices(*args, **kwargs):
2180                 return [None]
2181
2182         # References:
2183         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2184         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2185         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2186
2187         # We should try extracting formats only from master playlists [1, 4.3.4],
2188         # i.e. playlists that describe available qualities. On the other hand
2189         # media playlists [1, 4.3.3] should be returned as is since they contain
2190         # just the media without qualities renditions.
2191         # Fortunately, master playlist can be easily distinguished from media
2192         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2193         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2194         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2195         # media playlist and MUST NOT appear in master playlist thus we can
2196         # clearly detect media playlist with this criterion.
2197
2198         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2199             formats = [{
2200                 'format_id': join_nonempty(m3u8_id, idx),
2201                 'format_index': idx,
2202                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2203                 'ext': ext,
2204                 'protocol': entry_protocol,
2205                 'preference': preference,
2206                 'quality': quality,
2207                 'has_drm': has_drm,
2208             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2209
2210             return formats, subtitles
2211
2212         groups = {}
2213         last_stream_inf = {}
2214
2215         def extract_media(x_media_line):
2216             media = parse_m3u8_attributes(x_media_line)
2217             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2218             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2219             if not (media_type and group_id and name):
2220                 return
2221             groups.setdefault(group_id, []).append(media)
2222             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2223             if media_type == 'SUBTITLES':
2224                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2225                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2226                 # However, lack of URI has been spotted in the wild.
2227                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2228                 if not media.get('URI'):
2229                     return
2230                 url = format_url(media['URI'])
2231                 sub_info = {
2232                     'url': url,
2233                     'ext': determine_ext(url),
2234                 }
2235                 if sub_info['ext'] == 'm3u8':
2236                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2237                     # files may contain is WebVTT:
2238                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2239                     sub_info['ext'] = 'vtt'
2240                     sub_info['protocol'] = 'm3u8_native'
2241                 lang = media.get('LANGUAGE') or 'und'
2242                 subtitles.setdefault(lang, []).append(sub_info)
2243             if media_type not in ('VIDEO', 'AUDIO'):
2244                 return
2245             media_url = media.get('URI')
2246             if media_url:
2247                 manifest_url = format_url(media_url)
2248                 formats.extend({
2249                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2250                     'format_note': name,
2251                     'format_index': idx,
2252                     'url': manifest_url,
2253                     'manifest_url': m3u8_url,
2254                     'language': media.get('LANGUAGE'),
2255                     'ext': ext,
2256                     'protocol': entry_protocol,
2257                     'preference': preference,
2258                     'quality': quality,
2259                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2260                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2261
2262         def build_stream_name():
2263             # Despite specification does not mention NAME attribute for
2264             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2265             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2266             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2267             stream_name = last_stream_inf.get('NAME')
2268             if stream_name:
2269                 return stream_name
2270             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2271             # from corresponding rendition group
2272             stream_group_id = last_stream_inf.get('VIDEO')
2273             if not stream_group_id:
2274                 return
2275             stream_group = groups.get(stream_group_id)
2276             if not stream_group:
2277                 return stream_group_id
2278             rendition = stream_group[0]
2279             return rendition.get('NAME') or stream_group_id
2280
2281         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2282         # chance to detect video only formats when EXT-X-STREAM-INF tags
2283         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2284         for line in m3u8_doc.splitlines():
2285             if line.startswith('#EXT-X-MEDIA:'):
2286                 extract_media(line)
2287
2288         for line in m3u8_doc.splitlines():
2289             if line.startswith('#EXT-X-STREAM-INF:'):
2290                 last_stream_inf = parse_m3u8_attributes(line)
2291             elif line.startswith('#') or not line.strip():
2292                 continue
2293             else:
2294                 tbr = float_or_none(
2295                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2296                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2297                 manifest_url = format_url(line.strip())
2298
2299                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2300                     format_id = [m3u8_id, None, idx]
2301                     # Bandwidth of live streams may differ over time thus making
2302                     # format_id unpredictable. So it's better to keep provided
2303                     # format_id intact.
2304                     if not live:
2305                         stream_name = build_stream_name()
2306                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2307                     f = {
2308                         'format_id': join_nonempty(*format_id),
2309                         'format_index': idx,
2310                         'url': manifest_url,
2311                         'manifest_url': m3u8_url,
2312                         'tbr': tbr,
2313                         'ext': ext,
2314                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2315                         'protocol': entry_protocol,
2316                         'preference': preference,
2317                         'quality': quality,
2318                     }
2319                     resolution = last_stream_inf.get('RESOLUTION')
2320                     if resolution:
2321                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2322                         if mobj:
2323                             f['width'] = int(mobj.group('width'))
2324                             f['height'] = int(mobj.group('height'))
2325                     # Unified Streaming Platform
2326                     mobj = re.search(
2327                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2328                     if mobj:
2329                         abr, vbr = mobj.groups()
2330                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2331                         f.update({
2332                             'vbr': vbr,
2333                             'abr': abr,
2334                         })
2335                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2336                     f.update(codecs)
2337                     audio_group_id = last_stream_inf.get('AUDIO')
2338                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2339                     # references a rendition group MUST have a CODECS attribute.
2340                     # However, this is not always respected, for example, [2]
2341                     # contains EXT-X-STREAM-INF tag which references AUDIO
2342                     # rendition group but does not have CODECS and despite
2343                     # referencing an audio group it represents a complete
2344                     # (with audio and video) format. So, for such cases we will
2345                     # ignore references to rendition groups and treat them
2346                     # as complete formats.
2347                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2348                         audio_group = groups.get(audio_group_id)
2349                         if audio_group and audio_group[0].get('URI'):
2350                             # TODO: update acodec for audio only formats with
2351                             # the same GROUP-ID
2352                             f['acodec'] = 'none'
2353                     if not f.get('ext'):
2354                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2355                     formats.append(f)
2356
2357                     # for DailyMotion
2358                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2359                     if progressive_uri:
2360                         http_f = f.copy()
2361                         del http_f['manifest_url']
2362                         http_f.update({
2363                             'format_id': f['format_id'].replace('hls-', 'http-'),
2364                             'protocol': 'http',
2365                             'url': progressive_uri,
2366                         })
2367                         formats.append(http_f)
2368
2369                 last_stream_inf = {}
2370         return formats, subtitles
2371
2372     def _extract_m3u8_vod_duration(
2373             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2374
2375         m3u8_vod = self._download_webpage(
2376             m3u8_vod_url, video_id,
2377             note='Downloading m3u8 VOD manifest' if note is None else note,
2378             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2379             fatal=False, data=data, headers=headers, query=query)
2380
2381         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2382
2383     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2384         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2385             return None
2386
2387         return int(sum(
2388             float(line[len('#EXTINF:'):].split(',')[0])
2389             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2390
2391     @staticmethod
2392     def _xpath_ns(path, namespace=None):
2393         if not namespace:
2394             return path
2395         out = []
2396         for c in path.split('/'):
2397             if not c or c == '.':
2398                 out.append(c)
2399             else:
2400                 out.append('{%s}%s' % (namespace, c))
2401         return '/'.join(out)
2402
2403     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2404         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2405         if res is False:
2406             assert not fatal
2407             return [], {}
2408
2409         smil, urlh = res
2410         smil_url = urlh.geturl()
2411
2412         namespace = self._parse_smil_namespace(smil)
2413
2414         fmts = self._parse_smil_formats(
2415             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2416         subs = self._parse_smil_subtitles(
2417             smil, namespace=namespace)
2418
2419         return fmts, subs
2420
2421     def _extract_smil_formats(self, *args, **kwargs):
2422         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2423         if subs:
2424             self._report_ignoring_subs('SMIL')
2425         return fmts
2426
2427     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2428         res = self._download_smil(smil_url, video_id, fatal=fatal)
2429         if res is False:
2430             return {}
2431
2432         smil, urlh = res
2433         smil_url = urlh.geturl()
2434
2435         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2436
2437     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2438         return self._download_xml_handle(
2439             smil_url, video_id, 'Downloading SMIL file',
2440             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2441
2442     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2443         namespace = self._parse_smil_namespace(smil)
2444
2445         formats = self._parse_smil_formats(
2446             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2447         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2448
2449         video_id = os.path.splitext(url_basename(smil_url))[0]
2450         title = None
2451         description = None
2452         upload_date = None
2453         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2454             name = meta.attrib.get('name')
2455             content = meta.attrib.get('content')
2456             if not name or not content:
2457                 continue
2458             if not title and name == 'title':
2459                 title = content
2460             elif not description and name in ('description', 'abstract'):
2461                 description = content
2462             elif not upload_date and name == 'date':
2463                 upload_date = unified_strdate(content)
2464
2465         thumbnails = [{
2466             'id': image.get('type'),
2467             'url': image.get('src'),
2468             'width': int_or_none(image.get('width')),
2469             'height': int_or_none(image.get('height')),
2470         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2471
2472         return {
2473             'id': video_id,
2474             'title': title or video_id,
2475             'description': description,
2476             'upload_date': upload_date,
2477             'thumbnails': thumbnails,
2478             'formats': formats,
2479             'subtitles': subtitles,
2480         }
2481
2482     def _parse_smil_namespace(self, smil):
2483         return self._search_regex(
2484             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2485
2486     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2487         base = smil_url
2488         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2489             b = meta.get('base') or meta.get('httpBase')
2490             if b:
2491                 base = b
2492                 break
2493
2494         formats = []
2495         rtmp_count = 0
2496         http_count = 0
2497         m3u8_count = 0
2498         imgs_count = 0
2499
2500         srcs = set()
2501         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2502         for medium in media:
2503             src = medium.get('src')
2504             if not src or src in srcs:
2505                 continue
2506             srcs.add(src)
2507
2508             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2509             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2510             width = int_or_none(medium.get('width'))
2511             height = int_or_none(medium.get('height'))
2512             proto = medium.get('proto')
2513             ext = medium.get('ext')
2514             src_ext = determine_ext(src)
2515             streamer = medium.get('streamer') or base
2516
2517             if proto == 'rtmp' or streamer.startswith('rtmp'):
2518                 rtmp_count += 1
2519                 formats.append({
2520                     'url': streamer,
2521                     'play_path': src,
2522                     'ext': 'flv',
2523                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2524                     'tbr': bitrate,
2525                     'filesize': filesize,
2526                     'width': width,
2527                     'height': height,
2528                 })
2529                 if transform_rtmp_url:
2530                     streamer, src = transform_rtmp_url(streamer, src)
2531                     formats[-1].update({
2532                         'url': streamer,
2533                         'play_path': src,
2534                     })
2535                 continue
2536
2537             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2538             src_url = src_url.strip()
2539
2540             if proto == 'm3u8' or src_ext == 'm3u8':
2541                 m3u8_formats = self._extract_m3u8_formats(
2542                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2543                 if len(m3u8_formats) == 1:
2544                     m3u8_count += 1
2545                     m3u8_formats[0].update({
2546                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2547                         'tbr': bitrate,
2548                         'width': width,
2549                         'height': height,
2550                     })
2551                 formats.extend(m3u8_formats)
2552             elif src_ext == 'f4m':
2553                 f4m_url = src_url
2554                 if not f4m_params:
2555                     f4m_params = {
2556                         'hdcore': '3.2.0',
2557                         'plugin': 'flowplayer-3.2.0.1',
2558                     }
2559                 f4m_url += '&' if '?' in f4m_url else '?'
2560                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2561                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2562             elif src_ext == 'mpd':
2563                 formats.extend(self._extract_mpd_formats(
2564                     src_url, video_id, mpd_id='dash', fatal=False))
2565             elif re.search(r'\.ism/[Mm]anifest', src_url):
2566                 formats.extend(self._extract_ism_formats(
2567                     src_url, video_id, ism_id='mss', fatal=False))
2568             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2569                 http_count += 1
2570                 formats.append({
2571                     'url': src_url,
2572                     'ext': ext or src_ext or 'flv',
2573                     'format_id': 'http-%d' % (bitrate or http_count),
2574                     'tbr': bitrate,
2575                     'filesize': filesize,
2576                     'width': width,
2577                     'height': height,
2578                 })
2579
2580         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2581             src = medium.get('src')
2582             if not src or src in srcs:
2583                 continue
2584             srcs.add(src)
2585
2586             imgs_count += 1
2587             formats.append({
2588                 'format_id': 'imagestream-%d' % (imgs_count),
2589                 'url': src,
2590                 'ext': mimetype2ext(medium.get('type')),
2591                 'acodec': 'none',
2592                 'vcodec': 'none',
2593                 'width': int_or_none(medium.get('width')),
2594                 'height': int_or_none(medium.get('height')),
2595                 'format_note': 'SMIL storyboards',
2596             })
2597
2598         return formats
2599
2600     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2601         urls = []
2602         subtitles = {}
2603         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2604             src = textstream.get('src')
2605             if not src or src in urls:
2606                 continue
2607             urls.append(src)
2608             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2609             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2610             subtitles.setdefault(lang, []).append({
2611                 'url': src,
2612                 'ext': ext,
2613             })
2614         return subtitles
2615
2616     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2617         res = self._download_xml_handle(
2618             xspf_url, playlist_id, 'Downloading xpsf playlist',
2619             'Unable to download xspf manifest', fatal=fatal)
2620         if res is False:
2621             return []
2622
2623         xspf, urlh = res
2624         xspf_url = urlh.geturl()
2625
2626         return self._parse_xspf(
2627             xspf, playlist_id, xspf_url=xspf_url,
2628             xspf_base_url=base_url(xspf_url))
2629
2630     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2631         NS_MAP = {
2632             'xspf': 'http://xspf.org/ns/0/',
2633             's1': 'http://static.streamone.nl/player/ns/0',
2634         }
2635
2636         entries = []
2637         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2638             title = xpath_text(
2639                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2640             description = xpath_text(
2641                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2642             thumbnail = xpath_text(
2643                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2644             duration = float_or_none(
2645                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2646
2647             formats = []
2648             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2649                 format_url = urljoin(xspf_base_url, location.text)
2650                 if not format_url:
2651                     continue
2652                 formats.append({
2653                     'url': format_url,
2654                     'manifest_url': xspf_url,
2655                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2656                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2657                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2658                 })
2659             self._sort_formats(formats)
2660
2661             entries.append({
2662                 'id': playlist_id,
2663                 'title': title,
2664                 'description': description,
2665                 'thumbnail': thumbnail,
2666                 'duration': duration,
2667                 'formats': formats,
2668             })
2669         return entries
2670
2671     def _extract_mpd_formats(self, *args, **kwargs):
2672         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2673         if subs:
2674             self._report_ignoring_subs('DASH')
2675         return fmts
2676
2677     def _extract_mpd_formats_and_subtitles(
2678             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2679             fatal=True, data=None, headers={}, query={}):
2680         res = self._download_xml_handle(
2681             mpd_url, video_id,
2682             note='Downloading MPD manifest' if note is None else note,
2683             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2684             fatal=fatal, data=data, headers=headers, query=query)
2685         if res is False:
2686             return [], {}
2687         mpd_doc, urlh = res
2688         if mpd_doc is None:
2689             return [], {}
2690
2691         # We could have been redirected to a new url when we retrieved our mpd file.
2692         mpd_url = urlh.geturl()
2693         mpd_base_url = base_url(mpd_url)
2694
2695         return self._parse_mpd_formats_and_subtitles(
2696             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2697
2698     def _parse_mpd_formats(self, *args, **kwargs):
2699         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2700         if subs:
2701             self._report_ignoring_subs('DASH')
2702         return fmts
2703
2704     def _parse_mpd_formats_and_subtitles(
2705             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2706         """
2707         Parse formats from MPD manifest.
2708         References:
2709          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2710             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2711          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2712         """
2713         if not self.get_param('dynamic_mpd', True):
2714             if mpd_doc.get('type') == 'dynamic':
2715                 return [], {}
2716
2717         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2718
2719         def _add_ns(path):
2720             return self._xpath_ns(path, namespace)
2721
2722         def is_drm_protected(element):
2723             return element.find(_add_ns('ContentProtection')) is not None
2724
2725         def extract_multisegment_info(element, ms_parent_info):
2726             ms_info = ms_parent_info.copy()
2727
2728             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2729             # common attributes and elements.  We will only extract relevant
2730             # for us.
2731             def extract_common(source):
2732                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2733                 if segment_timeline is not None:
2734                     s_e = segment_timeline.findall(_add_ns('S'))
2735                     if s_e:
2736                         ms_info['total_number'] = 0
2737                         ms_info['s'] = []
2738                         for s in s_e:
2739                             r = int(s.get('r', 0))
2740                             ms_info['total_number'] += 1 + r
2741                             ms_info['s'].append({
2742                                 't': int(s.get('t', 0)),
2743                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2744                                 'd': int(s.attrib['d']),
2745                                 'r': r,
2746                             })
2747                 start_number = source.get('startNumber')
2748                 if start_number:
2749                     ms_info['start_number'] = int(start_number)
2750                 timescale = source.get('timescale')
2751                 if timescale:
2752                     ms_info['timescale'] = int(timescale)
2753                 segment_duration = source.get('duration')
2754                 if segment_duration:
2755                     ms_info['segment_duration'] = float(segment_duration)
2756
2757             def extract_Initialization(source):
2758                 initialization = source.find(_add_ns('Initialization'))
2759                 if initialization is not None:
2760                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2761
2762             segment_list = element.find(_add_ns('SegmentList'))
2763             if segment_list is not None:
2764                 extract_common(segment_list)
2765                 extract_Initialization(segment_list)
2766                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2767                 if segment_urls_e:
2768                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2769             else:
2770                 segment_template = element.find(_add_ns('SegmentTemplate'))
2771                 if segment_template is not None:
2772                     extract_common(segment_template)
2773                     media = segment_template.get('media')
2774                     if media:
2775                         ms_info['media'] = media
2776                     initialization = segment_template.get('initialization')
2777                     if initialization:
2778                         ms_info['initialization'] = initialization
2779                     else:
2780                         extract_Initialization(segment_template)
2781             return ms_info
2782
2783         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2784         formats, subtitles = [], {}
2785         stream_numbers = collections.defaultdict(int)
2786         for period in mpd_doc.findall(_add_ns('Period')):
2787             period_duration = parse_duration(period.get('duration')) or mpd_duration
2788             period_ms_info = extract_multisegment_info(period, {
2789                 'start_number': 1,
2790                 'timescale': 1,
2791             })
2792             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2793                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2794                 for representation in adaptation_set.findall(_add_ns('Representation')):
2795                     representation_attrib = adaptation_set.attrib.copy()
2796                     representation_attrib.update(representation.attrib)
2797                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2798                     mime_type = representation_attrib['mimeType']
2799                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2800
2801                     codec_str = representation_attrib.get('codecs', '')
2802                     # Some kind of binary subtitle found in some youtube livestreams
2803                     if mime_type == 'application/x-rawcc':
2804                         codecs = {'scodec': codec_str}
2805                     else:
2806                         codecs = parse_codecs(codec_str)
2807                     if content_type not in ('video', 'audio', 'text'):
2808                         if mime_type == 'image/jpeg':
2809                             content_type = mime_type
2810                         elif codecs.get('vcodec', 'none') != 'none':
2811                             content_type = 'video'
2812                         elif codecs.get('acodec', 'none') != 'none':
2813                             content_type = 'audio'
2814                         elif codecs.get('scodec', 'none') != 'none':
2815                             content_type = 'text'
2816                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2817                             content_type = 'text'
2818                         else:
2819                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2820                             continue
2821
2822                     base_url = ''
2823                     for element in (representation, adaptation_set, period, mpd_doc):
2824                         base_url_e = element.find(_add_ns('BaseURL'))
2825                         if base_url_e is not None:
2826                             base_url = base_url_e.text + base_url
2827                             if re.match(r'^https?://', base_url):
2828                                 break
2829                     if mpd_base_url and base_url.startswith('/'):
2830                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2831                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2832                         if not mpd_base_url.endswith('/'):
2833                             mpd_base_url += '/'
2834                         base_url = mpd_base_url + base_url
2835                     representation_id = representation_attrib.get('id')
2836                     lang = representation_attrib.get('lang')
2837                     url_el = representation.find(_add_ns('BaseURL'))
2838                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2839                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2840                     if representation_id is not None:
2841                         format_id = representation_id
2842                     else:
2843                         format_id = content_type
2844                     if mpd_id:
2845                         format_id = mpd_id + '-' + format_id
2846                     if content_type in ('video', 'audio'):
2847                         f = {
2848                             'format_id': format_id,
2849                             'manifest_url': mpd_url,
2850                             'ext': mimetype2ext(mime_type),
2851                             'width': int_or_none(representation_attrib.get('width')),
2852                             'height': int_or_none(representation_attrib.get('height')),
2853                             'tbr': float_or_none(bandwidth, 1000),
2854                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2855                             'fps': int_or_none(representation_attrib.get('frameRate')),
2856                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2857                             'format_note': 'DASH %s' % content_type,
2858                             'filesize': filesize,
2859                             'container': mimetype2ext(mime_type) + '_dash',
2860                             **codecs
2861                         }
2862                     elif content_type == 'text':
2863                         f = {
2864                             'ext': mimetype2ext(mime_type),
2865                             'manifest_url': mpd_url,
2866                             'filesize': filesize,
2867                         }
2868                     elif content_type == 'image/jpeg':
2869                         # See test case in VikiIE
2870                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2871                         f = {
2872                             'format_id': format_id,
2873                             'ext': 'mhtml',
2874                             'manifest_url': mpd_url,
2875                             'format_note': 'DASH storyboards (jpeg)',
2876                             'acodec': 'none',
2877                             'vcodec': 'none',
2878                         }
2879                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2880                         f['has_drm'] = True
2881                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2882
2883                     def prepare_template(template_name, identifiers):
2884                         tmpl = representation_ms_info[template_name]
2885                         # First of, % characters outside $...$ templates
2886                         # must be escaped by doubling for proper processing
2887                         # by % operator string formatting used further (see
2888                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2889                         t = ''
2890                         in_template = False
2891                         for c in tmpl:
2892                             t += c
2893                             if c == '$':
2894                                 in_template = not in_template
2895                             elif c == '%' and not in_template:
2896                                 t += c
2897                         # Next, $...$ templates are translated to their
2898                         # %(...) counterparts to be used with % operator
2899                         if representation_id is not None:
2900                             t = t.replace('$RepresentationID$', representation_id)
2901                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2902                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2903                         t.replace('$$', '$')
2904                         return t
2905
2906                     # @initialization is a regular template like @media one
2907                     # so it should be handled just the same way (see
2908                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2909                     if 'initialization' in representation_ms_info:
2910                         initialization_template = prepare_template(
2911                             'initialization',
2912                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2913                             # $Time$ shall not be included for @initialization thus
2914                             # only $Bandwidth$ remains
2915                             ('Bandwidth', ))
2916                         representation_ms_info['initialization_url'] = initialization_template % {
2917                             'Bandwidth': bandwidth,
2918                         }
2919
2920                     def location_key(location):
2921                         return 'url' if re.match(r'^https?://', location) else 'path'
2922
2923                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2924
2925                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2926                         media_location_key = location_key(media_template)
2927
2928                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2929                         # can't be used at the same time
2930                         if '%(Number' in media_template and 's' not in representation_ms_info:
2931                             segment_duration = None
2932                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2933                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2934                                 representation_ms_info['total_number'] = int(math.ceil(
2935                                     float_or_none(period_duration, segment_duration, default=0)))
2936                             representation_ms_info['fragments'] = [{
2937                                 media_location_key: media_template % {
2938                                     'Number': segment_number,
2939                                     'Bandwidth': bandwidth,
2940                                 },
2941                                 'duration': segment_duration,
2942                             } for segment_number in range(
2943                                 representation_ms_info['start_number'],
2944                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2945                         else:
2946                             # $Number*$ or $Time$ in media template with S list available
2947                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2948                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2949                             representation_ms_info['fragments'] = []
2950                             segment_time = 0
2951                             segment_d = None
2952                             segment_number = representation_ms_info['start_number']
2953
2954                             def add_segment_url():
2955                                 segment_url = media_template % {
2956                                     'Time': segment_time,
2957                                     'Bandwidth': bandwidth,
2958                                     'Number': segment_number,
2959                                 }
2960                                 representation_ms_info['fragments'].append({
2961                                     media_location_key: segment_url,
2962                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2963                                 })
2964
2965                             for num, s in enumerate(representation_ms_info['s']):
2966                                 segment_time = s.get('t') or segment_time
2967                                 segment_d = s['d']
2968                                 add_segment_url()
2969                                 segment_number += 1
2970                                 for r in range(s.get('r', 0)):
2971                                     segment_time += segment_d
2972                                     add_segment_url()
2973                                     segment_number += 1
2974                                 segment_time += segment_d
2975                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2976                         # No media template
2977                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2978                         # or any YouTube dashsegments video
2979                         fragments = []
2980                         segment_index = 0
2981                         timescale = representation_ms_info['timescale']
2982                         for s in representation_ms_info['s']:
2983                             duration = float_or_none(s['d'], timescale)
2984                             for r in range(s.get('r', 0) + 1):
2985                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2986                                 fragments.append({
2987                                     location_key(segment_uri): segment_uri,
2988                                     'duration': duration,
2989                                 })
2990                                 segment_index += 1
2991                         representation_ms_info['fragments'] = fragments
2992                     elif 'segment_urls' in representation_ms_info:
2993                         # Segment URLs with no SegmentTimeline
2994                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2995                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2996                         fragments = []
2997                         segment_duration = float_or_none(
2998                             representation_ms_info['segment_duration'],
2999                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3000                         for segment_url in representation_ms_info['segment_urls']:
3001                             fragment = {
3002                                 location_key(segment_url): segment_url,
3003                             }
3004                             if segment_duration:
3005                                 fragment['duration'] = segment_duration
3006                             fragments.append(fragment)
3007                         representation_ms_info['fragments'] = fragments
3008                     # If there is a fragments key available then we correctly recognized fragmented media.
3009                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3010                     # assumption is not necessarily correct since we may simply have no support for
3011                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3012                     if 'fragments' in representation_ms_info:
3013                         f.update({
3014                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3015                             'url': mpd_url or base_url,
3016                             'fragment_base_url': base_url,
3017                             'fragments': [],
3018                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3019                         })
3020                         if 'initialization_url' in representation_ms_info:
3021                             initialization_url = representation_ms_info['initialization_url']
3022                             if not f.get('url'):
3023                                 f['url'] = initialization_url
3024                             f['fragments'].append({location_key(initialization_url): initialization_url})
3025                         f['fragments'].extend(representation_ms_info['fragments'])
3026                         if not period_duration:
3027                             period_duration = try_get(
3028                                 representation_ms_info,
3029                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3030                     else:
3031                         # Assuming direct URL to unfragmented media.
3032                         f['url'] = base_url
3033                     if content_type in ('video', 'audio', 'image/jpeg'):
3034                         f['manifest_stream_number'] = stream_numbers[f['url']]
3035                         stream_numbers[f['url']] += 1
3036                         formats.append(f)
3037                     elif content_type == 'text':
3038                         subtitles.setdefault(lang or 'und', []).append(f)
3039
3040         return formats, subtitles
3041
3042     def _extract_ism_formats(self, *args, **kwargs):
3043         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3044         if subs:
3045             self._report_ignoring_subs('ISM')
3046         return fmts
3047
3048     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3049         res = self._download_xml_handle(
3050             ism_url, video_id,
3051             note='Downloading ISM manifest' if note is None else note,
3052             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3053             fatal=fatal, data=data, headers=headers, query=query)
3054         if res is False:
3055             return [], {}
3056         ism_doc, urlh = res
3057         if ism_doc is None:
3058             return [], {}
3059
3060         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3061
3062     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3063         """
3064         Parse formats from ISM manifest.
3065         References:
3066          1. [MS-SSTR]: Smooth Streaming Protocol,
3067             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3068         """
3069         if ism_doc.get('IsLive') == 'TRUE':
3070             return [], {}
3071
3072         duration = int(ism_doc.attrib['Duration'])
3073         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3074
3075         formats = []
3076         subtitles = {}
3077         for stream in ism_doc.findall('StreamIndex'):
3078             stream_type = stream.get('Type')
3079             if stream_type not in ('video', 'audio', 'text'):
3080                 continue
3081             url_pattern = stream.attrib['Url']
3082             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3083             stream_name = stream.get('Name')
3084             stream_language = stream.get('Language', 'und')
3085             for track in stream.findall('QualityLevel'):
3086                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3087                 # TODO: add support for WVC1 and WMAP
3088                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3089                     self.report_warning('%s is not a supported codec' % fourcc)
3090                     continue
3091                 tbr = int(track.attrib['Bitrate']) // 1000
3092                 # [1] does not mention Width and Height attributes. However,
3093                 # they're often present while MaxWidth and MaxHeight are
3094                 # missing, so should be used as fallbacks
3095                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3096                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3097                 sampling_rate = int_or_none(track.get('SamplingRate'))
3098
3099                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3100                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3101
3102                 fragments = []
3103                 fragment_ctx = {
3104                     'time': 0,
3105                 }
3106                 stream_fragments = stream.findall('c')
3107                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3108                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3109                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3110                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3111                     if not fragment_ctx['duration']:
3112                         try:
3113                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3114                         except IndexError:
3115                             next_fragment_time = duration
3116                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3117                     for _ in range(fragment_repeat):
3118                         fragments.append({
3119                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3120                             'duration': fragment_ctx['duration'] / stream_timescale,
3121                         })
3122                         fragment_ctx['time'] += fragment_ctx['duration']
3123
3124                 if stream_type == 'text':
3125                     subtitles.setdefault(stream_language, []).append({
3126                         'ext': 'ismt',
3127                         'protocol': 'ism',
3128                         'url': ism_url,
3129                         'manifest_url': ism_url,
3130                         'fragments': fragments,
3131                         '_download_params': {
3132                             'stream_type': stream_type,
3133                             'duration': duration,
3134                             'timescale': stream_timescale,
3135                             'fourcc': fourcc,
3136                             'language': stream_language,
3137                             'codec_private_data': track.get('CodecPrivateData'),
3138                         }
3139                     })
3140                 elif stream_type in ('video', 'audio'):
3141                     formats.append({
3142                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3143                         'url': ism_url,
3144                         'manifest_url': ism_url,
3145                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3146                         'width': width,
3147                         'height': height,
3148                         'tbr': tbr,
3149                         'asr': sampling_rate,
3150                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3151                         'acodec': 'none' if stream_type == 'video' else fourcc,
3152                         'protocol': 'ism',
3153                         'fragments': fragments,
3154                         'has_drm': ism_doc.find('Protection') is not None,
3155                         '_download_params': {
3156                             'stream_type': stream_type,
3157                             'duration': duration,
3158                             'timescale': stream_timescale,
3159                             'width': width or 0,
3160                             'height': height or 0,
3161                             'fourcc': fourcc,
3162                             'language': stream_language,
3163                             'codec_private_data': track.get('CodecPrivateData'),
3164                             'sampling_rate': sampling_rate,
3165                             'channels': int_or_none(track.get('Channels', 2)),
3166                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3167                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3168                         },
3169                     })
3170         return formats, subtitles
3171
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from HTML5 <video>/<audio> tags in a webpage.

        The amp-* and dl8-* prefixed variants of the tags are matched as
        well, both in self-closing form and as paired tags whose content may
        hold <source> and <track> children.

        base_url is used to resolve relative URLs and is set as the Referer
        header of every format. m3u8_id, m3u8_entry_protocol, preference and
        quality are forwarded to _extract_m3u8_formats; mpd_id is forwarded
        to _extract_mpd_formats.

        Returns a list of dicts with the keys 'formats', 'subtitles' and
        'thumbnail'. Tags that yield neither formats nor subtitles are left out.
        """
        def absolute_url(item_url):
            # Resolve a (possibly relative) URL against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a "mimetype; codecs=..." string into a partial format dict
            # (codec fields plus 'ext'); empty dict if nothing can be parsed
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats): manifests (m3u8/mpd) are expanded
            # into their formats, anything else becomes a single direct format
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no content, hence the trailing empty string
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # A src attribute directly on the media tag
            src = strip_or_none(media_attributes.get('src'))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fill in missing dimensions from the label/title text
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # First label that parses as a bitrate wins; None otherwise
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The direct format's url/vcodec/ext override the parsed type info
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated like subtitles/captions
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3297
3298     def _extract_akamai_formats(self, *args, **kwargs):
3299         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3300         if subs:
3301             self._report_ignoring_subs('akamai')
3302         return fmts
3303
3304     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3305         signed = 'hdnea=' in manifest_url
3306         if not signed:
3307             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3308             manifest_url = re.sub(
3309                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3310                 '', manifest_url).strip('?')
3311
3312         formats = []
3313         subtitles = {}
3314
3315         hdcore_sign = 'hdcore=3.7.0'
3316         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3317         hds_host = hosts.get('hds')
3318         if hds_host:
3319             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3320         if 'hdcore=' not in f4m_url:
3321             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3322         f4m_formats = self._extract_f4m_formats(
3323             f4m_url, video_id, f4m_id='hds', fatal=False)
3324         for entry in f4m_formats:
3325             entry.update({'extra_param_to_segment_url': hdcore_sign})
3326         formats.extend(f4m_formats)
3327
3328         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3329         hls_host = hosts.get('hls')
3330         if hls_host:
3331             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3332         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3333             m3u8_url, video_id, 'mp4', 'm3u8_native',
3334             m3u8_id='hls', fatal=False)
3335         formats.extend(m3u8_formats)
3336         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3337
3338         http_host = hosts.get('http')
3339         if http_host and m3u8_formats and not signed:
3340             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3341             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3342             qualities_length = len(qualities)
3343             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3344                 i = 0
3345                 for f in m3u8_formats:
3346                     if f['vcodec'] != 'none':
3347                         for protocol in ('http', 'https'):
3348                             http_f = f.copy()
3349                             del http_f['manifest_url']
3350                             http_url = re.sub(
3351                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3352                             http_f.update({
3353                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3354                                 'url': http_url,
3355                                 'protocol': protocol,
3356                             })
3357                             formats.append(http_f)
3358                         i += 1
3359
3360         return formats, subtitles
3361
3362     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3363         query = compat_urlparse.urlparse(url).query
3364         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3365         mobj = re.search(
3366             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3367         url_base = mobj.group('url')
3368         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3369         formats = []
3370
3371         def manifest_url(manifest):
3372             m_url = f'{http_base_url}/{manifest}'
3373             if query:
3374                 m_url += '?%s' % query
3375             return m_url
3376
3377         if 'm3u8' not in skip_protocols:
3378             formats.extend(self._extract_m3u8_formats(
3379                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3380                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3381         if 'f4m' not in skip_protocols:
3382             formats.extend(self._extract_f4m_formats(
3383                 manifest_url('manifest.f4m'),
3384                 video_id, f4m_id='hds', fatal=False))
3385         if 'dash' not in skip_protocols:
3386             formats.extend(self._extract_mpd_formats(
3387                 manifest_url('manifest.mpd'),
3388                 video_id, mpd_id='dash', fatal=False))
3389         if re.search(r'(?:/smil:|\.smil)', url_base):
3390             if 'smil' not in skip_protocols:
3391                 rtmp_formats = self._extract_smil_formats(
3392                     manifest_url('jwplayer.smil'),
3393                     video_id, fatal=False)
3394                 for rtmp_format in rtmp_formats:
3395                     rtsp_format = rtmp_format.copy()
3396                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3397                     del rtsp_format['play_path']
3398                     del rtsp_format['ext']
3399                     rtsp_format.update({
3400                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3401                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3402                         'protocol': 'rtsp',
3403                     })
3404                     formats.extend([rtmp_format, rtsp_format])
3405         else:
3406             for protocol in ('rtmp', 'rtsp'):
3407                 if protocol not in skip_protocols:
3408                     formats.append({
3409                         'url': f'{protocol}:{url_base}',
3410                         'format_id': protocol,
3411                         'protocol': protocol,
3412                     })
3413         return formats
3414
3415     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3416         mobj = re.search(
3417             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3418             webpage)
3419         if mobj:
3420             try:
3421                 jwplayer_data = self._parse_json(mobj.group('options'),
3422                                                  video_id=video_id,
3423                                                  transform_source=transform_source)
3424             except ExtractorError:
3425                 pass
3426             else:
3427                 if isinstance(jwplayer_data, dict):
3428                     return jwplayer_data
3429
3430     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3431         jwplayer_data = self._find_jwplayer_data(
3432             webpage, video_id, transform_source=js_to_json)
3433         return self._parse_jwplayer_data(
3434             jwplayer_data, video_id, *args, **kwargs)
3435
3436     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3437                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3438         # JWPlayer backward compatibility: flattened playlists
3439         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3440         if 'playlist' not in jwplayer_data:
3441             jwplayer_data = {'playlist': [jwplayer_data]}
3442
3443         entries = []
3444
3445         # JWPlayer backward compatibility: single playlist item
3446         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3447         if not isinstance(jwplayer_data['playlist'], list):
3448             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3449
3450         for video_data in jwplayer_data['playlist']:
3451             # JWPlayer backward compatibility: flattened sources
3452             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3453             if 'sources' not in video_data:
3454                 video_data['sources'] = [video_data]
3455
3456             this_video_id = video_id or video_data['mediaid']
3457
3458             formats = self._parse_jwplayer_formats(
3459                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3460                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3461
3462             subtitles = {}
3463             tracks = video_data.get('tracks')
3464             if tracks and isinstance(tracks, list):
3465                 for track in tracks:
3466                     if not isinstance(track, dict):
3467                         continue
3468                     track_kind = track.get('kind')
3469                     if not track_kind or not isinstance(track_kind, compat_str):
3470                         continue
3471                     if track_kind.lower() not in ('captions', 'subtitles'):
3472                         continue
3473                     track_url = urljoin(base_url, track.get('file'))
3474                     if not track_url:
3475                         continue
3476                     subtitles.setdefault(track.get('label') or 'en', []).append({
3477                         'url': self._proto_relative_url(track_url)
3478                     })
3479
3480             entry = {
3481                 'id': this_video_id,
3482                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3483                 'description': clean_html(video_data.get('description')),
3484                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3485                 'timestamp': int_or_none(video_data.get('pubdate')),
3486                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3487                 'subtitles': subtitles,
3488             }
3489             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3490             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3491                 entry.update({
3492                     '_type': 'url_transparent',
3493                     'url': formats[0]['url'],
3494                 })
3495             else:
3496                 self._sort_formats(formats)
3497                 entry['formats'] = formats
3498             entries.append(entry)
3499         if len(entries) == 1:
3500             return entries[0]
3501         else:
3502             return self.playlist_result(entries)
3503
3504     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3505                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3506         urls = []
3507         formats = []
3508         for source in jwplayer_sources_data:
3509             if not isinstance(source, dict):
3510                 continue
3511             source_url = urljoin(
3512                 base_url, self._proto_relative_url(source.get('file')))
3513             if not source_url or source_url in urls:
3514                 continue
3515             urls.append(source_url)
3516             source_type = source.get('type') or ''
3517             ext = mimetype2ext(source_type) or determine_ext(source_url)
3518             if source_type == 'hls' or ext == 'm3u8':
3519                 formats.extend(self._extract_m3u8_formats(
3520                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3521                     m3u8_id=m3u8_id, fatal=False))
3522             elif source_type == 'dash' or ext == 'mpd':
3523                 formats.extend(self._extract_mpd_formats(
3524                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3525             elif ext == 'smil':
3526                 formats.extend(self._extract_smil_formats(
3527                     source_url, video_id, fatal=False))
3528             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3529             elif source_type.startswith('audio') or ext in (
3530                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3531                 formats.append({
3532                     'url': source_url,
3533                     'vcodec': 'none',
3534                     'ext': ext,
3535                 })
3536             else:
3537                 height = int_or_none(source.get('height'))
3538                 if height is None:
3539                     # Often no height is provided but there is a label in
3540                     # format like "1080p", "720p SD", or 1080.
3541                     height = int_or_none(self._search_regex(
3542                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3543                         'height', default=None))
3544                 a_format = {
3545                     'url': source_url,
3546                     'width': int_or_none(source.get('width')),
3547                     'height': height,
3548                     'tbr': int_or_none(source.get('bitrate')),
3549                     'ext': ext,
3550                 }
3551                 if source_url.startswith('rtmp'):
3552                     a_format['ext'] = 'flv'
3553                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3554                     # of jwplayer.flash.swf
3555                     rtmp_url_parts = re.split(
3556                         r'((?:mp4|mp3|flv):)', source_url, 1)
3557                     if len(rtmp_url_parts) == 3:
3558                         rtmp_url, prefix, play_path = rtmp_url_parts
3559                         a_format.update({
3560                             'url': rtmp_url,
3561                             'play_path': prefix + play_path,
3562                         })
3563                     if rtmp_params:
3564                         a_format.update(rtmp_params)
3565                 formats.append(a_format)
3566         return formats
3567
3568     def _live_title(self, name):
3569         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3570         return name
3571
3572     def _int(self, v, name, fatal=False, **kwargs):
3573         res = int_or_none(v, **kwargs)
3574         if res is None:
3575             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3576             if fatal:
3577                 raise ExtractorError(msg)
3578             else:
3579                 self.report_warning(msg)
3580         return res
3581
3582     def _float(self, v, name, fatal=False, **kwargs):
3583         res = float_or_none(v, **kwargs)
3584         if res is None:
3585             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3586             if fatal:
3587                 raise ExtractorError(msg)
3588             else:
3589                 self.report_warning(msg)
3590         return res
3591
3592     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3593                     path='/', secure=False, discard=False, rest={}, **kwargs):
3594         cookie = compat_cookiejar_Cookie(
3595             0, name, value, port, port is not None, domain, True,
3596             domain.startswith('.'), path, True, secure, expire_time,
3597             discard, None, None, rest)
3598         self._downloader.cookiejar.set_cookie(cookie)
3599
3600     def _get_cookies(self, url):
3601         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3602         return compat_cookies_SimpleCookie(self._downloader._calc_cookies(url))
3603
3604     def _apply_first_set_cookie_header(self, url_handle, cookie):
3605         """
3606         Apply first Set-Cookie header instead of the last. Experimental.
3607
3608         Some sites (e.g. [1-3]) may serve two cookies under the same name
3609         in Set-Cookie header and expect the first (old) one to be set rather
3610         than second (new). However, as of RFC6265 the newer one cookie
3611         should be set into cookie store what actually happens.
3612         We will workaround this issue by resetting the cookie to
3613         the first one manually.
3614         1. https://new.vk.com/
3615         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3616         3. https://learning.oreilly.com/
3617         """
3618         for header, cookies in url_handle.headers.items():
3619             if header.lower() != 'set-cookie':
3620                 continue
3621             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3622             cookie_value = re.search(
3623                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3624             if cookie_value:
3625                 value, domain = cookie_value.groups()
3626                 self._set_cookie(domain, cookie, value)
3627                 break
3628
3629     @classmethod
3630     def get_testcases(cls, include_onlymatching=False):
3631         t = getattr(cls, '_TEST', None)
3632         if t:
3633             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3634             tests = [t]
3635         else:
3636             tests = getattr(cls, '_TESTS', [])
3637         for t in tests:
3638             if not include_onlymatching and t.get('only_matching', False):
3639                 continue
3640             t['name'] = cls.ie_key()
3641             yield t
3642
3643     @classproperty
3644     def age_limit(cls):
3645         """Get age limit from the testcases"""
3646         return max(traverse_obj(
3647             tuple(cls.get_testcases(include_onlymatching=False)),
3648             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3649
3650     @classmethod
3651     def is_suitable(cls, age_limit):
3652         """Test whether the extractor is generally suitable for the given age limit"""
3653         return not age_restricted(cls.age_limit, age_limit)
3654
3655     @classmethod
3656     def description(cls, *, markdown=True, search_examples=None):
3657         """Description of the extractor"""
3658         desc = ''
3659         if cls._NETRC_MACHINE:
3660             if markdown:
3661                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3662             else:
3663                 desc += f' [{cls._NETRC_MACHINE}]'
3664         if cls.IE_DESC is False:
3665             desc += ' [HIDDEN]'
3666         elif cls.IE_DESC:
3667             desc += f' {cls.IE_DESC}'
3668         if cls.SEARCH_KEY:
3669             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3670             if search_examples:
3671                 _COUNTS = ('', '5', '10', 'all')
3672                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3673         if not cls.working():
3674             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3675
3676         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3677         return f'{name}:{desc}' if desc else name
3678
3679     def extract_subtitles(self, *args, **kwargs):
3680         if (self.get_param('writesubtitles', False)
3681                 or self.get_param('listsubtitles')):
3682             return self._get_subtitles(*args, **kwargs)
3683         return {}
3684
3685     def _get_subtitles(self, *args, **kwargs):
3686         raise NotImplementedError('This method must be implemented by subclasses')
3687
3688     def extract_comments(self, *args, **kwargs):
3689         if not self.get_param('getcomments'):
3690             return None
3691         generator = self._get_comments(*args, **kwargs)
3692
3693         def extractor():
3694             comments = []
3695             interrupted = True
3696             try:
3697                 while True:
3698                     comments.append(next(generator))
3699             except StopIteration:
3700                 interrupted = False
3701             except KeyboardInterrupt:
3702                 self.to_screen('Interrupted by user')
3703             except Exception as e:
3704                 if self.get_param('ignoreerrors') is not True:
3705                     raise
3706                 self._downloader.report_error(e)
3707             comment_count = len(comments)
3708             self.to_screen(f'Extracted {comment_count} comments')
3709             return {
3710                 'comments': comments,
3711                 'comment_count': None if interrupted else comment_count
3712             }
3713         return extractor
3714
3715     def _get_comments(self, *args, **kwargs):
3716         raise NotImplementedError('This method must be implemented by subclasses')
3717
3718     @staticmethod
3719     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3720         """ Merge subtitle items for one language. Items with duplicated URLs/data
3721         will be dropped. """
3722         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3723         ret = list(subtitle_list1)
3724         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3725         return ret
3726
3727     @classmethod
3728     def _merge_subtitles(cls, *dicts, target=None):
3729         """ Merge subtitle dictionaries, language by language. """
3730         if target is None:
3731             target = {}
3732         for d in dicts:
3733             for lang, subs in d.items():
3734                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3735         return target
3736
3737     def extract_automatic_captions(self, *args, **kwargs):
3738         if (self.get_param('writeautomaticsub', False)
3739                 or self.get_param('listsubtitles')):
3740             return self._get_automatic_captions(*args, **kwargs)
3741         return {}
3742
3743     def _get_automatic_captions(self, *args, **kwargs):
3744         raise NotImplementedError('This method must be implemented by subclasses')
3745
3746     @functools.cached_property
3747     def _cookies_passed(self):
3748         """Whether cookies have been passed to YoutubeDL"""
3749         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3750
3751     def mark_watched(self, *args, **kwargs):
3752         if not self.get_param('mark_watched', False):
3753             return
3754         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3755             self._mark_watched(*args, **kwargs)
3756
3757     def _mark_watched(self, *args, **kwargs):
3758         raise NotImplementedError('This method must be implemented by subclasses')
3759
3760     def geo_verification_headers(self):
3761         headers = {}
3762         geo_verification_proxy = self.get_param('geo_verification_proxy')
3763         if geo_verification_proxy:
3764             headers['Ytdl-request-proxy'] = geo_verification_proxy
3765         return headers
3766
3767     def _generic_id(self, url):
3768         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3769
3770     def _generic_title(self, url):
3771         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3772
3773     @staticmethod
3774     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3775         all_known = all(map(
3776             lambda x: x is not None,
3777             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3778         return (
3779             'private' if is_private
3780             else 'premium_only' if needs_premium
3781             else 'subscriber_only' if needs_subscription
3782             else 'needs_auth' if needs_auth
3783             else 'unlisted' if is_unlisted
3784             else 'public' if all_known
3785             else None)
3786
3787     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3788         '''
3789         @returns            A list of values for the extractor argument given by "key"
3790                             or "default" if no such key is present
3791         @param default      The default value to return when the key is not present (default: [])
3792         @param casesense    When false, the values are converted to lower case
3793         '''
3794         val = traverse_obj(
3795             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3796         if val is None:
3797             return [] if default is NO_DEFAULT else default
3798         return list(val) if casesense else [x.lower() for x in val]
3799
3800     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3801         if not playlist_id or not video_id:
3802             return not video_id
3803
3804         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3805         if no_playlist is not None:
3806             return not no_playlist
3807
3808         video_id = '' if video_id is True else f' {video_id}'
3809         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3810         if self.get_param('noplaylist'):
3811             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3812             return False
3813         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3814         return True
3815
3816
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching search queries for this extractor's _SEARCH_KEY"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    def _real_extract(self, query):
        """
        Translate the query prefix into a result count and fetch that many results.
        An empty prefix means 1 result, "all" means _MAX_RESULTS, and a number
        is used as-is but capped at _MAX_RESULTS (with a warning).
        """
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice needs None, not infinity, to mean "no limit"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        """Public read-only view of _SEARCH_KEY"""
        return cls._SEARCH_KEY