yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import hashlib
   4 import itertools
   5 import json
   6 import math
   7 import netrc
   8 import os
   9 import random
  10 import sys
  11 import time
  12 import xml.etree.ElementTree
  13
  14 from ..compat import functools, re  # isort: split
  15 from ..compat import (
  16     compat_cookiejar_Cookie,
  17     compat_cookies_SimpleCookie,
  18     compat_etree_fromstring,
  19     compat_expanduser,
  20     compat_getpass,
  21     compat_http_client,
  22     compat_os_name,
  23     compat_str,
  24     compat_urllib_error,
  25     compat_urllib_parse_unquote,
  26     compat_urllib_parse_urlencode,
  27     compat_urllib_request,
  28     compat_urlparse,
  29 )
  30 from ..downloader import FileDownloader
  31 from ..downloader.f4m import get_base_url, remove_encrypted_media
  32 from ..utils import (
  33     JSON_LD_RE,
  34     NO_DEFAULT,
  35     ExtractorError,
  36     GeoRestrictedError,
  37     GeoUtils,
  38     LenientJSONDecoder,
  39     RegexNotFoundError,
  40     UnsupportedError,
  41     age_restricted,
  42     base_url,
  43     bug_reports_message,
  44     classproperty,
  45     clean_html,
  46     determine_ext,
  47     determine_protocol,
  48     dict_get,
  49     encode_data_uri,
  50     error_to_compat_str,
  51     extract_attributes,
  52     filter_dict,
  53     fix_xml_ampersands,
  54     float_or_none,
  55     format_field,
  56     int_or_none,
  57     join_nonempty,
  58     js_to_json,
  59     mimetype2ext,
  60     network_exceptions,
  61     orderedSet,
  62     parse_bitrate,
  63     parse_codecs,
  64     parse_duration,
  65     parse_iso8601,
  66     parse_m3u8_attributes,
  67     parse_resolution,
  68     sanitize_filename,
  69     sanitized_Request,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     traverse_obj,
  74     try_get,
  75     unescapeHTML,
  76     unified_strdate,
  77     unified_timestamp,
  78     update_Request,
  79     update_url_query,
  80     url_basename,
  81     url_or_none,
  82     urljoin,
  83     variadic,
  84     xpath_element,
  85     xpath_text,
  86     xpath_with_ns,
  87 )
  88
  89
  90 class InfoExtractor:
  91     """Information Extractor class.
  92
  93     Information extractors are the classes that, given a URL, extract
  94     information about the video (or videos) the URL refers to. This
  95     information includes the real video URL, the video title, author and
  96     others. The information is stored in a dictionary which is then
  97     passed to the YoutubeDL. The YoutubeDL processes this
  98     information possibly downloading the video to the file system, among
  99     other possible outcomes.
 100
 101     The type field determines the type of the result.
 102     By far the most common value (and the default if _type is missing) is
 103     "video", which indicates a single video.
 104
 105     For a video, the dictionaries must include the following fields:
 106
 107     id:             Video identifier.
 108     title:          Video title, unescaped. Set to an empty string if video has
 109                     no title as opposed to "None" which signifies that the
 110                     extractor failed to obtain a title
 111
 112     Additionally, it must contain either a formats entry or a url one:
 113
 114     formats:        A list of dictionaries for each format available, ordered
 115                     from worst to best quality.
 116
 117                     Potential fields:
 118                     * url        The mandatory URL representing the media:
 119                                    for plain file media - HTTP URL of this file,
 120                                    for RTMP - RTMP URL,
 121                                    for HLS - URL of the M3U8 media playlist,
 122                                    for HDS - URL of the F4M manifest,
 123                                    for DASH
 124                                      - HTTP URL to plain file media (in case of
 125                                        unfragmented media)
 126                                      - URL of the MPD manifest or base URL
 127                                        representing the media if MPD manifest
 128                                        is parsed from a string (in case of
 129                                        fragmented media)
 130                                    for MSS - URL of the ISM manifest.
 131                     * manifest_url
 132                                  The URL of the manifest file in case of
 133                                  fragmented media:
 134                                    for HLS - URL of the M3U8 master playlist,
 135                                    for HDS - URL of the F4M manifest,
 136                                    for DASH - URL of the MPD manifest,
 137                                    for MSS - URL of the ISM manifest.
 138                     * manifest_stream_number  (For internal use only)
 139                                  The index of the stream in the manifest file
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
  143                                  Calculated from the format_id, width, height,
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
 153                     * dynamic_range The dynamic range of the video. One of:
  154                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * vbr        Average video bitrate in KBit/s
 160                     * fps        Frame rate
 161                     * vcodec     Name of the video codec in use
 162                     * container  Name of the container format
 163                     * filesize   The number of bytes, if known in advance
 164                     * filesize_approx  An estimate for the number of bytes
 165                     * player_url SWF Player URL (used for rtmpdump).
 166                     * protocol   The protocol that will be used for the actual
 167                                  download, lower-case. One of "http", "https" or
 168                                  one of the protocols defined in downloader.PROTOCOL_MAP
 169                     * fragment_base_url
 170                                  Base URL for fragments. Each fragment's path
 171                                  value (if present) will be relative to
 172                                  this URL.
 173                     * fragments  A list of fragments of a fragmented media.
 174                                  Each fragment entry must contain either an url
 175                                  or a path. If an url is present it should be
 176                                  considered by a client. Otherwise both path and
 177                                  fragment_base_url must be present. Here is
 178                                  the list of all potential fields:
 179                                  * "url" - fragment's URL
 180                                  * "path" - fragment's path relative to
 181                                             fragment_base_url
 182                                  * "duration" (optional, int or float)
 183                                  * "filesize" (optional, int)
 184                     * is_from_start  Is a live format that can be downloaded
 185                                 from the start. Boolean
 186                     * preference Order number of this format. If this field is
 187                                  present and not None, the formats get sorted
 188                                  by this field, regardless of all other values.
 189                                  -1 for default (order by other properties),
 190                                  -2 or smaller for less than default.
 191                                  < -1000 to hide the format (if there is
 192                                     another one which is strictly better)
 193                     * language   Language code, e.g. "de" or "en-US".
 194                     * language_preference  Is this in the language mentioned in
 195                                  the URL?
 196                                  10 if it's what the URL is about,
 197                                  -1 for default (don't know),
 198                                  -10 otherwise, other values reserved for now.
 199                     * quality    Order number of the video quality of this
 200                                  format, irrespective of the file format.
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * source_preference  Order number for this video source
 204                                   (quality takes higher priority)
 205                                  -1 for default (order by other properties),
 206                                  -2 or smaller for less than default.
 207                     * http_headers  A dictionary of additional HTTP headers
 208                                  to add to the request.
 209                     * stretched_ratio  If given and not 1, indicates that the
 210                                  video's pixels are not square.
 211                                  width : height ratio as float.
 212                     * no_resume  The server does not support resuming the
 213                                  (HTTP or RTMP) download. Boolean.
 214                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 215                     * downloader_options  A dictionary of downloader options
 216                                  (For internal use only)
 217                                  * http_chunk_size Chunk size for HTTP downloads
 218                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 219                     RTMP formats can also have the additional fields: page_url,
 220                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 221                     rtmp_protocol, rtmp_real_time
 222
 223     url:            Final video URL.
 224     ext:            Video filename extension.
 225     format:         The video format, defaults to ext (used for --get-format)
 226     player_url:     SWF Player URL (used for rtmpdump).
 227
 228     The following fields are optional:
 229
 230     direct:         True if a direct video file was given (must only be set by GenericIE)
 231     alt_title:      A secondary title of the video.
 232     display_id      An alternative identifier for the video, not necessarily
 233                     unique, but available before title. Typically, id is
 234                     something like "4234987", title "Dancing naked mole rats",
 235                     and display_id "dancing-naked-mole-rats"
 236     thumbnails:     A list of dictionaries, with the following entries:
 237                         * "id" (optional, string) - Thumbnail format ID
 238                         * "url"
 239                         * "preference" (optional, int) - quality of the image
 240                         * "width" (optional, int)
 241                         * "height" (optional, int)
 242                         * "resolution" (optional, string "{width}x{height}",
 243                                         deprecated)
 244                         * "filesize" (optional, int)
 245                         * "http_headers" (dict) - HTTP headers for the request
 246     thumbnail:      Full URL to a video thumbnail image.
 247     description:    Full video description.
 248     uploader:       Full name of the video uploader.
 249     license:        License name the video is licensed under.
 250     creator:        The creator of the video.
 251     timestamp:      UNIX timestamp of the moment the video was uploaded
 252     upload_date:    Video upload date in UTC (YYYYMMDD).
 253                     If not explicitly set, calculated from timestamp
 254     release_timestamp: UNIX timestamp of the moment the video was released.
 255                     If it is not clear whether to use timestamp or this, use the former
 256     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 257                     If not explicitly set, calculated from release_timestamp
 258     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 259     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 260                     If not explicitly set, calculated from modified_timestamp
 261     uploader_id:    Nickname or id of the video uploader.
 262     uploader_url:   Full URL to a personal webpage of the video uploader.
 263     channel:        Full name of the channel the video is uploaded on.
 264                     Note that channel fields may or may not repeat uploader
 265                     fields. This depends on a particular extractor.
 266     channel_id:     Id of the channel.
 267     channel_url:    Full URL to a channel webpage.
 268     channel_follower_count: Number of followers of the channel.
 269     location:       Physical location where the video was filmed.
 270     subtitles:      The available subtitles as a dictionary in the format
 271                     {tag: subformats}. "tag" is usually a language code, and
 272                     "subformats" is a list sorted from lower to higher
 273                     preference, each element is a dictionary with the "ext"
 274                     entry and one of:
 275                         * "data": The subtitles file contents
 276                         * "url": A URL pointing to the subtitles file
 277                     It can optionally also have:
 278                         * "name": Name or description of the subtitles
 279                         * "http_headers": A dictionary of additional HTTP headers
 280                                   to add to the request.
 281                     "ext" will be calculated from URL if missing
 282     automatic_captions: Like 'subtitles'; contains automatically generated
 283                     captions instead of normal subtitles
 284     duration:       Length of the video in seconds, as an integer or float.
 285     view_count:     How many users have watched the video on the platform.
 286     like_count:     Number of positive ratings of the video
 287     dislike_count:  Number of negative ratings of the video
 288     repost_count:   Number of reposts of the video
  289     average_rating: Average rating given by users, the scale used depends on the webpage
 290     comment_count:  Number of comments on the video
 291     comments:       A list of comments, each with one or more of the following
 292                     properties (all but one of text or html optional):
 293                         * "author" - human-readable name of the comment author
 294                         * "author_id" - user ID of the comment author
 295                         * "author_thumbnail" - The thumbnail of the comment author
 296                         * "id" - Comment ID
 297                         * "html" - Comment as HTML
 298                         * "text" - Plain text of the comment
 299                         * "timestamp" - UNIX timestamp of comment
 300                         * "parent" - ID of the comment this one is replying to.
 301                                      Set to "root" to indicate that this is a
 302                                      comment to the original video.
 303                         * "like_count" - Number of positive ratings of the comment
 304                         * "dislike_count" - Number of negative ratings of the comment
 305                         * "is_favorited" - Whether the comment is marked as
 306                                            favorite by the video uploader
 307                         * "author_is_uploader" - Whether the comment is made by
 308                                                  the video uploader
 309     age_limit:      Age restriction for the video, as an integer (years)
 310     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 311                     should allow to get the same result again. (It will be set
 312                     by YoutubeDL if it's missing)
 313     categories:     A list of categories that the video falls in, for example
 314                     ["Sports", "Berlin"]
 315     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 316     cast:           A list of the video cast
 317     is_live:        True, False, or None (=unknown). Whether this video is a
 318                     live stream that goes on instead of a fixed-length video.
 319     was_live:       True, False, or None (=unknown). Whether this video was
 320                     originally a live stream.
 321     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 322                     If absent, automatically set from is_live, was_live
 323     start_time:     Time in seconds where the reproduction should start, as
 324                     specified in the URL.
 325     end_time:       Time in seconds where the reproduction should end, as
 326                     specified in the URL.
 327     chapters:       A list of dictionaries, with the following entries:
 328                         * "start_time" - The start time of the chapter in seconds
 329                         * "end_time" - The end time of the chapter in seconds
 330                         * "title" (optional, string)
 331     playable_in_embed: Whether this video is allowed to play in embedded
 332                     players on other sites. Can be True (=always allowed),
 333                     False (=never allowed), None (=unknown), or a string
 334                     specifying the criteria for embedability (Eg: 'whitelist')
 335     availability:   Under what condition the video is available. One of
 336                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 337                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 338                     to set it
 339     __post_extractor: A function to be called just before the metadata is
 340                     written to either disk, logger or console. The function
 341                     must return a dict which will be added to the info_dict.
  342                     This is useful for additional information that is
 343                     time-consuming to extract. Note that the fields thus
 344                     extracted will not be available to output template and
 345                     match_filter. So, only "comments" and "comment_count" are
 346                     currently allowed to be extracted via this method.
 347
 348     The following fields should only be used when the video belongs to some logical
 349     chapter or section:
 350
 351     chapter:        Name or title of the chapter the video belongs to.
 352     chapter_number: Number of the chapter the video belongs to, as an integer.
 353     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 354
 355     The following fields should only be used when the video is an episode of some
 356     series, programme or podcast:
 357
 358     series:         Title of the series or programme the video episode belongs to.
 359     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 360     season:         Title of the season the video episode belongs to.
 361     season_number:  Number of the season the video episode belongs to, as an integer.
 362     season_id:      Id of the season the video episode belongs to, as a unicode string.
 363     episode:        Title of the video episode. Unlike mandatory video title field,
 364                     this field should denote the exact title of the video episode
 365                     without any kind of decoration.
 366     episode_number: Number of the video episode within a season, as an integer.
 367     episode_id:     Id of the video episode, as a unicode string.
 368
 369     The following fields should only be used when the media is a track or a part of
 370     a music album:
 371
 372     track:          Title of the track.
 373     track_number:   Number of the track within an album or a disc, as an integer.
 374     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 375                     as a unicode string.
 376     artist:         Artist(s) of the track.
 377     genre:          Genre(s) of the track.
 378     album:          Title of the album the track belongs to.
 379     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 380     album_artist:   List of all artists appeared on the album (e.g.
 381                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 382                     and compilations).
 383     disc_number:    Number of the disc or other physical medium the track belongs to,
 384                     as an integer.
 385     release_year:   Year (YYYY) when the album was released.
 386     composer:       Composer of the piece
 387
 388     Unless mentioned otherwise, the fields should be Unicode strings.
 389
 390     Unless mentioned otherwise, None is equivalent to absence of information.
 391
 392
 393     _type "playlist" indicates multiple videos.
 394     There must be a key "entries", which is a list, an iterable, or a PagedList
 395     object, each element of which is a valid dictionary by this specification.
 396
  397     Additionally, playlists can have "id", "title", and any other relevant
 398     attributes with the same semantics as videos (see above).
 399
 400     It can also have the following optional fields:
 401
 402     playlist_count: The total number of videos in a playlist. If not given,
 403                     YoutubeDL tries to calculate it from "entries"
 404
 405
 406     _type "multi_video" indicates that there are multiple videos that
  407     form a single show, for example multiple acts of an opera or TV episode.
 408     It must have an entries key like a playlist and contain all the keys
 409     required for a video at the same time.
 410
 411
 412     _type "url" indicates that the video must be extracted from another
 413     location, possibly by a different extractor. Its only required key is:
 414     "url" - the next URL to extract.
 415     The key "ie_key" can be set to the class name (minus the trailing "IE",
 416     e.g. "Youtube") if the extractor class is known in advance.
 417     Additionally, the dictionary may have any properties of the resolved entity
 418     known in advance, for example "title" if the title of the referred video is
 419     known ahead of time.
 420
 421
 422     _type "url_transparent" entities have the same specification as "url", but
 423     indicate that the given additional information is more precise than the one
 424     associated with the resolved URL.
 425     This is useful when a site employs a video service that hosts the video and
 426     its technical metadata, but that video service does not embed a useful
 427     title, description etc.
 428
 429
 430     Subclasses of this should define a _VALID_URL regexp and, re-define the
 431     _real_extract() and (optionally) _real_initialize() methods.
 432     Probably, they should also be added to the list of extractors.
 433
 434     Subclasses may also override suitable() if necessary, but ensure the function
 435     signature is preserved and that this function imports everything it needs
 436     (except other extractors), so that lazy_extractors works correctly.
 437
 438     To support username + password (or netrc) login, the extractor must define a
 439     _NETRC_MACHINE and re-define _perform_login(username, password) and
 440     (optionally) _initialize_pre_login() methods. The _perform_login method will
 441     be called between _initialize_pre_login and _real_initialize if credentials
 442     are passed by the user. In cases where it is necessary to have the login
 443     process as part of the extraction rather than initialization, _perform_login
 444     can be left undefined.
 445
 446     _GEO_BYPASS attribute may be set to False in order to disable
 447     geo restriction bypass mechanisms for a particular extractor.
 448     Though it won't disable explicit geo restriction bypass based on
 449     country code provided with geo_bypass_country.
 450
 451     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 452     countries for this extractor. One of these countries will be used by
 453     geo restriction bypass mechanism right away in order to bypass
 454     geo restriction, of course, if the mechanism is not disabled.
 455
 456     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 457     IP blocks in CIDR notation for this extractor. One of these IP blocks
 458     will be used by geo restriction bypass mechanism similarly
 459     to _GEO_COUNTRIES.
 460
 461     The _WORKING attribute should be set to False for broken IEs
 462     in order to warn the users and skip the tests.
 463     """
 464
    # Becomes True at the end of initialize(), so the pre-login, login and
    # _real_initialize() steps run only once per instance
    _ready = False
    # presumably the YoutubeDL instance installed by set_downloader() (called
    # from __init__) - TODO confirm, set_downloader is not visible here
    _downloader = None
    # Fake IPv4 address chosen by _initialize_geo_bypass() to be used as the
    # X-Forwarded-For value; None until one has been picked
    _x_forwarded_for_ip = None
    # Set to False to disable the implicit geo restriction bypass for an extractor
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries for the extractor
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
    # netrc machine name; any truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the extractor.
    # initialize() suppresses its "login not supported" warning when this is False
    IE_DESC = None
    # NOTE(review): not referenced in the visible part of this file - confirm usage
    SEARCH_KEY = None

 475
 476     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 477         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 478         return {
 479             None: '',
 480             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 481             'password': f'Use {password_hint}',
 482             'cookies': (
 483                 'Use --cookies-from-browser or --cookies for the authentication. '
 484                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 485         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 486
 487     def __init__(self, downloader=None):
 488         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 489         If a downloader is not passed during initialization,
 490         it must be set using "set_downloader()" before "extract()" is called"""
 491         self._ready = False
 492         self._x_forwarded_for_ip = None
 493         self._printed_messages = set()
 494         self.set_downloader(downloader)
 495
 496     @classmethod
 497     def _match_valid_url(cls, url):
 498         # This does not use has/getattr intentionally - we want to know whether
 499         # we have cached the regexp for *this* class, whereas getattr would also
 500         # match the superclass
 501         if '_VALID_URL_RE' not in cls.__dict__:
 502             if '_VALID_URL' not in cls.__dict__:
 503                 cls._VALID_URL = cls._make_valid_url()
 504             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 505         return cls._VALID_URL_RE.match(url)
 506
 507     @classmethod
 508     def suitable(cls, url):
 509         """Receives a URL and returns True if suitable for this IE."""
 510         # This function must import everything it needs (except other extractors),
 511         # so that lazy_extractors works correctly
 512         return cls._match_valid_url(url) is not None
 513
 514     @classmethod
 515     def _match_id(cls, url):
 516         return cls._match_valid_url(url).group('id')
 517
 518     @classmethod
 519     def get_temp_id(cls, url):
 520         try:
 521             return cls._match_id(url)
 522         except (IndexError, AttributeError):
 523             return None
 524
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class-level _WORKING flag unchanged. The flag should be
        set to False for broken IEs in order to warn the users and skip the
        tests.
        """
        return cls._WORKING
 529
 530     @classmethod
 531     def supports_login(cls):
 532         return bool(cls._NETRC_MACHINE)
 533
 534     def initialize(self):
 535         """Initializes an instance (authentication, etc)."""
 536         self._printed_messages = set()
 537         self._initialize_geo_bypass({
 538             'countries': self._GEO_COUNTRIES,
 539             'ip_blocks': self._GEO_IP_BLOCKS,
 540         })
 541         if not self._ready:
 542             self._initialize_pre_login()
 543             if self.supports_login():
 544                 username, password = self._get_login_info()
 545                 if username:
 546                     self._perform_login(username, password)
 547             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 548                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 549             self._real_initialize()
 550             self._ready = True
 551
 552     def _initialize_geo_bypass(self, geo_bypass_context):
 553         """
 554         Initialize geo restriction bypass mechanism.
 555
 556         This method is used to initialize geo bypass mechanism based on faking
 557         X-Forwarded-For HTTP header. A random country from provided country list
 558         is selected and a random IP belonging to this country is generated. This
 559         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 560         HTTP requests.
 561
 562         This method will be used for initial geo bypass mechanism initialization
 563         during the instance initialization with _GEO_COUNTRIES and
 564         _GEO_IP_BLOCKS.
 565
 566         You may also manually call it from extractor's code if geo bypass
 567         information is not available beforehand (e.g. obtained during
 568         extraction) or due to some other reason. In this case you should pass
 569         this information in geo bypass context passed as first argument. It may
 570         contain following fields:
 571
 572         countries:  List of geo unrestricted countries (similar
 573                     to _GEO_COUNTRIES)
 574         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 575                     (similar to _GEO_IP_BLOCKS)
 576
 577         """
 578         if not self._x_forwarded_for_ip:
 579
 580             # Geo bypass mechanism is explicitly disabled by user
 581             if not self.get_param('geo_bypass', True):
 582                 return
 583
 584             if not geo_bypass_context:
 585                 geo_bypass_context = {}
 586
 587             # Backward compatibility: previously _initialize_geo_bypass
 588             # expected a list of countries, some 3rd party code may still use
 589             # it this way
 590             if isinstance(geo_bypass_context, (list, tuple)):
 591                 geo_bypass_context = {
 592                     'countries': geo_bypass_context,
 593                 }
 594
 595             # The whole point of geo bypass mechanism is to fake IP
 596             # as X-Forwarded-For HTTP header based on some IP block or
 597             # country code.
 598
 599             # Path 1: bypassing based on IP block in CIDR notation
 600
 601             # Explicit IP block specified by user, use it right away
 602             # regardless of whether extractor is geo bypassable or not
 603             ip_block = self.get_param('geo_bypass_ip_block', None)
 604
 605             # Otherwise use random IP block from geo bypass context but only
 606             # if extractor is known as geo bypassable
 607             if not ip_block:
 608                 ip_blocks = geo_bypass_context.get('ip_blocks')
 609                 if self._GEO_BYPASS and ip_blocks:
 610                     ip_block = random.choice(ip_blocks)
 611
 612             if ip_block:
 613                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 614                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 615                 return
 616
 617             # Path 2: bypassing based on country code
 618
 619             # Explicit country code specified by user, use it right away
 620             # regardless of whether extractor is geo bypassable or not
 621             country = self.get_param('geo_bypass_country', None)
 622
 623             # Otherwise use random country code from geo bypass context but
 624             # only if extractor is known as geo bypassable
 625             if not country:
 626                 countries = geo_bypass_context.get('countries')
 627                 if self._GEO_BYPASS and countries:
 628                     country = random.choice(countries)
 629
 630             if country:
 631                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 632                 self._downloader.write_debug(
 633                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 634
    def extract(self, url):
        """
        Extract the information for url and return the result of _real_extract.

        The extractor is initialized, _real_extract is called and its result is
        post-processed. When a GeoRestrictedError is raised and a fake
        X-Forwarded-For IP can be set up, the extraction is attempted once more.

        Returns None when _real_extract returned None, otherwise the
        (possibly amended) result object.

        ExtractorError instances are re-raised as a new instance of the same
        type, filled in with the video id, extractor name and traceback.
        IncompleteRead, KeyError and StopIteration are converted to
        ExtractorError; UnsupportedError is passed through untouched.
        """
        try:
            # At most two attempts: the second one only runs after a geo
            # restriction error for which a fake IP could be set up
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    # Record the fake IP in the result when one is in use
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    # The 'no-live-chat' compat option removes the 'live_chat' subtitles entry
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            # Must be listed before the ExtractorError handler so that it is not rewrapped
            raise
        except ExtractorError as e:
            # Rebuild the error with the same type, adding whatever context is missing
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            # Only some error types carry a list of countries
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 673
 674     def __maybe_fake_ip_and_retry(self, countries):
 675         if (not self.get_param('geo_bypass_country', None)
 676                 and self._GEO_BYPASS
 677                 and self.get_param('geo_bypass', True)
 678                 and not self._x_forwarded_for_ip
 679                 and countries):
 680             country_code = random.choice(countries)
 681             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 682             if self._x_forwarded_for_ip:
 683                 self.report_warning(
 684                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 685                     % (self._x_forwarded_for_ip, country_code.upper()))
 686                 return True
 687         return False
 688
 689     def set_downloader(self, downloader):
 690         """Sets a YoutubeDL instance as the downloader for this IE."""
 691         self._downloader = downloader
 692
 693     def _initialize_pre_login(self):
 694         """ Intialization before login. Redefine in subclasses."""
 695         pass
 696
 697     def _perform_login(self, username, password):
 698         """ Login with username and password. Redefine in subclasses."""
 699         pass
 700
 701     def _real_initialize(self):
 702         """Real initialization process. Redefine in subclasses."""
 703         pass
 704
 705     def _real_extract(self, url):
 706         """Real extraction process. Redefine in subclasses."""
 707         raise NotImplementedError('This method must be implemented by subclasses')
 708
 709     @classmethod
 710     def ie_key(cls):
 711         """A string for getting the InfoExtractor with get_info_extractor"""
 712         return cls.__name__[:-2]
 713
 714     @classproperty
 715     def IE_NAME(cls):
 716         return cls.__name__[:-2]
 717
 718     @staticmethod
 719     def __can_accept_status_code(err, expected_status):
 720         assert isinstance(err, compat_urllib_error.HTTPError)
 721         if expected_status is None:
 722             return False
 723         elif callable(expected_status):
 724             return expected_status(err.code) is True
 725         else:
 726             return err.code in variadic(expected_status)
 727
 728     def _create_request(self, url_or_request, data=None, headers={}, query={}):
 729         if isinstance(url_or_request, compat_urllib_request.Request):
 730             return update_Request(url_or_request, data=data, headers=headers, query=query)
 731         if query:
 732             url_or_request = update_url_query(url_or_request, query)
 733         return sanitized_Request(url_or_request, data, headers)
 734
 735     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 736         """
 737         Return the response handle.
 738
 739         See _download_webpage docstring for arguments specification.
 740         """
 741         if not self._downloader._first_webpage_request:
 742             sleep_interval = self.get_param('sleep_interval_requests') or 0
 743             if sleep_interval > 0:
 744                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 745                 time.sleep(sleep_interval)
 746         else:
 747             self._downloader._first_webpage_request = False
 748
 749         if note is None:
 750             self.report_download_webpage(video_id)
 751         elif note is not False:
 752             if video_id is None:
 753                 self.to_screen(str(note))
 754             else:
 755                 self.to_screen(f'{video_id}: {note}')
 756
 757         # Some sites check X-Forwarded-For HTTP header in order to figure out
 758         # the origin of the client behind proxy. This allows bypassing geo
 759         # restriction by faking this header's value to IP that belongs to some
 760         # geo unrestricted country. We will do so once we encounter any
 761         # geo restriction error.
 762         if self._x_forwarded_for_ip:
 763             if 'X-Forwarded-For' not in headers:
 764                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 765
 766         try:
 767             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 768         except network_exceptions as err:
 769             if isinstance(err, compat_urllib_error.HTTPError):
 770                 if self.__can_accept_status_code(err, expected_status):
 771                     # Retain reference to error to prevent file object from
 772                     # being closed before it can be read. Works around the
 773                     # effects of <https://bugs.python.org/issue15002>
 774                     # introduced in Python 3.4.1.
 775                     err.fp._error = err
 776                     return err.fp
 777
 778             if errnote is False:
 779                 return False
 780             if errnote is None:
 781                 errnote = 'Unable to download webpage'
 782
 783             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 784             if fatal:
 785                 raise ExtractorError(errmsg, cause=err)
 786             else:
 787                 self.report_warning(errmsg)
 788                 return False
 789
 790     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 791                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 792         """
 793         Return a tuple (page content as string, URL handle).
 794
 795         Arguments:
 796         url_or_request -- plain text URL as a string or
 797             a compat_urllib_request.Requestobject
 798         video_id -- Video/playlist/item identifier (string)
 799
 800         Keyword arguments:
 801         note -- note printed before downloading (string)
 802         errnote -- note printed in case of an error (string)
 803         fatal -- flag denoting whether error should be considered fatal,
 804             i.e. whether it should cause ExtractionError to be raised,
 805             otherwise a warning will be reported and extraction continued
 806         encoding -- encoding for a page content decoding, guessed automatically
 807             when not explicitly specified
 808         data -- POST data (bytes)
 809         headers -- HTTP headers (dict)
 810         query -- URL query (dict)
 811         expected_status -- allows to accept failed HTTP requests (non 2xx
 812             status code) by explicitly specifying a set of accepted status
 813             codes. Can be any of the following entities:
 814                 - an integer type specifying an exact failed status code to
 815                   accept
 816                 - a list or a tuple of integer types specifying a list of
 817                   failed status codes to accept
 818                 - a callable accepting an actual failed status code and
 819                   returning True if it should be accepted
 820             Note that this argument does not affect success status codes (2xx)
 821             which are always accepted.
 822         """
 823
 824         # Strip hashes from the URL (#1038)
 825         if isinstance(url_or_request, (compat_str, str)):
 826             url_or_request = url_or_request.partition('#')[0]
 827
 828         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 829         if urlh is False:
 830             assert not fatal
 831             return False
 832         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 833         return (content, urlh)
 834
 835     @staticmethod
 836     def _guess_encoding_from_content(content_type, webpage_bytes):
 837         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 838         if m:
 839             encoding = m.group(1)
 840         else:
 841             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 842                           webpage_bytes[:1024])
 843             if m:
 844                 encoding = m.group(1).decode('ascii')
 845             elif webpage_bytes.startswith(b'\xff\xfe'):
 846                 encoding = 'utf-16'
 847             else:
 848                 encoding = 'utf-8'
 849
 850         return encoding
 851
 852     def __check_blocked(self, content):
 853         first_block = content[:512]
 854         if ('<title>Access to this site is blocked</title>' in content
 855                 and 'Websense' in first_block):
 856             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 857             blocked_iframe = self._html_search_regex(
 858                 r'<iframe src="([^"]+)"', content,
 859                 'Websense information URL', default=None)
 860             if blocked_iframe:
 861                 msg += ' Visit %s for more details' % blocked_iframe
 862             raise ExtractorError(msg, expected=True)
 863         if '<title>The URL you requested has been blocked</title>' in first_block:
 864             msg = (
 865                 'Access to this webpage has been blocked by Indian censorship. '
 866                 'Use a VPN or proxy server (with --proxy) to route around it.')
 867             block_msg = self._html_search_regex(
 868                 r'</h1><p>(.*?)</p>',
 869                 content, 'block message', default=None)
 870             if block_msg:
 871                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 872             raise ExtractorError(msg, expected=True)
 873         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 874                 and 'blocklist.rkn.gov.ru' in content):
 875             raise ExtractorError(
 876                 'Access to this webpage has been blocked by decision of the Russian government. '
 877                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 878                 expected=True)
 879
 880     def _request_dump_filename(self, url, video_id):
 881         basen = f'{video_id}_{url}'
 882         trim_length = self.get_param('trim_file_name') or 240
 883         if len(basen) > trim_length:
 884             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 885             basen = basen[:trim_length - len(h)] + h
 886         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 887         # Working around MAX_PATH limitation on Windows (see
 888         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 889         if compat_os_name == 'nt':
 890             absfilepath = os.path.abspath(filename)
 891             if len(absfilepath) > 259:
 892                 filename = fR'\\?\{absfilepath}'
 893         return filename
 894
 895     def __decode_webpage(self, webpage_bytes, encoding, headers):
 896         if not encoding:
 897             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 898         try:
 899             return webpage_bytes.decode(encoding, 'replace')
 900         except LookupError:
 901             return webpage_bytes.decode('utf-8', 'replace')
 902
 903     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 904         webpage_bytes = urlh.read()
 905         if prefix is not None:
 906             webpage_bytes = prefix + webpage_bytes
 907         if self.get_param('dump_intermediate_pages', False):
 908             self.to_screen('Dumping request to ' + urlh.geturl())
 909             dump = base64.b64encode(webpage_bytes).decode('ascii')
 910             self._downloader.to_screen(dump)
 911         if self.get_param('write_pages'):
 912             filename = self._request_dump_filename(urlh.geturl(), video_id)
 913             self.to_screen(f'Saving request to {filename}')
 914             with open(filename, 'wb') as outf:
 915                 outf.write(webpage_bytes)
 916
 917         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 918         self.__check_blocked(content)
 919
 920         return content
 921
 922     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 923         if transform_source:
 924             xml_string = transform_source(xml_string)
 925         try:
 926             return compat_etree_fromstring(xml_string.encode('utf-8'))
 927         except xml.etree.ElementTree.ParseError as ve:
 928             errmsg = '%s: Failed to parse XML ' % video_id
 929             if fatal:
 930                 raise ExtractorError(errmsg, cause=ve)
 931             else:
 932                 self.report_warning(errmsg + str(ve))
 933
 934     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
 935         try:
 936             return json.loads(
 937                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 938         except ValueError as ve:
 939             errmsg = f'{video_id}: Failed to parse JSON'
 940             if fatal:
 941                 raise ExtractorError(errmsg, cause=ve)
 942             else:
 943                 self.report_warning(f'{errmsg}: {ve}')
 944
 945     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 946         return self._parse_json(
 947             data[data.find('{'):data.rfind('}') + 1],
 948             video_id, transform_source, fatal)
 949
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build the pair of methods (_download_<name>_handle, _download_<name>).

        This is a plain helper called from the class body, it takes no self.

        name -- middle part of the generated method names
        parser -- name of the method that parses the downloaded content,
            or None to return the content as is
        note, errnote -- default notes of the generated methods
        return_value -- description of the return value, used in the
            generated docstrings
        """

        def parse(ie, content, *args, **kwargs):
            """Run the configured parser of ie on content; without a parser return content unchanged."""
            if parser is None:
                return content
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Becomes _download_<name>_handle; its docstring is set by impersonate() below
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            # False means the download failed non-fatally
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal), urlh

        # Becomes _download_<name>; its docstring is set by impersonate() below
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With 'load_pages' set, try the dump file of this request first
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # Could not read the dump: warn and fall through to a real download
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source, fatal)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method is called without transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            # The handle method returns False or a (content, handle) pair; only the content is wanted
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            """Give func the final method name, qualified name and docstring."""
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1011
    # Generated download methods. Each call returns the pair
    # (_download_<name>_handle, _download_<name>); the arguments are: name,
    # parser method name, default note, default error note, description of
    # the return value
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For webpages only the content-returning method is kept (index 1),
    # under a private name; no parser is applied to the content
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1019
1020     def _download_webpage(
1021             self, url_or_request, video_id, note=None, errnote=None,
1022             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1023         """
1024         Return the data of the page as a string.
1025
1026         Keyword arguments:
1027         tries -- number of tries
1028         timeout -- sleep interval between tries
1029
1030         See _download_webpage_handle docstring for other arguments specification.
1031         """
1032
1033         R''' # NB: These are unused; should they be deprecated?
1034         if tries != 1:
1035             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1036         if timeout is NO_DEFAULT:
1037             timeout = 5
1038         else:
1039             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1040         '''
1041
1042         try_count = 0
1043         while True:
1044             try:
1045                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1046             except compat_http_client.IncompleteRead as e:
1047                 try_count += 1
1048                 if try_count >= tries:
1049                     raise e
1050                 self._sleep(timeout, video_id)
1051
1052     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1053         idstr = format_field(video_id, None, '%s: ')
1054         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1055         if only_once:
1056             if f'WARNING: {msg}' in self._printed_messages:
1057                 return
1058             self._printed_messages.add(f'WARNING: {msg}')
1059         self._downloader.report_warning(msg, *args, **kwargs)
1060
1061     def to_screen(self, msg, *args, **kwargs):
1062         """Print msg to screen, prefixing it with '[ie_name]'"""
1063         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1064
1065     def write_debug(self, msg, *args, **kwargs):
1066         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1067
1068     def get_param(self, name, default=None, *args, **kwargs):
1069         if self._downloader:
1070             return self._downloader.params.get(name, default, *args, **kwargs)
1071         return default
1072
1073     def report_drm(self, video_id, partial=False):
1074         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1075
1076     def report_extraction(self, id_or_name):
1077         """Report information extraction."""
1078         self.to_screen('%s: Extracting information' % id_or_name)
1079
1080     def report_download_webpage(self, video_id):
1081         """Report webpage download."""
1082         self.to_screen('%s: Downloading webpage' % video_id)
1083
1084     def report_age_confirmation(self):
1085         """Report attempt to confirm age."""
1086         self.to_screen('Confirming age')
1087
1088     def report_login(self):
1089         """Report attempt to log in."""
1090         self.to_screen('Logging in')
1091
1092     def raise_login_required(
1093             self, msg='This video is only available for registered users',
1094             metadata_available=False, method=NO_DEFAULT):
1095         if metadata_available and (
1096                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1097             self.report_warning(msg)
1098             return
1099         msg += format_field(self._login_hint(method), None, '. %s')
1100         raise ExtractorError(msg, expected=True)
1101
1102     def raise_geo_restricted(
1103             self, msg='This video is not available from your location due to geo restriction',
1104             countries=None, metadata_available=False):
1105         if metadata_available and (
1106                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1107             self.report_warning(msg)
1108         else:
1109             raise GeoRestrictedError(msg, countries=countries)
1110
1111     def raise_no_formats(self, msg, expected=False, video_id=None):
1112         if expected and (
1113                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1114             self.report_warning(msg, video_id)
1115         elif isinstance(msg, ExtractorError):
1116             raise msg
1117         else:
1118             raise ExtractorError(msg, expected=expected, video_id=video_id)
1119
1120     # Methods for following #608
1121     @staticmethod
1122     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1123         """Returns a URL that points to a page that should be processed"""
1124         if ie is not None:
1125             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1126         if video_id is not None:
1127             kwargs['id'] = video_id
1128         if video_title is not None:
1129             kwargs['title'] = video_title
1130         return {
1131             **kwargs,
1132             '_type': 'url_transparent' if url_transparent else 'url',
1133             'url': url,
1134         }
1135
1136     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1137         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1138                 for m in orderedSet(map(getter, matches) if getter else matches))
1139         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1140
1141     @staticmethod
1142     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1143         """Returns a playlist"""
1144         if playlist_id:
1145             kwargs['id'] = playlist_id
1146         if playlist_title:
1147             kwargs['title'] = playlist_title
1148         if playlist_description is not None:
1149             kwargs['description'] = playlist_description
1150         return {
1151             **kwargs,
1152             '_type': 'multi_video' if multi_video else 'playlist',
1153             'entries': entries,
1154         }
1155
1156     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1157         """
1158         Perform a regex search on the given string, using a single or a list of
1159         patterns returning the first matching group.
1160         In case of failure return a default value or raise a WARNING or a
1161         RegexNotFoundError, depending on fatal, specifying the field name.
1162         """
1163         if string is None:
1164             mobj = None
1165         elif isinstance(pattern, (str, re.Pattern)):
1166             mobj = re.search(pattern, string, flags)
1167         else:
1168             for p in pattern:
1169                 mobj = re.search(p, string, flags)
1170                 if mobj:
1171                     break
1172
1173         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1174
1175         if mobj:
1176             if group is None:
1177                 # return the first matching group
1178                 return next(g for g in mobj.groups() if g is not None)
1179             elif isinstance(group, (list, tuple)):
1180                 return tuple(mobj.group(g) for g in group)
1181             else:
1182                 return mobj.group(group)
1183         elif default is not NO_DEFAULT:
1184             return default
1185         elif fatal:
1186             raise RegexNotFoundError('Unable to extract %s' % _name)
1187         else:
1188             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1189             return None
1190
1191     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1192                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1193         """Searches string for the JSON object specified by start_pattern"""
1194         # NB: end_pattern is only used to reduce the size of the initial match
1195         if default is NO_DEFAULT:
1196             default, has_default = {}, False
1197         else:
1198             fatal, has_default = False, True
1199
1200         json_string = self._search_regex(
1201             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1202             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1203         if not json_string:
1204             return default
1205
1206         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1207         try:
1208             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1209         except ExtractorError as e:
1210             if fatal:
1211                 raise ExtractorError(
1212                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1213             elif not has_default:
1214                 self.report_warning(
1215                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1216         return default
1217
1218     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1219         """
1220         Like _search_regex, but strips HTML tags and unescapes entities.
1221         """
1222         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1223         if res:
1224             return clean_html(res).strip()
1225         else:
1226             return res
1227
1228     def _get_netrc_login_info(self, netrc_machine=None):
1229         username = None
1230         password = None
1231         netrc_machine = netrc_machine or self._NETRC_MACHINE
1232
1233         if self.get_param('usenetrc', False):
1234             try:
1235                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1236                 if os.path.isdir(netrc_file):
1237                     netrc_file = os.path.join(netrc_file, '.netrc')
1238                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1239                 if info is not None:
1240                     username = info[0]
1241                     password = info[2]
1242                 else:
1243                     raise netrc.NetrcParseError(
1244                         'No authenticators for %s' % netrc_machine)
1245             except (OSError, netrc.NetrcParseError) as err:
1246                 self.report_warning(
1247                     'parsing .netrc: %s' % error_to_compat_str(err))
1248
1249         return username, password
1250
1251     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1252         """
1253         Get the login info as (username, password)
1254         First look for the manually specified credentials using username_option
1255         and password_option as keys in params dictionary. If no such credentials
1256         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1257         value.
1258         If there's no info available, return (None, None)
1259         """
1260
1261         # Attempt to use provided username and password or .netrc data
1262         username = self.get_param(username_option)
1263         if username is not None:
1264             password = self.get_param(password_option)
1265         else:
1266             username, password = self._get_netrc_login_info(netrc_machine)
1267
1268         return username, password
1269
1270     def _get_tfa_info(self, note='two-factor verification code'):
1271         """
1272         Get the two-factor authentication info
1273         TODO - asking the user will be required for sms/phone verify
1274         currently just uses the command line option
1275         If there's no info available, return None
1276         """
1277
1278         tfa = self.get_param('twofactor')
1279         if tfa is not None:
1280             return tfa
1281
1282         return compat_getpass('Type %s and press [Return]: ' % note)
1283
1284     # Helper functions for extracting OpenGraph info
1285     @staticmethod
1286     def _og_regexes(prop):
1287         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1288         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1289                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1290         template = r'<meta[^>]+?%s[^>]+?%s'
1291         return [
1292             template % (property_re, content_re),
1293             template % (content_re, property_re),
1294         ]
1295
1296     @staticmethod
1297     def _meta_regex(prop):
1298         return r'''(?isx)<meta
1299                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1300                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1301
1302     def _og_search_property(self, prop, html, name=None, **kargs):
1303         prop = variadic(prop)
1304         if name is None:
1305             name = 'OpenGraph %s' % prop[0]
1306         og_regexes = []
1307         for p in prop:
1308             og_regexes.extend(self._og_regexes(p))
1309         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1310         if escaped is None:
1311             return None
1312         return unescapeHTML(escaped)
1313
1314     def _og_search_thumbnail(self, html, **kargs):
1315         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1316
1317     def _og_search_description(self, html, **kargs):
1318         return self._og_search_property('description', html, fatal=False, **kargs)
1319
1320     def _og_search_title(self, html, *, fatal=False, **kargs):
1321         return self._og_search_property('title', html, fatal=fatal, **kargs)
1322
1323     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1324         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1325         if secure:
1326             regexes = self._og_regexes('video:secure_url') + regexes
1327         return self._html_search_regex(regexes, html, name, **kargs)
1328
1329     def _og_search_url(self, html, **kargs):
1330         return self._og_search_property('url', html, **kargs)
1331
1332     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1333         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1334
1335     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1336         name = variadic(name)
1337         if display_name is None:
1338             display_name = name[0]
1339         return self._html_search_regex(
1340             [self._meta_regex(n) for n in name],
1341             html, display_name, fatal=fatal, group='content', **kwargs)
1342
1343     def _dc_search_uploader(self, html):
1344         return self._html_search_meta('dc.creator', html, 'uploader')
1345
1346     def _rta_search(self, html):
1347         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1348         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1349                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1350                      html):
1351             return 18
1352         return 0
1353
1354     def _media_rating_search(self, html):
1355         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1356         rating = self._html_search_meta('rating', html)
1357
1358         if not rating:
1359             return None
1360
1361         RATING_TABLE = {
1362             'safe for kids': 0,
1363             'general': 8,
1364             '14 years': 14,
1365             'mature': 17,
1366             'restricted': 19,
1367         }
1368         return RATING_TABLE.get(rating.lower())
1369
1370     def _family_friendly_search(self, html):
1371         # See http://schema.org/VideoObject
1372         family_friendly = self._html_search_meta(
1373             'isFamilyFriendly', html, default=None)
1374
1375         if not family_friendly:
1376             return None
1377
1378         RATING_TABLE = {
1379             '1': 0,
1380             'true': 0,
1381             '0': 18,
1382             'false': 18,
1383         }
1384         return RATING_TABLE.get(family_friendly.lower())
1385
1386     def _twitter_search_player(self, html):
1387         return self._html_search_meta('twitter:player', html,
1388                                       'twitter card player')
1389
1390     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1391         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1392         default = kwargs.get('default', NO_DEFAULT)
1393         # JSON-LD may be malformed and thus `fatal` should be respected.
1394         # At the same time `default` may be passed that assumes `fatal=False`
1395         # for _search_regex. Let's simulate the same behavior here as well.
1396         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1397         json_ld = []
1398         for mobj in json_ld_list:
1399             json_ld_item = self._parse_json(
1400                 mobj.group('json_ld'), video_id, fatal=fatal)
1401             if not json_ld_item:
1402                 continue
1403             if isinstance(json_ld_item, dict):
1404                 json_ld.append(json_ld_item)
1405             elif isinstance(json_ld_item, (list, tuple)):
1406                 json_ld.extend(json_ld_item)
1407         if json_ld:
1408             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1409         if json_ld:
1410             return json_ld
1411         if default is not NO_DEFAULT:
1412             return default
1413         elif fatal:
1414             raise RegexNotFoundError('Unable to extract JSON-LD')
1415         else:
1416             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1417             return {}
1418
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        json_ld may be a JSON string (parsed with _parse_json, honouring
        fatal), a dict, or a list/tuple of dicts. Anything else, or an empty
        value, gives an empty dict. When expected_type is set, only entries
        whose '@type' contains it are used and traversal stops after the
        first such entry. The collected info is passed through filter_dict
        before being returned.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator: all the nested helpers below write into it
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # schema.org action type (last path component) -> info dict count prefix
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # True if any of expected_types occurs in e's '@type'
            # (NOTE(review): '@type' is presumably a string or a list of strings - confirm)
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # 'interactionType' is either the type itself or a dict carrying it in '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill the '<kind>_count' fields of info from e's InteractionCounter entries
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # The type may be given as a URL; only its last component is looked up
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # A count that is already set is never overwritten
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Chapters come from the 'hasPart' entries whose '@type' is 'Clip'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Fill gaps from the neighbours: a missing end_time becomes the next
            # chapter's start_time, a missing start_time the previous chapter's
            # end_time (0 for the first). zip() ends with chapters[1:], so the
            # last chapter is not visited by this loop.
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter runs to the end of the video unless it says otherwise;
                # info['duration'] has been set by extract_video_object before this is called
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copy the fields of a VideoObject entry into info
            assert is_type(e, 'VideoObject')
            author = e.get('author')
            info.update({
                'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': url}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in json_ld:
                # Top-level entries must carry a '@context'
                if at_top_level and '@context' not in e:
                    continue
                # An entry that is only a '@graph' wrapper is expanded in place;
                # nothing after it is looked at
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title when no title was found yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # An article may embed its video as the first 'video' or 'subjectOf' item
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject'):
                    extract_video_object(e)
                    # Without an expected type keep merging further entries,
                    # otherwise the first matching entry is final
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any other entry may still carry a nested VideoObject under 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        return filter_dict(info)
1583
1584     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1585         return self._parse_json(
1586             self._search_regex(
1587                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1588                 webpage, 'next.js data', fatal=fatal, **kw),
1589             video_id, transform_source=transform_source, fatal=fatal)
1590
1591     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1592         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1593         rectx = re.escape(context_name)
1594         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1595         js, arg_keys, arg_vals = self._search_regex(
1596             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1597             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1598
1599         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1600
1601         for key, val in args.items():
1602             if val in ('undefined', 'void 0'):
1603                 args[key] = 'null'
1604
1605         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1606         return traverse_obj(ret, traverse) or {}
1607
1608     @staticmethod
1609     def _hidden_inputs(html):
1610         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1611         hidden_inputs = {}
1612         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1613             attrs = extract_attributes(input)
1614             if not input:
1615                 continue
1616             if attrs.get('type') not in ('hidden', 'submit'):
1617                 continue
1618             name = attrs.get('name') or attrs.get('id')
1619             value = attrs.get('value')
1620             if name and value is not None:
1621                 hidden_inputs[name] = value
1622         return hidden_inputs
1623
1624     def _form_hidden_inputs(self, form_id, html):
1625         form = self._search_regex(
1626             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1627             html, '%s form' % form_id, group='form')
1628         return self._hidden_inputs(form)
1629
1630     class FormatSort:
        # One sort item: optional "+" (reverse), a field name, then optionally a
        # separator ("~" selects closest-match, ":" a plain limit) followed by
        # the limit text. Surrounding spaces are allowed.
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Sort items always appended after the user's and the extractor's own
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): not referenced in this part of the file; judging by the
        # name, the youtube-dl compatible ordering - confirm against the users
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration, read through _get_field_setting, which also
        # fills in defaults for missing keys. Keys used in this part of the file:
        #   type        'ordered', 'combined', 'multiple', 'extractor', 'boolean', 'alias'
        #   field       the field(s) the entry stands for; for an alias, its target
        #   convert     how _resolve_field_value turns a limit text into a value
        #   order / order_free / regex
        #               ranking list for 'ordered' fields (order_free is used with
        #               prefer_free_formats; with regex the items are patterns)
        #   forced / priority
        #               put the field at the front of the sort order (priority
        #               only when format_sort_force is not set)
        #   same_limit  a 'combined' field passes one limit to all its sub-fields
        #   deprecated  using this alias triggers a deprecation warning
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # Fields made up of several other fields
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1713
1714         def __init__(self, ie, field_preference):
1715             self._order = []
1716             self.ydl = ie._downloader
1717             self.evaluate_params(self.ydl.params, field_preference)
1718             if ie.get_param('verbose'):
1719                 self.print_verbose_info(self.ydl.write_debug)
1720
1721         def _get_field_setting(self, field, key):
1722             if field not in self.settings:
1723                 if key in ('forced', 'priority'):
1724                     return False
1725                 self.ydl.deprecation_warning(
1726                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1727                     'and may be removed in a future version')
1728                 self.settings[field] = {}
1729             propObj = self.settings[field]
1730             if key not in propObj:
1731                 type = propObj.get('type')
1732                 if key == 'field':
1733                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1734                 elif key == 'convert':
1735                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1736                 else:
1737                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1738                 propObj[key] = default
1739             return propObj[key]
1740
1741         def _resolve_field_value(self, field, value, convertNone=False):
1742             if value is None:
1743                 if not convertNone:
1744                     return None
1745             else:
1746                 value = value.lower()
1747             conversion = self._get_field_setting(field, 'convert')
1748             if conversion == 'ignore':
1749                 return None
1750             if conversion == 'string':
1751                 return value
1752             elif conversion == 'float_none':
1753                 return float_or_none(value)
1754             elif conversion == 'bytes':
1755                 return FileDownloader.parse_bytes(value)
1756             elif conversion == 'order':
1757                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1758                 use_regex = self._get_field_setting(field, 'regex')
1759                 list_length = len(order_list)
1760                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1761                 if use_regex and value is not None:
1762                     for i, regex in enumerate(order_list):
1763                         if regex and re.match(regex, value):
1764                             return list_length - i
1765                     return list_length - empty_pos  # not in list
1766                 else:  # not regex or  value = None
1767                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1768             else:
1769                 if value.isnumeric():
1770                     return float(value)
1771                 else:
1772                     self.settings[field]['convert'] = 'string'
1773                     return value
1774
1775         def evaluate_params(self, params, sort_extractor):
1776             self._use_free_order = params.get('prefer_free_formats', False)
1777             self._sort_user = params.get('format_sort', [])
1778             self._sort_extractor = sort_extractor
1779
1780             def add_item(field, reverse, closest, limit_text):
1781                 field = field.lower()
1782                 if field in self._order:
1783                     return
1784                 self._order.append(field)
1785                 limit = self._resolve_field_value(field, limit_text)
1786                 data = {
1787                     'reverse': reverse,
1788                     'closest': False if limit is None else closest,
1789                     'limit_text': limit_text,
1790                     'limit': limit}
1791                 if field in self.settings:
1792                     self.settings[field].update(data)
1793                 else:
1794                     self.settings[field] = data
1795
1796             sort_list = (
1797                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1798                 + (tuple() if params.get('format_sort_force', False)
1799                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1800                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1801
1802             for item in sort_list:
1803                 match = re.match(self.regex, item)
1804                 if match is None:
1805                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1806                 field = match.group('field')
1807                 if field is None:
1808                     continue
1809                 if self._get_field_setting(field, 'type') == 'alias':
1810                     alias, field = field, self._get_field_setting(field, 'field')
1811                     if self._get_field_setting(alias, 'deprecated'):
1812                         self.ydl.deprecation_warning(
1813                             f'Format sorting alias {alias} is deprecated '
1814                             f'and may be removed in a future version. Please use {field} instead')
1815                 reverse = match.group('reverse') is not None
1816                 closest = match.group('separator') == '~'
1817                 limit_text = match.group('limit')
1818
1819                 has_limit = limit_text is not None
1820                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1821                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1822
1823                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1824                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1825                 limit_count = len(limits)
1826                 for (i, f) in enumerate(fields):
1827                     add_item(f, reverse, closest,
1828                              limits[i] if i < limit_count
1829                              else limits[0] if has_limit and not has_multiple_limits
1830                              else None)
1831
1832         def print_verbose_info(self, write_debug):
1833             if self._sort_user:
1834                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1835             if self._sort_extractor:
1836                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1837             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1838                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1839                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1840                               self._get_field_setting(field, 'limit_text'),
1841                               self._get_field_setting(field, 'limit'))
1842                 if self._get_field_setting(field, 'limit_text') is not None else '')
1843                 for field in self._order if self._get_field_setting(field, 'visible')]))
1844
1845         def _calculate_field_preference_from_value(self, format, field, type, value):
1846             reverse = self._get_field_setting(field, 'reverse')
1847             closest = self._get_field_setting(field, 'closest')
1848             limit = self._get_field_setting(field, 'limit')
1849
1850             if type == 'extractor':
1851                 maximum = self._get_field_setting(field, 'max')
1852                 if value is None or (maximum is not None and value >= maximum):
1853                     value = -1
1854             elif type == 'boolean':
1855                 in_list = self._get_field_setting(field, 'in_list')
1856                 not_in_list = self._get_field_setting(field, 'not_in_list')
1857                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1858             elif type == 'ordered':
1859                 value = self._resolve_field_value(field, value, True)
1860
1861             # try to convert to number
1862             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1863             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1864             if is_num:
1865                 value = val_num
1866
1867             return ((-10, 0) if value is None
1868                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1869                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1870                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1871                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1872                     else (-1, value, 0))
1873
1874         def _calculate_field_preference(self, format, field):
1875             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1876             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1877             if type == 'multiple':
1878                 type = 'field'  # Only 'field' is allowed in multiple for now
1879                 actual_fields = self._get_field_setting(field, 'field')
1880
1881                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1882             else:
1883                 value = get_value(field)
1884             return self._calculate_field_preference_from_value(format, field, type, value)
1885
1886         def calculate_preference(self, format):
1887             # Determine missing protocol
1888             if not format.get('protocol'):
1889                 format['protocol'] = determine_protocol(format)
1890
1891             # Determine missing ext
1892             if not format.get('ext') and 'url' in format:
1893                 format['ext'] = determine_ext(format['url'])
1894             if format.get('vcodec') == 'none':
1895                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1896                 format['video_ext'] = 'none'
1897             else:
1898                 format['video_ext'] = format['ext']
1899                 format['audio_ext'] = 'none'
1900             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1901             #    format['preference'] = -1000
1902
1903             # Determine missing bitrates
1904             if format.get('tbr') is None:
1905                 if format.get('vbr') is not None and format.get('abr') is not None:
1906                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1907             else:
1908                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1909                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1910                 if format.get('acodec') != 'none' and format.get('abr') is None:
1911                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1912
1913             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1914
1915     def _sort_formats(self, formats, field_preference=[]):
1916         if not formats:
1917             return
1918         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1919
1920     def _check_formats(self, formats, video_id):
1921         if formats:
1922             formats[:] = filter(
1923                 lambda f: self._is_valid_url(
1924                     f['url'], video_id,
1925                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1926                 formats)
1927
1928     @staticmethod
1929     def _remove_duplicate_formats(formats):
1930         format_urls = set()
1931         unique_formats = []
1932         for f in formats:
1933             if f['url'] not in format_urls:
1934                 format_urls.add(f['url'])
1935                 unique_formats.append(f)
1936         formats[:] = unique_formats
1937
1938     def _is_valid_url(self, url, video_id, item='video', headers={}):
1939         url = self._proto_relative_url(url, scheme='http:')
1940         # For now assume non HTTP(S) URLs always valid
1941         if not (url.startswith('http://') or url.startswith('https://')):
1942             return True
1943         try:
1944             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1945             return True
1946         except ExtractorError as e:
1947             self.to_screen(
1948                 '%s: %s URL is invalid, skipping: %s'
1949                 % (video_id, item, error_to_compat_str(e.cause)))
1950             return False
1951
1952     def http_scheme(self):
1953         """ Either "http:" or "https:", depending on the user's preferences """
1954         return (
1955             'http:'
1956             if self.get_param('prefer_insecure', False)
1957             else 'https:')
1958
1959     def _proto_relative_url(self, url, scheme=None):
1960         if url is None:
1961             return url
1962         if url.startswith('//'):
1963             if scheme is None:
1964                 scheme = self.http_scheme()
1965             return scheme + url
1966         else:
1967             return url
1968
1969     def _sleep(self, timeout, video_id, msg_template=None):
1970         if msg_template is None:
1971             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1972         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1973         self.to_screen(msg)
1974         time.sleep(timeout)
1975
1976     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1977                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1978                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1979         res = self._download_xml_handle(
1980             manifest_url, video_id, 'Downloading f4m manifest',
1981             'Unable to download f4m manifest',
1982             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1983             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1984             transform_source=transform_source,
1985             fatal=fatal, data=data, headers=headers, query=query)
1986         if res is False:
1987             return []
1988
1989         manifest, urlh = res
1990         manifest_url = urlh.geturl()
1991
1992         return self._parse_f4m_formats(
1993             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1994             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1995
1996     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1997                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1998                            fatal=True, m3u8_id=None):
1999         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2000             return []
2001
2002         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2003         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2004         if akamai_pv is not None and ';' in akamai_pv.text:
2005             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2006             if playerVerificationChallenge.strip() != '':
2007                 return []
2008
2009         formats = []
2010         manifest_version = '1.0'
2011         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2012         if not media_nodes:
2013             manifest_version = '2.0'
2014             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2015         # Remove unsupported DRM protected media from final formats
2016         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2017         media_nodes = remove_encrypted_media(media_nodes)
2018         if not media_nodes:
2019             return formats
2020
2021         manifest_base_url = get_base_url(manifest)
2022
2023         bootstrap_info = xpath_element(
2024             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2025             'bootstrap info', default=None)
2026
2027         vcodec = None
2028         mime_type = xpath_text(
2029             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2030             'base URL', default=None)
2031         if mime_type and mime_type.startswith('audio/'):
2032             vcodec = 'none'
2033
2034         for i, media_el in enumerate(media_nodes):
2035             tbr = int_or_none(media_el.attrib.get('bitrate'))
2036             width = int_or_none(media_el.attrib.get('width'))
2037             height = int_or_none(media_el.attrib.get('height'))
2038             format_id = join_nonempty(f4m_id, tbr or i)
2039             # If <bootstrapInfo> is present, the specified f4m is a
2040             # stream-level manifest, and only set-level manifests may refer to
2041             # external resources.  See section 11.4 and section 4 of F4M spec
2042             if bootstrap_info is None:
2043                 media_url = None
2044                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2045                 if manifest_version == '2.0':
2046                     media_url = media_el.attrib.get('href')
2047                 if media_url is None:
2048                     media_url = media_el.attrib.get('url')
2049                 if not media_url:
2050                     continue
2051                 manifest_url = (
2052                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2053                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2054                 # If media_url is itself a f4m manifest do the recursive extraction
2055                 # since bitrates in parent manifest (this one) and media_url manifest
2056                 # may differ leading to inability to resolve the format by requested
2057                 # bitrate in f4m downloader
2058                 ext = determine_ext(manifest_url)
2059                 if ext == 'f4m':
2060                     f4m_formats = self._extract_f4m_formats(
2061                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2062                         transform_source=transform_source, fatal=fatal)
2063                     # Sometimes stream-level manifest contains single media entry that
2064                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2065                     # At the same time parent's media entry in set-level manifest may
2066                     # contain it. We will copy it from parent in such cases.
2067                     if len(f4m_formats) == 1:
2068                         f = f4m_formats[0]
2069                         f.update({
2070                             'tbr': f.get('tbr') or tbr,
2071                             'width': f.get('width') or width,
2072                             'height': f.get('height') or height,
2073                             'format_id': f.get('format_id') if not tbr else format_id,
2074                             'vcodec': vcodec,
2075                         })
2076                     formats.extend(f4m_formats)
2077                     continue
2078                 elif ext == 'm3u8':
2079                     formats.extend(self._extract_m3u8_formats(
2080                         manifest_url, video_id, 'mp4', preference=preference,
2081                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2082                     continue
2083             formats.append({
2084                 'format_id': format_id,
2085                 'url': manifest_url,
2086                 'manifest_url': manifest_url,
2087                 'ext': 'flv' if bootstrap_info is not None else None,
2088                 'protocol': 'f4m',
2089                 'tbr': tbr,
2090                 'width': width,
2091                 'height': height,
2092                 'vcodec': vcodec,
2093                 'preference': preference,
2094                 'quality': quality,
2095             })
2096         return formats
2097
2098     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2099         return {
2100             'format_id': join_nonempty(m3u8_id, 'meta'),
2101             'url': m3u8_url,
2102             'ext': ext,
2103             'protocol': 'm3u8',
2104             'preference': preference - 100 if preference else -100,
2105             'quality': quality,
2106             'resolution': 'multiple',
2107             'format_note': 'Quality selection URL',
2108         }
2109
2110     def _report_ignoring_subs(self, name):
2111         self.report_warning(bug_reports_message(
2112             f'Ignoring subtitle tracks found in the {name} manifest; '
2113             'if any subtitle tracks are missing,'
2114         ), only_once=True)
2115
2116     def _extract_m3u8_formats(self, *args, **kwargs):
2117         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2118         if subs:
2119             self._report_ignoring_subs('HLS')
2120         return fmts
2121
2122     def _extract_m3u8_formats_and_subtitles(
2123             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2124             preference=None, quality=None, m3u8_id=None, note=None,
2125             errnote=None, fatal=True, live=False, data=None, headers={},
2126             query={}):
2127
2128         res = self._download_webpage_handle(
2129             m3u8_url, video_id,
2130             note='Downloading m3u8 information' if note is None else note,
2131             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2132             fatal=fatal, data=data, headers=headers, query=query)
2133
2134         if res is False:
2135             return [], {}
2136
2137         m3u8_doc, urlh = res
2138         m3u8_url = urlh.geturl()
2139
2140         return self._parse_m3u8_formats_and_subtitles(
2141             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2142             preference=preference, quality=quality, m3u8_id=m3u8_id,
2143             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2144             headers=headers, query=query, video_id=video_id)
2145
2146     def _parse_m3u8_formats_and_subtitles(
2147             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2148             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2149             errnote=None, fatal=True, data=None, headers={}, query={},
2150             video_id=None):
2151         formats, subtitles = [], {}
2152
2153         has_drm = re.search('|'.join([
2154             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2155             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2156         ]), m3u8_doc)
2157
2158         def format_url(url):
2159             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2160
2161         if self.get_param('hls_split_discontinuity', False):
2162             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2163                 if not m3u8_doc:
2164                     if not manifest_url:
2165                         return []
2166                     m3u8_doc = self._download_webpage(
2167                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2168                         note=False, errnote='Failed to download m3u8 playlist information')
2169                     if m3u8_doc is False:
2170                         return []
2171                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2172
2173         else:
2174             def _extract_m3u8_playlist_indices(*args, **kwargs):
2175                 return [None]
2176
2177         # References:
2178         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2179         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2180         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2181
2182         # We should try extracting formats only from master playlists [1, 4.3.4],
2183         # i.e. playlists that describe available qualities. On the other hand
2184         # media playlists [1, 4.3.3] should be returned as is since they contain
2185         # just the media without qualities renditions.
2186         # Fortunately, master playlist can be easily distinguished from media
2187         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2188         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2189         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2190         # media playlist and MUST NOT appear in master playlist thus we can
2191         # clearly detect media playlist with this criterion.
2192
2193         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2194             formats = [{
2195                 'format_id': join_nonempty(m3u8_id, idx),
2196                 'format_index': idx,
2197                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2198                 'ext': ext,
2199                 'protocol': entry_protocol,
2200                 'preference': preference,
2201                 'quality': quality,
2202                 'has_drm': has_drm,
2203             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2204
2205             return formats, subtitles
2206
2207         groups = {}
2208         last_stream_inf = {}
2209
2210         def extract_media(x_media_line):
2211             media = parse_m3u8_attributes(x_media_line)
2212             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2213             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2214             if not (media_type and group_id and name):
2215                 return
2216             groups.setdefault(group_id, []).append(media)
2217             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2218             if media_type == 'SUBTITLES':
2219                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2220                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2221                 # However, lack of URI has been spotted in the wild.
2222                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2223                 if not media.get('URI'):
2224                     return
2225                 url = format_url(media['URI'])
2226                 sub_info = {
2227                     'url': url,
2228                     'ext': determine_ext(url),
2229                 }
2230                 if sub_info['ext'] == 'm3u8':
2231                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2232                     # files may contain is WebVTT:
2233                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2234                     sub_info['ext'] = 'vtt'
2235                     sub_info['protocol'] = 'm3u8_native'
2236                 lang = media.get('LANGUAGE') or 'und'
2237                 subtitles.setdefault(lang, []).append(sub_info)
2238             if media_type not in ('VIDEO', 'AUDIO'):
2239                 return
2240             media_url = media.get('URI')
2241             if media_url:
2242                 manifest_url = format_url(media_url)
2243                 formats.extend({
2244                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2245                     'format_note': name,
2246                     'format_index': idx,
2247                     'url': manifest_url,
2248                     'manifest_url': m3u8_url,
2249                     'language': media.get('LANGUAGE'),
2250                     'ext': ext,
2251                     'protocol': entry_protocol,
2252                     'preference': preference,
2253                     'quality': quality,
2254                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2255                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2256
2257         def build_stream_name():
2258             # Despite specification does not mention NAME attribute for
2259             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2260             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2261             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2262             stream_name = last_stream_inf.get('NAME')
2263             if stream_name:
2264                 return stream_name
2265             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2266             # from corresponding rendition group
2267             stream_group_id = last_stream_inf.get('VIDEO')
2268             if not stream_group_id:
2269                 return
2270             stream_group = groups.get(stream_group_id)
2271             if not stream_group:
2272                 return stream_group_id
2273             rendition = stream_group[0]
2274             return rendition.get('NAME') or stream_group_id
2275
2276         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2277         # chance to detect video only formats when EXT-X-STREAM-INF tags
2278         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2279         for line in m3u8_doc.splitlines():
2280             if line.startswith('#EXT-X-MEDIA:'):
2281                 extract_media(line)
2282
2283         for line in m3u8_doc.splitlines():
2284             if line.startswith('#EXT-X-STREAM-INF:'):
2285                 last_stream_inf = parse_m3u8_attributes(line)
2286             elif line.startswith('#') or not line.strip():
2287                 continue
2288             else:
2289                 tbr = float_or_none(
2290                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2291                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2292                 manifest_url = format_url(line.strip())
2293
2294                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2295                     format_id = [m3u8_id, None, idx]
2296                     # Bandwidth of live streams may differ over time thus making
2297                     # format_id unpredictable. So it's better to keep provided
2298                     # format_id intact.
2299                     if not live:
2300                         stream_name = build_stream_name()
2301                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2302                     f = {
2303                         'format_id': join_nonempty(*format_id),
2304                         'format_index': idx,
2305                         'url': manifest_url,
2306                         'manifest_url': m3u8_url,
2307                         'tbr': tbr,
2308                         'ext': ext,
2309                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2310                         'protocol': entry_protocol,
2311                         'preference': preference,
2312                         'quality': quality,
2313                     }
2314                     resolution = last_stream_inf.get('RESOLUTION')
2315                     if resolution:
2316                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2317                         if mobj:
2318                             f['width'] = int(mobj.group('width'))
2319                             f['height'] = int(mobj.group('height'))
2320                     # Unified Streaming Platform
2321                     mobj = re.search(
2322                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2323                     if mobj:
2324                         abr, vbr = mobj.groups()
2325                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2326                         f.update({
2327                             'vbr': vbr,
2328                             'abr': abr,
2329                         })
2330                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2331                     f.update(codecs)
2332                     audio_group_id = last_stream_inf.get('AUDIO')
2333                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2334                     # references a rendition group MUST have a CODECS attribute.
2335                     # However, this is not always respected, for example, [2]
2336                     # contains EXT-X-STREAM-INF tag which references AUDIO
2337                     # rendition group but does not have CODECS and despite
2338                     # referencing an audio group it represents a complete
2339                     # (with audio and video) format. So, for such cases we will
2340                     # ignore references to rendition groups and treat them
2341                     # as complete formats.
2342                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2343                         audio_group = groups.get(audio_group_id)
2344                         if audio_group and audio_group[0].get('URI'):
2345                             # TODO: update acodec for audio only formats with
2346                             # the same GROUP-ID
2347                             f['acodec'] = 'none'
2348                     if not f.get('ext'):
2349                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2350                     formats.append(f)
2351
2352                     # for DailyMotion
2353                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2354                     if progressive_uri:
2355                         http_f = f.copy()
2356                         del http_f['manifest_url']
2357                         http_f.update({
2358                             'format_id': f['format_id'].replace('hls-', 'http-'),
2359                             'protocol': 'http',
2360                             'url': progressive_uri,
2361                         })
2362                         formats.append(http_f)
2363
2364                 last_stream_inf = {}
2365         return formats, subtitles
2366
2367     def _extract_m3u8_vod_duration(
2368             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2369
2370         m3u8_vod = self._download_webpage(
2371             m3u8_vod_url, video_id,
2372             note='Downloading m3u8 VOD manifest' if note is None else note,
2373             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2374             fatal=False, data=data, headers=headers, query=query)
2375
2376         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2377
2378     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2379         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2380             return None
2381
2382         return int(sum(
2383             float(line[len('#EXTINF:'):].split(',')[0])
2384             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2385
2386     @staticmethod
2387     def _xpath_ns(path, namespace=None):
2388         if not namespace:
2389             return path
2390         out = []
2391         for c in path.split('/'):
2392             if not c or c == '.':
2393                 out.append(c)
2394             else:
2395                 out.append('{%s}%s' % (namespace, c))
2396         return '/'.join(out)
2397
2398     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2399         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2400         if res is False:
2401             assert not fatal
2402             return [], {}
2403
2404         smil, urlh = res
2405         smil_url = urlh.geturl()
2406
2407         namespace = self._parse_smil_namespace(smil)
2408
2409         fmts = self._parse_smil_formats(
2410             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2411         subs = self._parse_smil_subtitles(
2412             smil, namespace=namespace)
2413
2414         return fmts, subs
2415
2416     def _extract_smil_formats(self, *args, **kwargs):
2417         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2418         if subs:
2419             self._report_ignoring_subs('SMIL')
2420         return fmts
2421
2422     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2423         res = self._download_smil(smil_url, video_id, fatal=fatal)
2424         if res is False:
2425             return {}
2426
2427         smil, urlh = res
2428         smil_url = urlh.geturl()
2429
2430         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2431
2432     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2433         return self._download_xml_handle(
2434             smil_url, video_id, 'Downloading SMIL file',
2435             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2436
2437     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2438         namespace = self._parse_smil_namespace(smil)
2439
2440         formats = self._parse_smil_formats(
2441             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2442         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2443
2444         video_id = os.path.splitext(url_basename(smil_url))[0]
2445         title = None
2446         description = None
2447         upload_date = None
2448         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2449             name = meta.attrib.get('name')
2450             content = meta.attrib.get('content')
2451             if not name or not content:
2452                 continue
2453             if not title and name == 'title':
2454                 title = content
2455             elif not description and name in ('description', 'abstract'):
2456                 description = content
2457             elif not upload_date and name == 'date':
2458                 upload_date = unified_strdate(content)
2459
2460         thumbnails = [{
2461             'id': image.get('type'),
2462             'url': image.get('src'),
2463             'width': int_or_none(image.get('width')),
2464             'height': int_or_none(image.get('height')),
2465         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2466
2467         return {
2468             'id': video_id,
2469             'title': title or video_id,
2470             'description': description,
2471             'upload_date': upload_date,
2472             'thumbnails': thumbnails,
2473             'formats': formats,
2474             'subtitles': subtitles,
2475         }
2476
2477     def _parse_smil_namespace(self, smil):
2478         return self._search_regex(
2479             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2480
2481     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2482         base = smil_url
2483         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2484             b = meta.get('base') or meta.get('httpBase')
2485             if b:
2486                 base = b
2487                 break
2488
2489         formats = []
2490         rtmp_count = 0
2491         http_count = 0
2492         m3u8_count = 0
2493         imgs_count = 0
2494
2495         srcs = set()
2496         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2497         for medium in media:
2498             src = medium.get('src')
2499             if not src or src in srcs:
2500                 continue
2501             srcs.add(src)
2502
2503             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2504             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2505             width = int_or_none(medium.get('width'))
2506             height = int_or_none(medium.get('height'))
2507             proto = medium.get('proto')
2508             ext = medium.get('ext')
2509             src_ext = determine_ext(src)
2510             streamer = medium.get('streamer') or base
2511
2512             if proto == 'rtmp' or streamer.startswith('rtmp'):
2513                 rtmp_count += 1
2514                 formats.append({
2515                     'url': streamer,
2516                     'play_path': src,
2517                     'ext': 'flv',
2518                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2519                     'tbr': bitrate,
2520                     'filesize': filesize,
2521                     'width': width,
2522                     'height': height,
2523                 })
2524                 if transform_rtmp_url:
2525                     streamer, src = transform_rtmp_url(streamer, src)
2526                     formats[-1].update({
2527                         'url': streamer,
2528                         'play_path': src,
2529                     })
2530                 continue
2531
2532             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2533             src_url = src_url.strip()
2534
2535             if proto == 'm3u8' or src_ext == 'm3u8':
2536                 m3u8_formats = self._extract_m3u8_formats(
2537                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2538                 if len(m3u8_formats) == 1:
2539                     m3u8_count += 1
2540                     m3u8_formats[0].update({
2541                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2542                         'tbr': bitrate,
2543                         'width': width,
2544                         'height': height,
2545                     })
2546                 formats.extend(m3u8_formats)
2547             elif src_ext == 'f4m':
2548                 f4m_url = src_url
2549                 if not f4m_params:
2550                     f4m_params = {
2551                         'hdcore': '3.2.0',
2552                         'plugin': 'flowplayer-3.2.0.1',
2553                     }
2554                 f4m_url += '&' if '?' in f4m_url else '?'
2555                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2556                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2557             elif src_ext == 'mpd':
2558                 formats.extend(self._extract_mpd_formats(
2559                     src_url, video_id, mpd_id='dash', fatal=False))
2560             elif re.search(r'\.ism/[Mm]anifest', src_url):
2561                 formats.extend(self._extract_ism_formats(
2562                     src_url, video_id, ism_id='mss', fatal=False))
2563             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2564                 http_count += 1
2565                 formats.append({
2566                     'url': src_url,
2567                     'ext': ext or src_ext or 'flv',
2568                     'format_id': 'http-%d' % (bitrate or http_count),
2569                     'tbr': bitrate,
2570                     'filesize': filesize,
2571                     'width': width,
2572                     'height': height,
2573                 })
2574
2575         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2576             src = medium.get('src')
2577             if not src or src in srcs:
2578                 continue
2579             srcs.add(src)
2580
2581             imgs_count += 1
2582             formats.append({
2583                 'format_id': 'imagestream-%d' % (imgs_count),
2584                 'url': src,
2585                 'ext': mimetype2ext(medium.get('type')),
2586                 'acodec': 'none',
2587                 'vcodec': 'none',
2588                 'width': int_or_none(medium.get('width')),
2589                 'height': int_or_none(medium.get('height')),
2590                 'format_note': 'SMIL storyboards',
2591             })
2592
2593         return formats
2594
2595     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2596         urls = []
2597         subtitles = {}
2598         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2599             src = textstream.get('src')
2600             if not src or src in urls:
2601                 continue
2602             urls.append(src)
2603             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2604             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2605             subtitles.setdefault(lang, []).append({
2606                 'url': src,
2607                 'ext': ext,
2608             })
2609         return subtitles
2610
2611     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2612         res = self._download_xml_handle(
2613             xspf_url, playlist_id, 'Downloading xpsf playlist',
2614             'Unable to download xspf manifest', fatal=fatal)
2615         if res is False:
2616             return []
2617
2618         xspf, urlh = res
2619         xspf_url = urlh.geturl()
2620
2621         return self._parse_xspf(
2622             xspf, playlist_id, xspf_url=xspf_url,
2623             xspf_base_url=base_url(xspf_url))
2624
2625     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2626         NS_MAP = {
2627             'xspf': 'http://xspf.org/ns/0/',
2628             's1': 'http://static.streamone.nl/player/ns/0',
2629         }
2630
2631         entries = []
2632         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2633             title = xpath_text(
2634                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2635             description = xpath_text(
2636                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2637             thumbnail = xpath_text(
2638                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2639             duration = float_or_none(
2640                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2641
2642             formats = []
2643             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2644                 format_url = urljoin(xspf_base_url, location.text)
2645                 if not format_url:
2646                     continue
2647                 formats.append({
2648                     'url': format_url,
2649                     'manifest_url': xspf_url,
2650                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2651                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2652                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2653                 })
2654             self._sort_formats(formats)
2655
2656             entries.append({
2657                 'id': playlist_id,
2658                 'title': title,
2659                 'description': description,
2660                 'thumbnail': thumbnail,
2661                 'duration': duration,
2662                 'formats': formats,
2663             })
2664         return entries
2665
2666     def _extract_mpd_formats(self, *args, **kwargs):
2667         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2668         if subs:
2669             self._report_ignoring_subs('DASH')
2670         return fmts
2671
2672     def _extract_mpd_formats_and_subtitles(
2673             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2674             fatal=True, data=None, headers={}, query={}):
2675         res = self._download_xml_handle(
2676             mpd_url, video_id,
2677             note='Downloading MPD manifest' if note is None else note,
2678             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2679             fatal=fatal, data=data, headers=headers, query=query)
2680         if res is False:
2681             return [], {}
2682         mpd_doc, urlh = res
2683         if mpd_doc is None:
2684             return [], {}
2685
2686         # We could have been redirected to a new url when we retrieved our mpd file.
2687         mpd_url = urlh.geturl()
2688         mpd_base_url = base_url(mpd_url)
2689
2690         return self._parse_mpd_formats_and_subtitles(
2691             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2692
2693     def _parse_mpd_formats(self, *args, **kwargs):
2694         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2695         if subs:
2696             self._report_ignoring_subs('DASH')
2697         return fmts
2698
2699     def _parse_mpd_formats_and_subtitles(
2700             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2701         """
2702         Parse formats from MPD manifest.
2703         References:
2704          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2705             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2706          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2707         """
2708         if not self.get_param('dynamic_mpd', True):
2709             if mpd_doc.get('type') == 'dynamic':
2710                 return [], {}
2711
2712         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2713
2714         def _add_ns(path):
2715             return self._xpath_ns(path, namespace)
2716
2717         def is_drm_protected(element):
2718             return element.find(_add_ns('ContentProtection')) is not None
2719
2720         def extract_multisegment_info(element, ms_parent_info):
2721             ms_info = ms_parent_info.copy()
2722
2723             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2724             # common attributes and elements.  We will only extract relevant
2725             # for us.
2726             def extract_common(source):
2727                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2728                 if segment_timeline is not None:
2729                     s_e = segment_timeline.findall(_add_ns('S'))
2730                     if s_e:
2731                         ms_info['total_number'] = 0
2732                         ms_info['s'] = []
2733                         for s in s_e:
2734                             r = int(s.get('r', 0))
2735                             ms_info['total_number'] += 1 + r
2736                             ms_info['s'].append({
2737                                 't': int(s.get('t', 0)),
2738                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2739                                 'd': int(s.attrib['d']),
2740                                 'r': r,
2741                             })
2742                 start_number = source.get('startNumber')
2743                 if start_number:
2744                     ms_info['start_number'] = int(start_number)
2745                 timescale = source.get('timescale')
2746                 if timescale:
2747                     ms_info['timescale'] = int(timescale)
2748                 segment_duration = source.get('duration')
2749                 if segment_duration:
2750                     ms_info['segment_duration'] = float(segment_duration)
2751
2752             def extract_Initialization(source):
2753                 initialization = source.find(_add_ns('Initialization'))
2754                 if initialization is not None:
2755                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2756
2757             segment_list = element.find(_add_ns('SegmentList'))
2758             if segment_list is not None:
2759                 extract_common(segment_list)
2760                 extract_Initialization(segment_list)
2761                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2762                 if segment_urls_e:
2763                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2764             else:
2765                 segment_template = element.find(_add_ns('SegmentTemplate'))
2766                 if segment_template is not None:
2767                     extract_common(segment_template)
2768                     media = segment_template.get('media')
2769                     if media:
2770                         ms_info['media'] = media
2771                     initialization = segment_template.get('initialization')
2772                     if initialization:
2773                         ms_info['initialization'] = initialization
2774                     else:
2775                         extract_Initialization(segment_template)
2776             return ms_info
2777
2778         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2779         formats, subtitles = [], {}
2780         stream_numbers = collections.defaultdict(int)
2781         for period in mpd_doc.findall(_add_ns('Period')):
2782             period_duration = parse_duration(period.get('duration')) or mpd_duration
2783             period_ms_info = extract_multisegment_info(period, {
2784                 'start_number': 1,
2785                 'timescale': 1,
2786             })
2787             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2788                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2789                 for representation in adaptation_set.findall(_add_ns('Representation')):
2790                     representation_attrib = adaptation_set.attrib.copy()
2791                     representation_attrib.update(representation.attrib)
2792                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2793                     mime_type = representation_attrib['mimeType']
2794                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2795
2796                     codec_str = representation_attrib.get('codecs', '')
2797                     # Some kind of binary subtitle found in some youtube livestreams
2798                     if mime_type == 'application/x-rawcc':
2799                         codecs = {'scodec': codec_str}
2800                     else:
2801                         codecs = parse_codecs(codec_str)
2802                     if content_type not in ('video', 'audio', 'text'):
2803                         if mime_type == 'image/jpeg':
2804                             content_type = mime_type
2805                         elif codecs.get('vcodec', 'none') != 'none':
2806                             content_type = 'video'
2807                         elif codecs.get('acodec', 'none') != 'none':
2808                             content_type = 'audio'
2809                         elif codecs.get('scodec', 'none') != 'none':
2810                             content_type = 'text'
2811                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2812                             content_type = 'text'
2813                         else:
2814                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2815                             continue
2816
2817                     base_url = ''
2818                     for element in (representation, adaptation_set, period, mpd_doc):
2819                         base_url_e = element.find(_add_ns('BaseURL'))
2820                         if base_url_e is not None:
2821                             base_url = base_url_e.text + base_url
2822                             if re.match(r'^https?://', base_url):
2823                                 break
2824                     if mpd_base_url and base_url.startswith('/'):
2825                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2826                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2827                         if not mpd_base_url.endswith('/'):
2828                             mpd_base_url += '/'
2829                         base_url = mpd_base_url + base_url
2830                     representation_id = representation_attrib.get('id')
2831                     lang = representation_attrib.get('lang')
2832                     url_el = representation.find(_add_ns('BaseURL'))
2833                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2834                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2835                     if representation_id is not None:
2836                         format_id = representation_id
2837                     else:
2838                         format_id = content_type
2839                     if mpd_id:
2840                         format_id = mpd_id + '-' + format_id
2841                     if content_type in ('video', 'audio'):
2842                         f = {
2843                             'format_id': format_id,
2844                             'manifest_url': mpd_url,
2845                             'ext': mimetype2ext(mime_type),
2846                             'width': int_or_none(representation_attrib.get('width')),
2847                             'height': int_or_none(representation_attrib.get('height')),
2848                             'tbr': float_or_none(bandwidth, 1000),
2849                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2850                             'fps': int_or_none(representation_attrib.get('frameRate')),
2851                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2852                             'format_note': 'DASH %s' % content_type,
2853                             'filesize': filesize,
2854                             'container': mimetype2ext(mime_type) + '_dash',
2855                             **codecs
2856                         }
2857                     elif content_type == 'text':
2858                         f = {
2859                             'ext': mimetype2ext(mime_type),
2860                             'manifest_url': mpd_url,
2861                             'filesize': filesize,
2862                         }
2863                     elif content_type == 'image/jpeg':
2864                         # See test case in VikiIE
2865                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2866                         f = {
2867                             'format_id': format_id,
2868                             'ext': 'mhtml',
2869                             'manifest_url': mpd_url,
2870                             'format_note': 'DASH storyboards (jpeg)',
2871                             'acodec': 'none',
2872                             'vcodec': 'none',
2873                         }
2874                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2875                         f['has_drm'] = True
2876                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2877
2878                     def prepare_template(template_name, identifiers):
2879                         tmpl = representation_ms_info[template_name]
2880                         # First of, % characters outside $...$ templates
2881                         # must be escaped by doubling for proper processing
2882                         # by % operator string formatting used further (see
2883                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2884                         t = ''
2885                         in_template = False
2886                         for c in tmpl:
2887                             t += c
2888                             if c == '$':
2889                                 in_template = not in_template
2890                             elif c == '%' and not in_template:
2891                                 t += c
2892                         # Next, $...$ templates are translated to their
2893                         # %(...) counterparts to be used with % operator
2894                         if representation_id is not None:
2895                             t = t.replace('$RepresentationID$', representation_id)
2896                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2897                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2898                         t.replace('$$', '$')
2899                         return t
2900
2901                     # @initialization is a regular template like @media one
2902                     # so it should be handled just the same way (see
2903                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2904                     if 'initialization' in representation_ms_info:
2905                         initialization_template = prepare_template(
2906                             'initialization',
2907                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2908                             # $Time$ shall not be included for @initialization thus
2909                             # only $Bandwidth$ remains
2910                             ('Bandwidth', ))
2911                         representation_ms_info['initialization_url'] = initialization_template % {
2912                             'Bandwidth': bandwidth,
2913                         }
2914
2915                     def location_key(location):
2916                         return 'url' if re.match(r'^https?://', location) else 'path'
2917
2918                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2919
2920                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2921                         media_location_key = location_key(media_template)
2922
2923                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2924                         # can't be used at the same time
2925                         if '%(Number' in media_template and 's' not in representation_ms_info:
2926                             segment_duration = None
2927                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2928                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2929                                 representation_ms_info['total_number'] = int(math.ceil(
2930                                     float_or_none(period_duration, segment_duration, default=0)))
2931                             representation_ms_info['fragments'] = [{
2932                                 media_location_key: media_template % {
2933                                     'Number': segment_number,
2934                                     'Bandwidth': bandwidth,
2935                                 },
2936                                 'duration': segment_duration,
2937                             } for segment_number in range(
2938                                 representation_ms_info['start_number'],
2939                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2940                         else:
2941                             # $Number*$ or $Time$ in media template with S list available
2942                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2943                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2944                             representation_ms_info['fragments'] = []
2945                             segment_time = 0
2946                             segment_d = None
2947                             segment_number = representation_ms_info['start_number']
2948
2949                             def add_segment_url():
2950                                 segment_url = media_template % {
2951                                     'Time': segment_time,
2952                                     'Bandwidth': bandwidth,
2953                                     'Number': segment_number,
2954                                 }
2955                                 representation_ms_info['fragments'].append({
2956                                     media_location_key: segment_url,
2957                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2958                                 })
2959
2960                             for num, s in enumerate(representation_ms_info['s']):
2961                                 segment_time = s.get('t') or segment_time
2962                                 segment_d = s['d']
2963                                 add_segment_url()
2964                                 segment_number += 1
2965                                 for r in range(s.get('r', 0)):
2966                                     segment_time += segment_d
2967                                     add_segment_url()
2968                                     segment_number += 1
2969                                 segment_time += segment_d
2970                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2971                         # No media template
2972                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2973                         # or any YouTube dashsegments video
2974                         fragments = []
2975                         segment_index = 0
2976                         timescale = representation_ms_info['timescale']
2977                         for s in representation_ms_info['s']:
2978                             duration = float_or_none(s['d'], timescale)
2979                             for r in range(s.get('r', 0) + 1):
2980                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2981                                 fragments.append({
2982                                     location_key(segment_uri): segment_uri,
2983                                     'duration': duration,
2984                                 })
2985                                 segment_index += 1
2986                         representation_ms_info['fragments'] = fragments
2987                     elif 'segment_urls' in representation_ms_info:
2988                         # Segment URLs with no SegmentTimeline
2989                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2990                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2991                         fragments = []
2992                         segment_duration = float_or_none(
2993                             representation_ms_info['segment_duration'],
2994                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2995                         for segment_url in representation_ms_info['segment_urls']:
2996                             fragment = {
2997                                 location_key(segment_url): segment_url,
2998                             }
2999                             if segment_duration:
3000                                 fragment['duration'] = segment_duration
3001                             fragments.append(fragment)
3002                         representation_ms_info['fragments'] = fragments
3003                     # If there is a fragments key available then we correctly recognized fragmented media.
3004                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3005                     # assumption is not necessarily correct since we may simply have no support for
3006                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3007                     if 'fragments' in representation_ms_info:
3008                         f.update({
3009                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3010                             'url': mpd_url or base_url,
3011                             'fragment_base_url': base_url,
3012                             'fragments': [],
3013                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3014                         })
3015                         if 'initialization_url' in representation_ms_info:
3016                             initialization_url = representation_ms_info['initialization_url']
3017                             if not f.get('url'):
3018                                 f['url'] = initialization_url
3019                             f['fragments'].append({location_key(initialization_url): initialization_url})
3020                         f['fragments'].extend(representation_ms_info['fragments'])
3021                         if not period_duration:
3022                             period_duration = try_get(
3023                                 representation_ms_info,
3024                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3025                     else:
3026                         # Assuming direct URL to unfragmented media.
3027                         f['url'] = base_url
3028                     if content_type in ('video', 'audio', 'image/jpeg'):
3029                         f['manifest_stream_number'] = stream_numbers[f['url']]
3030                         stream_numbers[f['url']] += 1
3031                         formats.append(f)
3032                     elif content_type == 'text':
3033                         subtitles.setdefault(lang or 'und', []).append(f)
3034
3035         return formats, subtitles
3036
3037     def _extract_ism_formats(self, *args, **kwargs):
3038         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3039         if subs:
3040             self._report_ignoring_subs('ISM')
3041         return fmts
3042
3043     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3044         res = self._download_xml_handle(
3045             ism_url, video_id,
3046             note='Downloading ISM manifest' if note is None else note,
3047             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3048             fatal=fatal, data=data, headers=headers, query=query)
3049         if res is False:
3050             return [], {}
3051         ism_doc, urlh = res
3052         if ism_doc is None:
3053             return [], {}
3054
3055         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3056
3057     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3058         """
3059         Parse formats from ISM manifest.
3060         References:
3061          1. [MS-SSTR]: Smooth Streaming Protocol,
3062             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3063         """
3064         if ism_doc.get('IsLive') == 'TRUE':
3065             return [], {}
3066
3067         duration = int(ism_doc.attrib['Duration'])
3068         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3069
3070         formats = []
3071         subtitles = {}
3072         for stream in ism_doc.findall('StreamIndex'):
3073             stream_type = stream.get('Type')
3074             if stream_type not in ('video', 'audio', 'text'):
3075                 continue
3076             url_pattern = stream.attrib['Url']
3077             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3078             stream_name = stream.get('Name')
3079             stream_language = stream.get('Language', 'und')
3080             for track in stream.findall('QualityLevel'):
3081                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3082                 # TODO: add support for WVC1 and WMAP
3083                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3084                     self.report_warning('%s is not a supported codec' % fourcc)
3085                     continue
3086                 tbr = int(track.attrib['Bitrate']) // 1000
3087                 # [1] does not mention Width and Height attributes. However,
3088                 # they're often present while MaxWidth and MaxHeight are
3089                 # missing, so should be used as fallbacks
3090                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3091                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3092                 sampling_rate = int_or_none(track.get('SamplingRate'))
3093
3094                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3095                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3096
3097                 fragments = []
3098                 fragment_ctx = {
3099                     'time': 0,
3100                 }
3101                 stream_fragments = stream.findall('c')
3102                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3103                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3104                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3105                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3106                     if not fragment_ctx['duration']:
3107                         try:
3108                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3109                         except IndexError:
3110                             next_fragment_time = duration
3111                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3112                     for _ in range(fragment_repeat):
3113                         fragments.append({
3114                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3115                             'duration': fragment_ctx['duration'] / stream_timescale,
3116                         })
3117                         fragment_ctx['time'] += fragment_ctx['duration']
3118
3119                 if stream_type == 'text':
3120                     subtitles.setdefault(stream_language, []).append({
3121                         'ext': 'ismt',
3122                         'protocol': 'ism',
3123                         'url': ism_url,
3124                         'manifest_url': ism_url,
3125                         'fragments': fragments,
3126                         '_download_params': {
3127                             'stream_type': stream_type,
3128                             'duration': duration,
3129                             'timescale': stream_timescale,
3130                             'fourcc': fourcc,
3131                             'language': stream_language,
3132                             'codec_private_data': track.get('CodecPrivateData'),
3133                         }
3134                     })
3135                 elif stream_type in ('video', 'audio'):
3136                     formats.append({
3137                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3138                         'url': ism_url,
3139                         'manifest_url': ism_url,
3140                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3141                         'width': width,
3142                         'height': height,
3143                         'tbr': tbr,
3144                         'asr': sampling_rate,
3145                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3146                         'acodec': 'none' if stream_type == 'video' else fourcc,
3147                         'protocol': 'ism',
3148                         'fragments': fragments,
3149                         'has_drm': ism_doc.find('Protection') is not None,
3150                         '_download_params': {
3151                             'stream_type': stream_type,
3152                             'duration': duration,
3153                             'timescale': stream_timescale,
3154                             'width': width or 0,
3155                             'height': height or 0,
3156                             'fourcc': fourcc,
3157                             'language': stream_language,
3158                             'codec_private_data': track.get('CodecPrivateData'),
3159                             'sampling_rate': sampling_rate,
3160                             'channels': int_or_none(track.get('Channels', 2)),
3161                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3162                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3163                         },
3164                     })
3165         return formats, subtitles
3166
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Extract media entries from the HTML5 <video>/<audio> tags of a webpage.

        The amp-* and dl8-* variants of these tags are handled as well.

        @param base_url  URL that relative media URLs are resolved against;
                         also set as the Referer header of every format
        @param webpage   HTML to search
        @param video_id  Video id passed on to the manifest extractors
        @param m3u8_id, m3u8_entry_protocol, preference, quality
                         Passed on to _extract_m3u8_formats for m3u8 sources
        @param mpd_id    Passed on to _extract_mpd_formats for mpd sources
        @returns         A list with one dict per media tag that produced at least
                         one format or subtitle; each dict has the keys
                         'formats', 'subtitles' and 'thumbnail'
        """
        def absolute_url(item_url):
            """Resolve item_url against base_url"""
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            """Turn a "type" attribute (mimetype with optional codecs) into format fields"""
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            """
            Build the formats for a single source URL.

            Returns (is_plain_url, formats). is_plain_url is False when src was
            expanded through an m3u8 or mpd manifest and True when it is used
            directly as a single format.
            """
            type_info = type_info or {}
            full_url = absolute_url(src)
            # The extension derived from the declared type wins over the URL's
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no content, hence the trailing empty string
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # A src on the media tag itself
            src = strip_or_none(media_attributes.get('src'))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # <source> children of the media tag
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fill the missing dimensions from the labels
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the bitrate of the first label that yields one
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The fields of the plain format (url, vcodec, ext) are
                        # applied last and so override those set above
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> children; a track without a kind is treated as subtitles
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            # Tags that yielded neither formats nor subtitles are dropped
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3292
3293     def _extract_akamai_formats(self, *args, **kwargs):
3294         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3295         if subs:
3296             self._report_ignoring_subs('akamai')
3297         return fmts
3298
3299     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3300         signed = 'hdnea=' in manifest_url
3301         if not signed:
3302             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3303             manifest_url = re.sub(
3304                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3305                 '', manifest_url).strip('?')
3306
3307         formats = []
3308         subtitles = {}
3309
3310         hdcore_sign = 'hdcore=3.7.0'
3311         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3312         hds_host = hosts.get('hds')
3313         if hds_host:
3314             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3315         if 'hdcore=' not in f4m_url:
3316             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3317         f4m_formats = self._extract_f4m_formats(
3318             f4m_url, video_id, f4m_id='hds', fatal=False)
3319         for entry in f4m_formats:
3320             entry.update({'extra_param_to_segment_url': hdcore_sign})
3321         formats.extend(f4m_formats)
3322
3323         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3324         hls_host = hosts.get('hls')
3325         if hls_host:
3326             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3327         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3328             m3u8_url, video_id, 'mp4', 'm3u8_native',
3329             m3u8_id='hls', fatal=False)
3330         formats.extend(m3u8_formats)
3331         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3332
3333         http_host = hosts.get('http')
3334         if http_host and m3u8_formats and not signed:
3335             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3336             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3337             qualities_length = len(qualities)
3338             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3339                 i = 0
3340                 for f in m3u8_formats:
3341                     if f['vcodec'] != 'none':
3342                         for protocol in ('http', 'https'):
3343                             http_f = f.copy()
3344                             del http_f['manifest_url']
3345                             http_url = re.sub(
3346                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3347                             http_f.update({
3348                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3349                                 'url': http_url,
3350                                 'protocol': protocol,
3351                             })
3352                             formats.append(http_f)
3353                         i += 1
3354
3355         return formats, subtitles
3356
3357     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3358         query = compat_urlparse.urlparse(url).query
3359         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3360         mobj = re.search(
3361             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3362         url_base = mobj.group('url')
3363         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3364         formats = []
3365
3366         def manifest_url(manifest):
3367             m_url = f'{http_base_url}/{manifest}'
3368             if query:
3369                 m_url += '?%s' % query
3370             return m_url
3371
3372         if 'm3u8' not in skip_protocols:
3373             formats.extend(self._extract_m3u8_formats(
3374                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3375                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3376         if 'f4m' not in skip_protocols:
3377             formats.extend(self._extract_f4m_formats(
3378                 manifest_url('manifest.f4m'),
3379                 video_id, f4m_id='hds', fatal=False))
3380         if 'dash' not in skip_protocols:
3381             formats.extend(self._extract_mpd_formats(
3382                 manifest_url('manifest.mpd'),
3383                 video_id, mpd_id='dash', fatal=False))
3384         if re.search(r'(?:/smil:|\.smil)', url_base):
3385             if 'smil' not in skip_protocols:
3386                 rtmp_formats = self._extract_smil_formats(
3387                     manifest_url('jwplayer.smil'),
3388                     video_id, fatal=False)
3389                 for rtmp_format in rtmp_formats:
3390                     rtsp_format = rtmp_format.copy()
3391                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3392                     del rtsp_format['play_path']
3393                     del rtsp_format['ext']
3394                     rtsp_format.update({
3395                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3396                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3397                         'protocol': 'rtsp',
3398                     })
3399                     formats.extend([rtmp_format, rtsp_format])
3400         else:
3401             for protocol in ('rtmp', 'rtsp'):
3402                 if protocol not in skip_protocols:
3403                     formats.append({
3404                         'url': f'{protocol}:{url_base}',
3405                         'format_id': protocol,
3406                         'protocol': protocol,
3407                     })
3408         return formats
3409
3410     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3411         mobj = re.search(
3412             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3413             webpage)
3414         if mobj:
3415             try:
3416                 jwplayer_data = self._parse_json(mobj.group('options'),
3417                                                  video_id=video_id,
3418                                                  transform_source=transform_source)
3419             except ExtractorError:
3420                 pass
3421             else:
3422                 if isinstance(jwplayer_data, dict):
3423                     return jwplayer_data
3424
3425     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3426         jwplayer_data = self._find_jwplayer_data(
3427             webpage, video_id, transform_source=js_to_json)
3428         return self._parse_jwplayer_data(
3429             jwplayer_data, video_id, *args, **kwargs)
3430
3431     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3432                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3433         # JWPlayer backward compatibility: flattened playlists
3434         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3435         if 'playlist' not in jwplayer_data:
3436             jwplayer_data = {'playlist': [jwplayer_data]}
3437
3438         entries = []
3439
3440         # JWPlayer backward compatibility: single playlist item
3441         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3442         if not isinstance(jwplayer_data['playlist'], list):
3443             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3444
3445         for video_data in jwplayer_data['playlist']:
3446             # JWPlayer backward compatibility: flattened sources
3447             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3448             if 'sources' not in video_data:
3449                 video_data['sources'] = [video_data]
3450
3451             this_video_id = video_id or video_data['mediaid']
3452
3453             formats = self._parse_jwplayer_formats(
3454                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3455                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3456
3457             subtitles = {}
3458             tracks = video_data.get('tracks')
3459             if tracks and isinstance(tracks, list):
3460                 for track in tracks:
3461                     if not isinstance(track, dict):
3462                         continue
3463                     track_kind = track.get('kind')
3464                     if not track_kind or not isinstance(track_kind, compat_str):
3465                         continue
3466                     if track_kind.lower() not in ('captions', 'subtitles'):
3467                         continue
3468                     track_url = urljoin(base_url, track.get('file'))
3469                     if not track_url:
3470                         continue
3471                     subtitles.setdefault(track.get('label') or 'en', []).append({
3472                         'url': self._proto_relative_url(track_url)
3473                     })
3474
3475             entry = {
3476                 'id': this_video_id,
3477                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3478                 'description': clean_html(video_data.get('description')),
3479                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3480                 'timestamp': int_or_none(video_data.get('pubdate')),
3481                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3482                 'subtitles': subtitles,
3483             }
3484             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3485             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3486                 entry.update({
3487                     '_type': 'url_transparent',
3488                     'url': formats[0]['url'],
3489                 })
3490             else:
3491                 self._sort_formats(formats)
3492                 entry['formats'] = formats
3493             entries.append(entry)
3494         if len(entries) == 1:
3495             return entries[0]
3496         else:
3497             return self.playlist_result(entries)
3498
3499     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3500                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3501         urls = []
3502         formats = []
3503         for source in jwplayer_sources_data:
3504             if not isinstance(source, dict):
3505                 continue
3506             source_url = urljoin(
3507                 base_url, self._proto_relative_url(source.get('file')))
3508             if not source_url or source_url in urls:
3509                 continue
3510             urls.append(source_url)
3511             source_type = source.get('type') or ''
3512             ext = mimetype2ext(source_type) or determine_ext(source_url)
3513             if source_type == 'hls' or ext == 'm3u8':
3514                 formats.extend(self._extract_m3u8_formats(
3515                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3516                     m3u8_id=m3u8_id, fatal=False))
3517             elif source_type == 'dash' or ext == 'mpd':
3518                 formats.extend(self._extract_mpd_formats(
3519                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3520             elif ext == 'smil':
3521                 formats.extend(self._extract_smil_formats(
3522                     source_url, video_id, fatal=False))
3523             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3524             elif source_type.startswith('audio') or ext in (
3525                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3526                 formats.append({
3527                     'url': source_url,
3528                     'vcodec': 'none',
3529                     'ext': ext,
3530                 })
3531             else:
3532                 height = int_or_none(source.get('height'))
3533                 if height is None:
3534                     # Often no height is provided but there is a label in
3535                     # format like "1080p", "720p SD", or 1080.
3536                     height = int_or_none(self._search_regex(
3537                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3538                         'height', default=None))
3539                 a_format = {
3540                     'url': source_url,
3541                     'width': int_or_none(source.get('width')),
3542                     'height': height,
3543                     'tbr': int_or_none(source.get('bitrate')),
3544                     'ext': ext,
3545                 }
3546                 if source_url.startswith('rtmp'):
3547                     a_format['ext'] = 'flv'
3548                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3549                     # of jwplayer.flash.swf
3550                     rtmp_url_parts = re.split(
3551                         r'((?:mp4|mp3|flv):)', source_url, 1)
3552                     if len(rtmp_url_parts) == 3:
3553                         rtmp_url, prefix, play_path = rtmp_url_parts
3554                         a_format.update({
3555                             'url': rtmp_url,
3556                             'play_path': prefix + play_path,
3557                         })
3558                     if rtmp_params:
3559                         a_format.update(rtmp_params)
3560                 formats.append(a_format)
3561         return formats
3562
3563     def _live_title(self, name):
3564         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3565         return name
3566
3567     def _int(self, v, name, fatal=False, **kwargs):
3568         res = int_or_none(v, **kwargs)
3569         if res is None:
3570             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3571             if fatal:
3572                 raise ExtractorError(msg)
3573             else:
3574                 self.report_warning(msg)
3575         return res
3576
3577     def _float(self, v, name, fatal=False, **kwargs):
3578         res = float_or_none(v, **kwargs)
3579         if res is None:
3580             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3581             if fatal:
3582                 raise ExtractorError(msg)
3583             else:
3584                 self.report_warning(msg)
3585         return res
3586
3587     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3588                     path='/', secure=False, discard=False, rest={}, **kwargs):
3589         cookie = compat_cookiejar_Cookie(
3590             0, name, value, port, port is not None, domain, True,
3591             domain.startswith('.'), path, True, secure, expire_time,
3592             discard, None, None, rest)
3593         self._downloader.cookiejar.set_cookie(cookie)
3594
3595     def _get_cookies(self, url):
3596         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3597         return compat_cookies_SimpleCookie(self._downloader._calc_cookies(url))
3598
3599     def _apply_first_set_cookie_header(self, url_handle, cookie):
3600         """
3601         Apply first Set-Cookie header instead of the last. Experimental.
3602
3603         Some sites (e.g. [1-3]) may serve two cookies under the same name
3604         in Set-Cookie header and expect the first (old) one to be set rather
3605         than second (new). However, as of RFC6265 the newer one cookie
3606         should be set into cookie store what actually happens.
3607         We will workaround this issue by resetting the cookie to
3608         the first one manually.
3609         1. https://new.vk.com/
3610         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3611         3. https://learning.oreilly.com/
3612         """
3613         for header, cookies in url_handle.headers.items():
3614             if header.lower() != 'set-cookie':
3615                 continue
3616             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3617             cookie_value = re.search(
3618                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3619             if cookie_value:
3620                 value, domain = cookie_value.groups()
3621                 self._set_cookie(domain, cookie, value)
3622                 break
3623
3624     @classmethod
3625     def get_testcases(cls, include_onlymatching=False):
3626         t = getattr(cls, '_TEST', None)
3627         if t:
3628             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3629             tests = [t]
3630         else:
3631             tests = getattr(cls, '_TESTS', [])
3632         for t in tests:
3633             if not include_onlymatching and t.get('only_matching', False):
3634                 continue
3635             t['name'] = cls.ie_key()
3636             yield t
3637
3638     @classproperty
3639     def age_limit(cls):
3640         """Get age limit from the testcases"""
3641         return max(traverse_obj(
3642             tuple(cls.get_testcases(include_onlymatching=False)),
3643             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3644
3645     @classmethod
3646     def is_suitable(cls, age_limit):
3647         """Test whether the extractor is generally suitable for the given age limit"""
3648         return not age_restricted(cls.age_limit, age_limit)
3649
3650     @classmethod
3651     def description(cls, *, markdown=True, search_examples=None):
3652         """Description of the extractor"""
3653         desc = ''
3654         if cls._NETRC_MACHINE:
3655             if markdown:
3656                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3657             else:
3658                 desc += f' [{cls._NETRC_MACHINE}]'
3659         if cls.IE_DESC is False:
3660             desc += ' [HIDDEN]'
3661         elif cls.IE_DESC:
3662             desc += f' {cls.IE_DESC}'
3663         if cls.SEARCH_KEY:
3664             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3665             if search_examples:
3666                 _COUNTS = ('', '5', '10', 'all')
3667                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3668         if not cls.working():
3669             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3670
3671         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3672         return f'{name}:{desc}' if desc else name
3673
3674     def extract_subtitles(self, *args, **kwargs):
3675         if (self.get_param('writesubtitles', False)
3676                 or self.get_param('listsubtitles')):
3677             return self._get_subtitles(*args, **kwargs)
3678         return {}
3679
3680     def _get_subtitles(self, *args, **kwargs):
3681         raise NotImplementedError('This method must be implemented by subclasses')
3682
3683     def extract_comments(self, *args, **kwargs):
3684         if not self.get_param('getcomments'):
3685             return None
3686         generator = self._get_comments(*args, **kwargs)
3687
3688         def extractor():
3689             comments = []
3690             interrupted = True
3691             try:
3692                 while True:
3693                     comments.append(next(generator))
3694             except StopIteration:
3695                 interrupted = False
3696             except KeyboardInterrupt:
3697                 self.to_screen('Interrupted by user')
3698             except Exception as e:
3699                 if self.get_param('ignoreerrors') is not True:
3700                     raise
3701                 self._downloader.report_error(e)
3702             comment_count = len(comments)
3703             self.to_screen(f'Extracted {comment_count} comments')
3704             return {
3705                 'comments': comments,
3706                 'comment_count': None if interrupted else comment_count
3707             }
3708         return extractor
3709
3710     def _get_comments(self, *args, **kwargs):
3711         raise NotImplementedError('This method must be implemented by subclasses')
3712
3713     @staticmethod
3714     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3715         """ Merge subtitle items for one language. Items with duplicated URLs/data
3716         will be dropped. """
3717         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3718         ret = list(subtitle_list1)
3719         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3720         return ret
3721
3722     @classmethod
3723     def _merge_subtitles(cls, *dicts, target=None):
3724         """ Merge subtitle dictionaries, language by language. """
3725         if target is None:
3726             target = {}
3727         for d in dicts:
3728             for lang, subs in d.items():
3729                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3730         return target
3731
3732     def extract_automatic_captions(self, *args, **kwargs):
3733         if (self.get_param('writeautomaticsub', False)
3734                 or self.get_param('listsubtitles')):
3735             return self._get_automatic_captions(*args, **kwargs)
3736         return {}
3737
3738     def _get_automatic_captions(self, *args, **kwargs):
3739         raise NotImplementedError('This method must be implemented by subclasses')
3740
3741     @functools.cached_property
3742     def _cookies_passed(self):
3743         """Whether cookies have been passed to YoutubeDL"""
3744         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3745
3746     def mark_watched(self, *args, **kwargs):
3747         if not self.get_param('mark_watched', False):
3748             return
3749         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3750             self._mark_watched(*args, **kwargs)
3751
3752     def _mark_watched(self, *args, **kwargs):
3753         raise NotImplementedError('This method must be implemented by subclasses')
3754
3755     def geo_verification_headers(self):
3756         headers = {}
3757         geo_verification_proxy = self.get_param('geo_verification_proxy')
3758         if geo_verification_proxy:
3759             headers['Ytdl-request-proxy'] = geo_verification_proxy
3760         return headers
3761
3762     def _generic_id(self, url):
3763         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3764
3765     def _generic_title(self, url):
3766         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3767
3768     @staticmethod
3769     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3770         all_known = all(map(
3771             lambda x: x is not None,
3772             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3773         return (
3774             'private' if is_private
3775             else 'premium_only' if needs_premium
3776             else 'subscriber_only' if needs_subscription
3777             else 'needs_auth' if needs_auth
3778             else 'unlisted' if is_unlisted
3779             else 'public' if all_known
3780             else None)
3781
3782     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3783         '''
3784         @returns            A list of values for the extractor argument given by "key"
3785                             or "default" if no such key is present
3786         @param default      The default value to return when the key is not present (default: [])
3787         @param casesense    When false, the values are converted to lower case
3788         '''
3789         val = traverse_obj(
3790             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3791         if val is None:
3792             return [] if default is NO_DEFAULT else default
3793         return list(val) if casesense else [x.lower() for x in val]
3794
3795     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3796         if not playlist_id or not video_id:
3797             return not video_id
3798
3799         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3800         if no_playlist is not None:
3801             return not no_playlist
3802
3803         video_id = '' if video_id is True else f' {video_id}'
3804         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3805         if self.get_param('noplaylist'):
3806             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3807             return False
3808         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3809         return True
3810
3811
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept queries in the format _SEARCH_KEY<prefix>:{query}, where
    <prefix> is empty (one result), a positive number (that many results)
    or "all" (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    # Upper bound on the number of results; unlimited unless overridden
    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Build the URL regex from the class' _SEARCH_KEY"""
        search_key = cls._SEARCH_KEY
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % search_key

    def _real_extract(self, query):
        """Parse the prefix of the search URL and fetch that many results"""
        mobj = self._match_valid_url(query)
        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == 'all':
            count = self._MAX_RESULTS
        elif prefix == '':
            count = 1
        else:
            count = int(prefix)
            if count <= 0:
                raise ExtractorError(f'invalid download number {count} for query "{query}"')
            if count > self._MAX_RESULTS:
                # Clamp the request to what the site can deliver
                self.report_warning(
                    '%s returns max %i results (you requested %i)'
                    % (self._SEARCH_KEY, self._MAX_RESULTS, count))
                count = self._MAX_RESULTS
        return self._get_n_results(query, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice() needs None, not infinity, to mean "no limit"
        limit = None if n == float('inf') else n
        entries = itertools.islice(results, 0, limit)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for the class' _SEARCH_KEY"""
        return cls._SEARCH_KEY