yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import itertools
   9 import json
  10 import math
  11 import netrc
  12 import os
  13 import random
  14 import sys
  15 import time
  16 import urllib.request
  17 import xml.etree.ElementTree
  18
  19 from ..compat import functools, re  # isort: split
  20 from ..compat import (
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_parse_unquote,
  26     compat_urllib_parse_urlencode,
  27     compat_urlparse,
  28 )
  29 from ..downloader import FileDownloader
  30 from ..downloader.f4m import get_base_url, remove_encrypted_media
  31 from ..utils import (
  32     JSON_LD_RE,
  33     NO_DEFAULT,
  34     ExtractorError,
  35     GeoRestrictedError,
  36     GeoUtils,
  37     LenientJSONDecoder,
  38     RegexNotFoundError,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     classproperty,
  44     clean_html,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     filter_dict,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     int_or_none,
  56     join_nonempty,
  57     js_to_json,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     sanitize_filename,
  68     sanitized_Request,
  69     str_or_none,
  70     str_to_int,
  71     strip_or_none,
  72     traverse_obj,
  73     try_get,
  74     unescapeHTML,
  75     unified_strdate,
  76     unified_timestamp,
  77     update_Request,
  78     update_url_query,
  79     url_basename,
  80     url_or_none,
  81     urljoin,
  82     variadic,
  83     xpath_element,
  84     xpath_text,
  85     xpath_with_ns,
  86 )
  87
  88
  89 class InfoExtractor:
  90     """Information Extractor class.
  91
  92     Information extractors are the classes that, given a URL, extract
  93     information about the video (or videos) the URL refers to. This
  94     information includes the real video URL, the video title, author and
  95     others. The information is stored in a dictionary which is then
  96     passed to the YoutubeDL. The YoutubeDL processes this
  97     information possibly downloading the video to the file system, among
  98     other possible outcomes.
  99
 100     The type field determines the type of the result.
 101     By far the most common value (and the default if _type is missing) is
 102     "video", which indicates a single video.
 103
 104     For a video, the dictionaries must include the following fields:
 105
 106     id:             Video identifier.
 107     title:          Video title, unescaped. Set to an empty string if video has
 108                     no title as opposed to "None" which signifies that the
 109                     extractor failed to obtain a title
 110
 111     Additionally, it must contain either a formats entry or a url one:
 112
 113     formats:        A list of dictionaries for each format available, ordered
 114                     from worst to best quality.
 115
 116                     Potential fields:
 117                     * url        The mandatory URL representing the media:
 118                                    for plain file media - HTTP URL of this file,
 119                                    for RTMP - RTMP URL,
 120                                    for HLS - URL of the M3U8 media playlist,
 121                                    for HDS - URL of the F4M manifest,
 122                                    for DASH
 123                                      - HTTP URL to plain file media (in case of
 124                                        unfragmented media)
 125                                      - URL of the MPD manifest or base URL
 126                                        representing the media if MPD manifest
 127                                        is parsed from a string (in case of
 128                                        fragmented media)
 129                                    for MSS - URL of the ISM manifest.
 130                     * manifest_url
 131                                  The URL of the manifest file in case of
 132                                  fragmented media:
 133                                    for HLS - URL of the M3U8 master playlist,
 134                                    for HDS - URL of the F4M manifest,
 135                                    for DASH - URL of the MPD manifest,
 136                                    for MSS - URL of the ISM manifest.
 137                     * manifest_stream_number  (For internal use only)
 138                                  The index of the stream in the manifest file
 139                     * ext        Will be calculated from URL if missing
 140                     * format     A human-readable description of the format
 141                                  ("mp4 container with h264/opus").
 142                                  Calculated from the format_id, width, height,
 143                                  and format_note fields if missing.
 144                     * format_id  A short description of the format
 145                                  ("mp4_h264_opus" or "19").
 146                                 Technically optional, but strongly recommended.
 147                     * format_note Additional info about the format
 148                                  ("3D" or "DASH video")
 149                     * width      Width of the video, if known
 150                     * height     Height of the video, if known
 151                     * resolution Textual description of width and height
 152                     * dynamic_range The dynamic range of the video. One of:
 153                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 154                     * tbr        Average bitrate of audio and video in KBit/s
 155                     * abr        Average audio bitrate in KBit/s
 156                     * acodec     Name of the audio codec in use
 157                     * asr        Audio sampling rate in Hertz
 158                     * vbr        Average video bitrate in KBit/s
 159                     * fps        Frame rate
 160                     * vcodec     Name of the video codec in use
 161                     * container  Name of the container format
 162                     * filesize   The number of bytes, if known in advance
 163                     * filesize_approx  An estimate for the number of bytes
 164                     * player_url SWF Player URL (used for rtmpdump).
 165                     * protocol   The protocol that will be used for the actual
 166                                  download, lower-case. One of "http", "https" or
 167                                  one of the protocols defined in downloader.PROTOCOL_MAP
 168                     * fragment_base_url
 169                                  Base URL for fragments. Each fragment's path
 170                                  value (if present) will be relative to
 171                                  this URL.
 172                     * fragments  A list of fragments of a fragmented media.
 173                                  Each fragment entry must contain either an url
 174                                  or a path. If an url is present it should be
 175                                  considered by a client. Otherwise both path and
 176                                  fragment_base_url must be present. Here is
 177                                  the list of all potential fields:
 178                                  * "url" - fragment's URL
 179                                  * "path" - fragment's path relative to
 180                                             fragment_base_url
 181                                  * "duration" (optional, int or float)
 182                                  * "filesize" (optional, int)
 183                     * is_from_start  Is a live format that can be downloaded
 184                                 from the start. Boolean
 185                     * preference Order number of this format. If this field is
 186                                  present and not None, the formats get sorted
 187                                  by this field, regardless of all other values.
 188                                  -1 for default (order by other properties),
 189                                  -2 or smaller for less than default.
 190                                  < -1000 to hide the format (if there is
 191                                     another one which is strictly better)
 192                     * language   Language code, e.g. "de" or "en-US".
 193                     * language_preference  Is this in the language mentioned in
 194                                  the URL?
 195                                  10 if it's what the URL is about,
 196                                  -1 for default (don't know),
 197                                  -10 otherwise, other values reserved for now.
 198                     * quality    Order number of the video quality of this
 199                                  format, irrespective of the file format.
 200                                  -1 for default (order by other properties),
 201                                  -2 or smaller for less than default.
 202                     * source_preference  Order number for this video source
 203                                   (quality takes higher priority)
 204                                  -1 for default (order by other properties),
 205                                  -2 or smaller for less than default.
 206                     * http_headers  A dictionary of additional HTTP headers
 207                                  to add to the request.
 208                     * stretched_ratio  If given and not 1, indicates that the
 209                                  video's pixels are not square.
 210                                  width : height ratio as float.
 211                     * no_resume  The server does not support resuming the
 212                                  (HTTP or RTMP) download. Boolean.
 213                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 214                     * downloader_options  A dictionary of downloader options
 215                                  (For internal use only)
 216                                  * http_chunk_size Chunk size for HTTP downloads
 217                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 218                     RTMP formats can also have the additional fields: page_url,
 219                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 220                     rtmp_protocol, rtmp_real_time
 221
 222     url:            Final video URL.
 223     ext:            Video filename extension.
 224     format:         The video format, defaults to ext (used for --get-format)
 225     player_url:     SWF Player URL (used for rtmpdump).
 226
 227     The following fields are optional:
 228
 229     direct:         True if a direct video file was given (must only be set by GenericIE)
 230     alt_title:      A secondary title of the video.
 231     display_id:     An alternative identifier for the video, not necessarily
 232                     unique, but available before title. Typically, id is
 233                     something like "4234987", title "Dancing naked mole rats",
 234                     and display_id "dancing-naked-mole-rats"
 235     thumbnails:     A list of dictionaries, with the following entries:
 236                         * "id" (optional, string) - Thumbnail format ID
 237                         * "url"
 238                         * "preference" (optional, int) - quality of the image
 239                         * "width" (optional, int)
 240                         * "height" (optional, int)
 241                         * "resolution" (optional, string "{width}x{height}",
 242                                         deprecated)
 243                         * "filesize" (optional, int)
 244                         * "http_headers" (dict) - HTTP headers for the request
 245     thumbnail:      Full URL to a video thumbnail image.
 246     description:    Full video description.
 247     uploader:       Full name of the video uploader.
 248     license:        License name the video is licensed under.
 249     creator:        The creator of the video.
 250     timestamp:      UNIX timestamp of the moment the video was uploaded
 251     upload_date:    Video upload date in UTC (YYYYMMDD).
 252                     If not explicitly set, calculated from timestamp
 253     release_timestamp: UNIX timestamp of the moment the video was released.
 254                     If it is not clear whether to use timestamp or this, use the former
 255     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 256                     If not explicitly set, calculated from release_timestamp
 257     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 258     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 259                     If not explicitly set, calculated from modified_timestamp
 260     uploader_id:    Nickname or id of the video uploader.
 261     uploader_url:   Full URL to a personal webpage of the video uploader.
 262     channel:        Full name of the channel the video is uploaded on.
 263                     Note that channel fields may or may not repeat uploader
 264                     fields. This depends on a particular extractor.
 265     channel_id:     Id of the channel.
 266     channel_url:    Full URL to a channel webpage.
 267     channel_follower_count: Number of followers of the channel.
 268     location:       Physical location where the video was filmed.
 269     subtitles:      The available subtitles as a dictionary in the format
 270                     {tag: subformats}. "tag" is usually a language code, and
 271                     "subformats" is a list sorted from lower to higher
 272                     preference, each element is a dictionary with the "ext"
 273                     entry and one of:
 274                         * "data": The subtitles file contents
 275                         * "url": A URL pointing to the subtitles file
 276                     It can optionally also have:
 277                         * "name": Name or description of the subtitles
 278                         * "http_headers": A dictionary of additional HTTP headers
 279                                   to add to the request.
 280                     "ext" will be calculated from URL if missing
 281     automatic_captions: Like 'subtitles'; contains automatically generated
 282                     captions instead of normal subtitles
 283     duration:       Length of the video in seconds, as an integer or float.
 284     view_count:     How many users have watched the video on the platform.
 285     like_count:     Number of positive ratings of the video
 286     dislike_count:  Number of negative ratings of the video
 287     repost_count:   Number of reposts of the video
 288     average_rating: Average rating given by users, the scale used depends on the webpage
 289     comment_count:  Number of comments on the video
 290     comments:       A list of comments, each with one or more of the following
 291                     properties (all but one of text or html optional):
 292                         * "author" - human-readable name of the comment author
 293                         * "author_id" - user ID of the comment author
 294                         * "author_thumbnail" - The thumbnail of the comment author
 295                         * "id" - Comment ID
 296                         * "html" - Comment as HTML
 297                         * "text" - Plain text of the comment
 298                         * "timestamp" - UNIX timestamp of comment
 299                         * "parent" - ID of the comment this one is replying to.
 300                                      Set to "root" to indicate that this is a
 301                                      comment to the original video.
 302                         * "like_count" - Number of positive ratings of the comment
 303                         * "dislike_count" - Number of negative ratings of the comment
 304                         * "is_favorited" - Whether the comment is marked as
 305                                            favorite by the video uploader
 306                         * "author_is_uploader" - Whether the comment is made by
 307                                                  the video uploader
 308     age_limit:      Age restriction for the video, as an integer (years)
 309     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 310                     should allow to get the same result again. (It will be set
 311                     by YoutubeDL if it's missing)
 312     categories:     A list of categories that the video falls in, for example
 313                     ["Sports", "Berlin"]
 314     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 315     cast:           A list of the video cast
 316     is_live:        True, False, or None (=unknown). Whether this video is a
 317                     live stream that goes on instead of a fixed-length video.
 318     was_live:       True, False, or None (=unknown). Whether this video was
 319                     originally a live stream.
 320     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 321                     If absent, automatically set from is_live, was_live
 322     start_time:     Time in seconds where the reproduction should start, as
 323                     specified in the URL.
 324     end_time:       Time in seconds where the reproduction should end, as
 325                     specified in the URL.
 326     chapters:       A list of dictionaries, with the following entries:
 327                         * "start_time" - The start time of the chapter in seconds
 328                         * "end_time" - The end time of the chapter in seconds
 329                         * "title" (optional, string)
 330     playable_in_embed: Whether this video is allowed to play in embedded
 331                     players on other sites. Can be True (=always allowed),
 332                     False (=never allowed), None (=unknown), or a string
 333                     specifying the criteria for embedability (Eg: 'whitelist')
 334     availability:   Under what condition the video is available. One of
 335                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 336                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 337                     to set it
 338     __post_extractor: A function to be called just before the metadata is
 339                     written to either disk, logger or console. The function
 340                     must return a dict which will be added to the info_dict.
 341                     This is useful for additional information that is
 342                     time-consuming to extract. Note that the fields thus
 343                     extracted will not be available to output template and
 344                     match_filter. So, only "comments" and "comment_count" are
 345                     currently allowed to be extracted via this method.
 346
 347     The following fields should only be used when the video belongs to some logical
 348     chapter or section:
 349
 350     chapter:        Name or title of the chapter the video belongs to.
 351     chapter_number: Number of the chapter the video belongs to, as an integer.
 352     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 353
 354     The following fields should only be used when the video is an episode of some
 355     series, programme or podcast:
 356
 357     series:         Title of the series or programme the video episode belongs to.
 358     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 359     season:         Title of the season the video episode belongs to.
 360     season_number:  Number of the season the video episode belongs to, as an integer.
 361     season_id:      Id of the season the video episode belongs to, as a unicode string.
 362     episode:        Title of the video episode. Unlike mandatory video title field,
 363                     this field should denote the exact title of the video episode
 364                     without any kind of decoration.
 365     episode_number: Number of the video episode within a season, as an integer.
 366     episode_id:     Id of the video episode, as a unicode string.
 367
 368     The following fields should only be used when the media is a track or a part of
 369     a music album:
 370
 371     track:          Title of the track.
 372     track_number:   Number of the track within an album or a disc, as an integer.
 373     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 374                     as a unicode string.
 375     artist:         Artist(s) of the track.
 376     genre:          Genre(s) of the track.
 377     album:          Title of the album the track belongs to.
 378     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 379     album_artist:   List of all artists appeared on the album (e.g.
 380                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 381                     and compilations).
 382     disc_number:    Number of the disc or other physical medium the track belongs to,
 383                     as an integer.
 384     release_year:   Year (YYYY) when the album was released.
 385     composer:       Composer of the piece
 386
 387     The following fields should only be set for clips that should be cut from the original video:
 388
 389     section_start:  Start time of the section in seconds
 390     section_end:    End time of the section in seconds
 391
 392     Unless mentioned otherwise, the fields should be Unicode strings.
 393
 394     Unless mentioned otherwise, None is equivalent to absence of information.
 395
 396
 397     _type "playlist" indicates multiple videos.
 398     There must be a key "entries", which is a list, an iterable, or a PagedList
 399     object, each element of which is a valid dictionary by this specification.
 400
 401     Additionally, playlists can have "id", "title", and any other relevant
 402     attributes with the same semantics as videos (see above).
 403
 404     It can also have the following optional fields:
 405
 406     playlist_count: The total number of videos in a playlist. If not given,
 407                     YoutubeDL tries to calculate it from "entries"
 408
 409
 410     _type "multi_video" indicates that there are multiple videos that
 411     form a single show, for example multiple acts of an opera or TV episode.
 412     It must have an entries key like a playlist and contain all the keys
 413     required for a video at the same time.
 414
 415
 416     _type "url" indicates that the video must be extracted from another
 417     location, possibly by a different extractor. Its only required key is:
 418     "url" - the next URL to extract.
 419     The key "ie_key" can be set to the class name (minus the trailing "IE",
 420     e.g. "Youtube") if the extractor class is known in advance.
 421     Additionally, the dictionary may have any properties of the resolved entity
 422     known in advance, for example "title" if the title of the referred video is
 423     known ahead of time.
 424
 425
 426     _type "url_transparent" entities have the same specification as "url", but
 427     indicate that the given additional information is more precise than the one
 428     associated with the resolved URL.
 429     This is useful when a site employs a video service that hosts the video and
 430     its technical metadata, but that video service does not embed a useful
 431     title, description etc.
 432
 433
 434     Subclasses of this should define a _VALID_URL regexp and re-define the
 435     _real_extract() and (optionally) _real_initialize() methods.
 436     Probably, they should also be added to the list of extractors.
 437
 438     Subclasses may also override suitable() if necessary, but ensure the function
 439     signature is preserved and that this function imports everything it needs
 440     (except other extractors), so that lazy_extractors works correctly.
 441
 442     To support username + password (or netrc) login, the extractor must define a
 443     _NETRC_MACHINE and re-define _perform_login(username, password) and
 444     (optionally) _initialize_pre_login() methods. The _perform_login method will
 445     be called between _initialize_pre_login and _real_initialize if credentials
 446     are passed by the user. In cases where it is necessary to have the login
 447     process as part of the extraction rather than initialization, _perform_login
 448     can be left undefined.
 449
 450     _GEO_BYPASS attribute may be set to False in order to disable
 451     geo restriction bypass mechanisms for a particular extractor.
 452     Though it won't disable explicit geo restriction bypass based on
 453     country code provided with geo_bypass_country.
 454
 455     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 456     countries for this extractor. One of these countries will be used by
 457     geo restriction bypass mechanism right away in order to bypass
 458     geo restriction, of course, if the mechanism is not disabled.
 459
 460     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 461     IP blocks in CIDR notation for this extractor. One of these IP blocks
 462     will be used by geo restriction bypass mechanism similarly
 463     to _GEO_COUNTRIES.
 464
 465     The _WORKING attribute should be set to False for broken IEs
 466     in order to warn the users and skip the tests.
 467     """
 468
    # Flipped to True at the end of initialize() so the one-time setup
    # (pre-login, login, _real_initialize) is not repeated
    _ready = False
    # NOTE(review): presumably assigned by set_downloader(), which is not
    # visible in this part of the file - confirm
    _downloader = None
    # Fake IP chosen by _initialize_geo_bypass() for the X-Forwarded-For header
    _x_forwarded_for_ip = None
    # Set to False to disable geo restriction bypass mechanisms for an extractor
    _GEO_BYPASS = True
    # List of presumably geo unrestricted countries for this extractor
    _GEO_COUNTRIES = None
    # List of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
    # A truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    # NOTE(review): initialize() skips its "login with password is not
    # supported" warning when this is False; other uses are not visible here
    IE_DESC = None
    # NOTE(review): not referenced in the visible code; presumably used by
    # search extractors - confirm
    SEARCH_KEY = None
 479
 480     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 481         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 482         return {
 483             None: '',
 484             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 485             'password': f'Use {password_hint}',
 486             'cookies': (
 487                 'Use --cookies-from-browser or --cookies for the authentication. '
 488                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 489         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 490
 491     def __init__(self, downloader=None):
 492         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 493         If a downloader is not passed during initialization,
 494         it must be set using "set_downloader()" before "extract()" is called"""
 495         self._ready = False
 496         self._x_forwarded_for_ip = None
 497         self._printed_messages = set()
 498         self.set_downloader(downloader)
 499
 500     @classmethod
 501     def _match_valid_url(cls, url):
 502         # This does not use has/getattr intentionally - we want to know whether
 503         # we have cached the regexp for *this* class, whereas getattr would also
 504         # match the superclass
 505         if '_VALID_URL_RE' not in cls.__dict__:
 506             if '_VALID_URL' not in cls.__dict__:
 507                 cls._VALID_URL = cls._make_valid_url()
 508             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 509         return cls._VALID_URL_RE.match(url)
 510
 511     @classmethod
 512     def suitable(cls, url):
 513         """Receives a URL and returns True if suitable for this IE."""
 514         # This function must import everything it needs (except other extractors),
 515         # so that lazy_extractors works correctly
 516         return cls._match_valid_url(url) is not None
 517
 518     @classmethod
 519     def _match_id(cls, url):
 520         return cls._match_valid_url(url).group('id')
 521
 522     @classmethod
 523     def get_temp_id(cls, url):
 524         try:
 525             return cls._match_id(url)
 526         except (IndexError, AttributeError):
 527             return None
 528
 529     @classmethod
 530     def working(cls):
 531         """Getter method for _WORKING."""
 532         return cls._WORKING
 533
 534     @classmethod
 535     def supports_login(cls):
 536         return bool(cls._NETRC_MACHINE)
 537
 538     def initialize(self):
 539         """Initializes an instance (authentication, etc)."""
 540         self._printed_messages = set()
 541         self._initialize_geo_bypass({
 542             'countries': self._GEO_COUNTRIES,
 543             'ip_blocks': self._GEO_IP_BLOCKS,
 544         })
 545         if not self._ready:
 546             self._initialize_pre_login()
 547             if self.supports_login():
 548                 username, password = self._get_login_info()
 549                 if username:
 550                     self._perform_login(username, password)
 551             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 552                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 553             self._real_initialize()
 554             self._ready = True
 555
 556     def _initialize_geo_bypass(self, geo_bypass_context):
 557         """
 558         Initialize geo restriction bypass mechanism.
 559
 560         This method is used to initialize geo bypass mechanism based on faking
 561         X-Forwarded-For HTTP header. A random country from provided country list
 562         is selected and a random IP belonging to this country is generated. This
 563         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 564         HTTP requests.
 565
 566         This method will be used for initial geo bypass mechanism initialization
 567         during the instance initialization with _GEO_COUNTRIES and
 568         _GEO_IP_BLOCKS.
 569
 570         You may also manually call it from extractor's code if geo bypass
 571         information is not available beforehand (e.g. obtained during
 572         extraction) or due to some other reason. In this case you should pass
 573         this information in geo bypass context passed as first argument. It may
 574         contain following fields:
 575
 576         countries:  List of geo unrestricted countries (similar
 577                     to _GEO_COUNTRIES)
 578         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 579                     (similar to _GEO_IP_BLOCKS)
 580
 581         """
 582         if not self._x_forwarded_for_ip:
 583
 584             # Geo bypass mechanism is explicitly disabled by user
 585             if not self.get_param('geo_bypass', True):
 586                 return
 587
 588             if not geo_bypass_context:
 589                 geo_bypass_context = {}
 590
 591             # Backward compatibility: previously _initialize_geo_bypass
 592             # expected a list of countries, some 3rd party code may still use
 593             # it this way
 594             if isinstance(geo_bypass_context, (list, tuple)):
 595                 geo_bypass_context = {
 596                     'countries': geo_bypass_context,
 597                 }
 598
 599             # The whole point of geo bypass mechanism is to fake IP
 600             # as X-Forwarded-For HTTP header based on some IP block or
 601             # country code.
 602
 603             # Path 1: bypassing based on IP block in CIDR notation
 604
 605             # Explicit IP block specified by user, use it right away
 606             # regardless of whether extractor is geo bypassable or not
 607             ip_block = self.get_param('geo_bypass_ip_block', None)
 608
 609             # Otherwise use random IP block from geo bypass context but only
 610             # if extractor is known as geo bypassable
 611             if not ip_block:
 612                 ip_blocks = geo_bypass_context.get('ip_blocks')
 613                 if self._GEO_BYPASS and ip_blocks:
 614                     ip_block = random.choice(ip_blocks)
 615
 616             if ip_block:
 617                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 618                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 619                 return
 620
 621             # Path 2: bypassing based on country code
 622
 623             # Explicit country code specified by user, use it right away
 624             # regardless of whether extractor is geo bypassable or not
 625             country = self.get_param('geo_bypass_country', None)
 626
 627             # Otherwise use random country code from geo bypass context but
 628             # only if extractor is known as geo bypassable
 629             if not country:
 630                 countries = geo_bypass_context.get('countries')
 631                 if self._GEO_BYPASS and countries:
 632                     country = random.choice(countries)
 633
 634             if country:
 635                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 636                 self._downloader.write_debug(
 637                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 638
 639     def extract(self, url):
 640         """Extracts URL information and returns it in list of dicts."""
 641         try:
 642             for _ in range(2):
 643                 try:
 644                     self.initialize()
 645                     self.write_debug('Extracting URL: %s' % url)
 646                     ie_result = self._real_extract(url)
 647                     if ie_result is None:
 648                         return None
 649                     if self._x_forwarded_for_ip:
 650                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 651                     subtitles = ie_result.get('subtitles')
 652                     if (subtitles and 'live_chat' in subtitles
 653                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 654                         del subtitles['live_chat']
 655                     return ie_result
 656                 except GeoRestrictedError as e:
 657                     if self.__maybe_fake_ip_and_retry(e.countries):
 658                         continue
 659                     raise
 660         except UnsupportedError:
 661             raise
 662         except ExtractorError as e:
 663             kwargs = {
 664                 'video_id': e.video_id or self.get_temp_id(url),
 665                 'ie': self.IE_NAME,
 666                 'tb': e.traceback or sys.exc_info()[2],
 667                 'expected': e.expected,
 668                 'cause': e.cause
 669             }
 670             if hasattr(e, 'countries'):
 671                 kwargs['countries'] = e.countries
 672             raise type(e)(e.orig_msg, **kwargs)
 673         except http.client.IncompleteRead as e:
 674             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 675         except (KeyError, StopIteration) as e:
 676             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 677
 678     def __maybe_fake_ip_and_retry(self, countries):
 679         if (not self.get_param('geo_bypass_country', None)
 680                 and self._GEO_BYPASS
 681                 and self.get_param('geo_bypass', True)
 682                 and not self._x_forwarded_for_ip
 683                 and countries):
 684             country_code = random.choice(countries)
 685             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 686             if self._x_forwarded_for_ip:
 687                 self.report_warning(
 688                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 689                     % (self._x_forwarded_for_ip, country_code.upper()))
 690                 return True
 691         return False
 692
 693     def set_downloader(self, downloader):
 694         """Sets a YoutubeDL instance as the downloader for this IE."""
 695         self._downloader = downloader
 696
 697     @property
 698     def cache(self):
 699         return self._downloader.cache
 700
 701     @property
 702     def cookiejar(self):
 703         return self._downloader.cookiejar
 704
 705     def _initialize_pre_login(self):
 706         """ Intialization before login. Redefine in subclasses."""
 707         pass
 708
 709     def _perform_login(self, username, password):
 710         """ Login with username and password. Redefine in subclasses."""
 711         pass
 712
 713     def _real_initialize(self):
 714         """Real initialization process. Redefine in subclasses."""
 715         pass
 716
 717     def _real_extract(self, url):
 718         """Real extraction process. Redefine in subclasses."""
 719         raise NotImplementedError('This method must be implemented by subclasses')
 720
 721     @classmethod
 722     def ie_key(cls):
 723         """A string for getting the InfoExtractor with get_info_extractor"""
 724         return cls.__name__[:-2]
 725
 726     @classproperty
 727     def IE_NAME(cls):
 728         return cls.__name__[:-2]
 729
 730     @staticmethod
 731     def __can_accept_status_code(err, expected_status):
 732         assert isinstance(err, urllib.error.HTTPError)
 733         if expected_status is None:
 734             return False
 735         elif callable(expected_status):
 736             return expected_status(err.code) is True
 737         else:
 738             return err.code in variadic(expected_status)
 739
 740     def _create_request(self, url_or_request, data=None, headers={}, query={}):
 741         if isinstance(url_or_request, urllib.request.Request):
 742             return update_Request(url_or_request, data=data, headers=headers, query=query)
 743         if query:
 744             url_or_request = update_url_query(url_or_request, query)
 745         return sanitized_Request(url_or_request, data, headers)
 746
 747     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 748         """
 749         Return the response handle.
 750
 751         See _download_webpage docstring for arguments specification.
 752         """
 753         if not self._downloader._first_webpage_request:
 754             sleep_interval = self.get_param('sleep_interval_requests') or 0
 755             if sleep_interval > 0:
 756                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 757                 time.sleep(sleep_interval)
 758         else:
 759             self._downloader._first_webpage_request = False
 760
 761         if note is None:
 762             self.report_download_webpage(video_id)
 763         elif note is not False:
 764             if video_id is None:
 765                 self.to_screen(str(note))
 766             else:
 767                 self.to_screen(f'{video_id}: {note}')
 768
 769         # Some sites check X-Forwarded-For HTTP header in order to figure out
 770         # the origin of the client behind proxy. This allows bypassing geo
 771         # restriction by faking this header's value to IP that belongs to some
 772         # geo unrestricted country. We will do so once we encounter any
 773         # geo restriction error.
 774         if self._x_forwarded_for_ip:
 775             if 'X-Forwarded-For' not in headers:
 776                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 777
 778         try:
 779             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 780         except network_exceptions as err:
 781             if isinstance(err, urllib.error.HTTPError):
 782                 if self.__can_accept_status_code(err, expected_status):
 783                     # Retain reference to error to prevent file object from
 784                     # being closed before it can be read. Works around the
 785                     # effects of <https://bugs.python.org/issue15002>
 786                     # introduced in Python 3.4.1.
 787                     err.fp._error = err
 788                     return err.fp
 789
 790             if errnote is False:
 791                 return False
 792             if errnote is None:
 793                 errnote = 'Unable to download webpage'
 794
 795             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 796             if fatal:
 797                 raise ExtractorError(errmsg, cause=err)
 798             else:
 799                 self.report_warning(errmsg)
 800                 return False
 801
 802     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 803                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 804         """
 805         Return a tuple (page content as string, URL handle).
 806
 807         Arguments:
 808         url_or_request -- plain text URL as a string or
 809             a urllib.request.Request object
 810         video_id -- Video/playlist/item identifier (string)
 811
 812         Keyword arguments:
 813         note -- note printed before downloading (string)
 814         errnote -- note printed in case of an error (string)
 815         fatal -- flag denoting whether error should be considered fatal,
 816             i.e. whether it should cause ExtractionError to be raised,
 817             otherwise a warning will be reported and extraction continued
 818         encoding -- encoding for a page content decoding, guessed automatically
 819             when not explicitly specified
 820         data -- POST data (bytes)
 821         headers -- HTTP headers (dict)
 822         query -- URL query (dict)
 823         expected_status -- allows to accept failed HTTP requests (non 2xx
 824             status code) by explicitly specifying a set of accepted status
 825             codes. Can be any of the following entities:
 826                 - an integer type specifying an exact failed status code to
 827                   accept
 828                 - a list or a tuple of integer types specifying a list of
 829                   failed status codes to accept
 830                 - a callable accepting an actual failed status code and
 831                   returning True if it should be accepted
 832             Note that this argument does not affect success status codes (2xx)
 833             which are always accepted.
 834         """
 835
 836         # Strip hashes from the URL (#1038)
 837         if isinstance(url_or_request, (compat_str, str)):
 838             url_or_request = url_or_request.partition('#')[0]
 839
 840         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 841         if urlh is False:
 842             assert not fatal
 843             return False
 844         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 845         return (content, urlh)
 846
 847     @staticmethod
 848     def _guess_encoding_from_content(content_type, webpage_bytes):
 849         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 850         if m:
 851             encoding = m.group(1)
 852         else:
 853             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 854                           webpage_bytes[:1024])
 855             if m:
 856                 encoding = m.group(1).decode('ascii')
 857             elif webpage_bytes.startswith(b'\xff\xfe'):
 858                 encoding = 'utf-16'
 859             else:
 860                 encoding = 'utf-8'
 861
 862         return encoding
 863
 864     def __check_blocked(self, content):
 865         first_block = content[:512]
 866         if ('<title>Access to this site is blocked</title>' in content
 867                 and 'Websense' in first_block):
 868             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 869             blocked_iframe = self._html_search_regex(
 870                 r'<iframe src="([^"]+)"', content,
 871                 'Websense information URL', default=None)
 872             if blocked_iframe:
 873                 msg += ' Visit %s for more details' % blocked_iframe
 874             raise ExtractorError(msg, expected=True)
 875         if '<title>The URL you requested has been blocked</title>' in first_block:
 876             msg = (
 877                 'Access to this webpage has been blocked by Indian censorship. '
 878                 'Use a VPN or proxy server (with --proxy) to route around it.')
 879             block_msg = self._html_search_regex(
 880                 r'</h1><p>(.*?)</p>',
 881                 content, 'block message', default=None)
 882             if block_msg:
 883                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 884             raise ExtractorError(msg, expected=True)
 885         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 886                 and 'blocklist.rkn.gov.ru' in content):
 887             raise ExtractorError(
 888                 'Access to this webpage has been blocked by decision of the Russian government. '
 889                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 890                 expected=True)
 891
 892     def _request_dump_filename(self, url, video_id):
 893         basen = f'{video_id}_{url}'
 894         trim_length = self.get_param('trim_file_name') or 240
 895         if len(basen) > trim_length:
 896             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 897             basen = basen[:trim_length - len(h)] + h
 898         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 899         # Working around MAX_PATH limitation on Windows (see
 900         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 901         if compat_os_name == 'nt':
 902             absfilepath = os.path.abspath(filename)
 903             if len(absfilepath) > 259:
 904                 filename = fR'\\?\{absfilepath}'
 905         return filename
 906
 907     def __decode_webpage(self, webpage_bytes, encoding, headers):
 908         if not encoding:
 909             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 910         try:
 911             return webpage_bytes.decode(encoding, 'replace')
 912         except LookupError:
 913             return webpage_bytes.decode('utf-8', 'replace')
 914
 915     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 916         webpage_bytes = urlh.read()
 917         if prefix is not None:
 918             webpage_bytes = prefix + webpage_bytes
 919         if self.get_param('dump_intermediate_pages', False):
 920             self.to_screen('Dumping request to ' + urlh.geturl())
 921             dump = base64.b64encode(webpage_bytes).decode('ascii')
 922             self._downloader.to_screen(dump)
 923         if self.get_param('write_pages'):
 924             filename = self._request_dump_filename(urlh.geturl(), video_id)
 925             self.to_screen(f'Saving request to {filename}')
 926             with open(filename, 'wb') as outf:
 927                 outf.write(webpage_bytes)
 928
 929         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 930         self.__check_blocked(content)
 931
 932         return content
 933
 934     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 935         if transform_source:
 936             xml_string = transform_source(xml_string)
 937         try:
 938             return compat_etree_fromstring(xml_string.encode('utf-8'))
 939         except xml.etree.ElementTree.ParseError as ve:
 940             errmsg = '%s: Failed to parse XML ' % video_id
 941             if fatal:
 942                 raise ExtractorError(errmsg, cause=ve)
 943             else:
 944                 self.report_warning(errmsg + str(ve))
 945
 946     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
 947         try:
 948             return json.loads(
 949                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 950         except ValueError as ve:
 951             errmsg = f'{video_id}: Failed to parse JSON'
 952             if fatal:
 953                 raise ExtractorError(errmsg, cause=ve)
 954             else:
 955                 self.report_warning(f'{errmsg}: {ve}')
 956
 957     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 958         return self._parse_json(
 959             data[data.find('{'):data.rfind('}') + 1],
 960             video_id, transform_source, fatal)
 961
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build a pair of download methods for one kind of content.

        This is a plain function called from the class body (it takes no
        self); the two functions it returns are meant to be bound as methods.

        @param name             Used to name the methods: _download_{name}_handle
                                and _download_{name}
        @param parser           Name of the method that parses the downloaded
                                content, or None to return the content as is
        @param note             Default value of the methods' "note" argument
        @param errnote          Default value of the methods' "errnote" argument
        @param return_value     Description of the result, used in the
                                generated docstrings
        @returns                (download_handle, download_content)
        """

        def parse(ie, content, *args, **kwargs):
            # Run the configured parser (if any) of "ie" on the content
            if parser is None:
                return content
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Variant returning (parsed content, URL handle), or False on failure
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal), urlh

        # Variant returning only the parsed content, or False on failure
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the "load_pages" param, first try to read the content from
            # a dump file on disk instead of the network
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # Not fatal: fall through to a regular download below
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source, fatal)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser, transform_source is not forwarded to the handle method
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            # Give the generated function its final name and docstring
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        # NB: download_content looks download_handle up by its __name__,
        # which is only final once impersonate() has renamed it
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1023
    # Generated download methods, one (handle, content) pair per content
    # type. The arguments are: name, parser method name, default note,
    # default errnote and the description of the returned value.
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For plain webpages there is no parser and only the content-returning
    # function (index 1 of the pair) is kept, under a private name
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1031
1032     def _download_webpage(
1033             self, url_or_request, video_id, note=None, errnote=None,
1034             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1035         """
1036         Return the data of the page as a string.
1037
1038         Keyword arguments:
1039         tries -- number of tries
1040         timeout -- sleep interval between tries
1041
1042         See _download_webpage_handle docstring for other arguments specification.
1043         """
1044
1045         R''' # NB: These are unused; should they be deprecated?
1046         if tries != 1:
1047             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1048         if timeout is NO_DEFAULT:
1049             timeout = 5
1050         else:
1051             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1052         '''
1053
1054         try_count = 0
1055         while True:
1056             try:
1057                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1058             except http.client.IncompleteRead as e:
1059                 try_count += 1
1060                 if try_count >= tries:
1061                     raise e
1062                 self._sleep(timeout, video_id)
1063
1064     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1065         idstr = format_field(video_id, None, '%s: ')
1066         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1067         if only_once:
1068             if f'WARNING: {msg}' in self._printed_messages:
1069                 return
1070             self._printed_messages.add(f'WARNING: {msg}')
1071         self._downloader.report_warning(msg, *args, **kwargs)
1072
1073     def to_screen(self, msg, *args, **kwargs):
1074         """Print msg to screen, prefixing it with '[ie_name]'"""
1075         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1076
1077     def write_debug(self, msg, *args, **kwargs):
1078         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1079
1080     def get_param(self, name, default=None, *args, **kwargs):
1081         if self._downloader:
1082             return self._downloader.params.get(name, default, *args, **kwargs)
1083         return default
1084
1085     def report_drm(self, video_id, partial=False):
1086         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1087
1088     def report_extraction(self, id_or_name):
1089         """Report information extraction."""
1090         self.to_screen('%s: Extracting information' % id_or_name)
1091
1092     def report_download_webpage(self, video_id):
1093         """Report webpage download."""
1094         self.to_screen('%s: Downloading webpage' % video_id)
1095
1096     def report_age_confirmation(self):
1097         """Report attempt to confirm age."""
1098         self.to_screen('Confirming age')
1099
1100     def report_login(self):
1101         """Report attempt to log in."""
1102         self.to_screen('Logging in')
1103
1104     def raise_login_required(
1105             self, msg='This video is only available for registered users',
1106             metadata_available=False, method=NO_DEFAULT):
1107         if metadata_available and (
1108                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1109             self.report_warning(msg)
1110             return
1111         msg += format_field(self._login_hint(method), None, '. %s')
1112         raise ExtractorError(msg, expected=True)
1113
1114     def raise_geo_restricted(
1115             self, msg='This video is not available from your location due to geo restriction',
1116             countries=None, metadata_available=False):
1117         if metadata_available and (
1118                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1119             self.report_warning(msg)
1120         else:
1121             raise GeoRestrictedError(msg, countries=countries)
1122
1123     def raise_no_formats(self, msg, expected=False, video_id=None):
1124         if expected and (
1125                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1126             self.report_warning(msg, video_id)
1127         elif isinstance(msg, ExtractorError):
1128             raise msg
1129         else:
1130             raise ExtractorError(msg, expected=expected, video_id=video_id)
1131
1132     # Methods for following #608
1133     @staticmethod
1134     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1135         """Returns a URL that points to a page that should be processed"""
1136         if ie is not None:
1137             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1138         if video_id is not None:
1139             kwargs['id'] = video_id
1140         if video_title is not None:
1141             kwargs['title'] = video_title
1142         return {
1143             **kwargs,
1144             '_type': 'url_transparent' if url_transparent else 'url',
1145             'url': url,
1146         }
1147
1148     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1149         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1150                 for m in orderedSet(map(getter, matches) if getter else matches))
1151         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1152
1153     @staticmethod
1154     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1155         """Returns a playlist"""
1156         if playlist_id:
1157             kwargs['id'] = playlist_id
1158         if playlist_title:
1159             kwargs['title'] = playlist_title
1160         if playlist_description is not None:
1161             kwargs['description'] = playlist_description
1162         return {
1163             **kwargs,
1164             '_type': 'multi_video' if multi_video else 'playlist',
1165             'entries': entries,
1166         }
1167
1168     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1169         """
1170         Perform a regex search on the given string, using a single or a list of
1171         patterns returning the first matching group.
1172         In case of failure return a default value or raise a WARNING or a
1173         RegexNotFoundError, depending on fatal, specifying the field name.
1174         """
1175         if string is None:
1176             mobj = None
1177         elif isinstance(pattern, (str, re.Pattern)):
1178             mobj = re.search(pattern, string, flags)
1179         else:
1180             for p in pattern:
1181                 mobj = re.search(p, string, flags)
1182                 if mobj:
1183                     break
1184
1185         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1186
1187         if mobj:
1188             if group is None:
1189                 # return the first matching group
1190                 return next(g for g in mobj.groups() if g is not None)
1191             elif isinstance(group, (list, tuple)):
1192                 return tuple(mobj.group(g) for g in group)
1193             else:
1194                 return mobj.group(group)
1195         elif default is not NO_DEFAULT:
1196             return default
1197         elif fatal:
1198             raise RegexNotFoundError('Unable to extract %s' % _name)
1199         else:
1200             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1201             return None
1202
1203     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1204                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1205         """Searches string for the JSON object specified by start_pattern"""
1206         # NB: end_pattern is only used to reduce the size of the initial match
1207         if default is NO_DEFAULT:
1208             default, has_default = {}, False
1209         else:
1210             fatal, has_default = False, True
1211
1212         json_string = self._search_regex(
1213             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1214             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1215         if not json_string:
1216             return default
1217
1218         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1219         try:
1220             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1221         except ExtractorError as e:
1222             if fatal:
1223                 raise ExtractorError(
1224                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1225             elif not has_default:
1226                 self.report_warning(
1227                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1228         return default
1229
1230     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1231         """
1232         Like _search_regex, but strips HTML tags and unescapes entities.
1233         """
1234         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1235         if res:
1236             return clean_html(res).strip()
1237         else:
1238             return res
1239
1240     def _get_netrc_login_info(self, netrc_machine=None):
1241         username = None
1242         password = None
1243         netrc_machine = netrc_machine or self._NETRC_MACHINE
1244
1245         if self.get_param('usenetrc', False):
1246             try:
1247                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1248                 if os.path.isdir(netrc_file):
1249                     netrc_file = os.path.join(netrc_file, '.netrc')
1250                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1251                 if info is not None:
1252                     username = info[0]
1253                     password = info[2]
1254                 else:
1255                     raise netrc.NetrcParseError(
1256                         'No authenticators for %s' % netrc_machine)
1257             except (OSError, netrc.NetrcParseError) as err:
1258                 self.report_warning(
1259                     'parsing .netrc: %s' % error_to_compat_str(err))
1260
1261         return username, password
1262
1263     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1264         """
1265         Get the login info as (username, password)
1266         First look for the manually specified credentials using username_option
1267         and password_option as keys in params dictionary. If no such credentials
1268         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1269         value.
1270         If there's no info available, return (None, None)
1271         """
1272
1273         # Attempt to use provided username and password or .netrc data
1274         username = self.get_param(username_option)
1275         if username is not None:
1276             password = self.get_param(password_option)
1277         else:
1278             username, password = self._get_netrc_login_info(netrc_machine)
1279
1280         return username, password
1281
1282     def _get_tfa_info(self, note='two-factor verification code'):
1283         """
1284         Get the two-factor authentication info
1285         TODO - asking the user will be required for sms/phone verify
1286         currently just uses the command line option
1287         If there's no info available, return None
1288         """
1289
1290         tfa = self.get_param('twofactor')
1291         if tfa is not None:
1292             return tfa
1293
1294         return getpass.getpass('Type %s and press [Return]: ' % note)
1295
1296     # Helper functions for extracting OpenGraph info
1297     @staticmethod
1298     def _og_regexes(prop):
1299         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1300         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1301                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1302         template = r'<meta[^>]+?%s[^>]+?%s'
1303         return [
1304             template % (property_re, content_re),
1305             template % (content_re, property_re),
1306         ]
1307
1308     @staticmethod
1309     def _meta_regex(prop):
1310         return r'''(?isx)<meta
1311                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1312                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1313
1314     def _og_search_property(self, prop, html, name=None, **kargs):
1315         prop = variadic(prop)
1316         if name is None:
1317             name = 'OpenGraph %s' % prop[0]
1318         og_regexes = []
1319         for p in prop:
1320             og_regexes.extend(self._og_regexes(p))
1321         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1322         if escaped is None:
1323             return None
1324         return unescapeHTML(escaped)
1325
1326     def _og_search_thumbnail(self, html, **kargs):
1327         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1328
1329     def _og_search_description(self, html, **kargs):
1330         return self._og_search_property('description', html, fatal=False, **kargs)
1331
1332     def _og_search_title(self, html, *, fatal=False, **kargs):
1333         return self._og_search_property('title', html, fatal=fatal, **kargs)
1334
1335     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1336         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1337         if secure:
1338             regexes = self._og_regexes('video:secure_url') + regexes
1339         return self._html_search_regex(regexes, html, name, **kargs)
1340
1341     def _og_search_url(self, html, **kargs):
1342         return self._og_search_property('url', html, **kargs)
1343
1344     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1345         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1346
1347     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1348         name = variadic(name)
1349         if display_name is None:
1350             display_name = name[0]
1351         return self._html_search_regex(
1352             [self._meta_regex(n) for n in name],
1353             html, display_name, fatal=fatal, group='content', **kwargs)
1354
1355     def _dc_search_uploader(self, html):
1356         return self._html_search_meta('dc.creator', html, 'uploader')
1357
1358     def _rta_search(self, html):
1359         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1360         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1361                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1362                      html):
1363             return 18
1364         return 0
1365
1366     def _media_rating_search(self, html):
1367         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1368         rating = self._html_search_meta('rating', html)
1369
1370         if not rating:
1371             return None
1372
1373         RATING_TABLE = {
1374             'safe for kids': 0,
1375             'general': 8,
1376             '14 years': 14,
1377             'mature': 17,
1378             'restricted': 19,
1379         }
1380         return RATING_TABLE.get(rating.lower())
1381
1382     def _family_friendly_search(self, html):
1383         # See http://schema.org/VideoObject
1384         family_friendly = self._html_search_meta(
1385             'isFamilyFriendly', html, default=None)
1386
1387         if not family_friendly:
1388             return None
1389
1390         RATING_TABLE = {
1391             '1': 0,
1392             'true': 0,
1393             '0': 18,
1394             'false': 18,
1395         }
1396         return RATING_TABLE.get(family_friendly.lower())
1397
1398     def _twitter_search_player(self, html):
1399         return self._html_search_meta('twitter:player', html,
1400                                       'twitter card player')
1401
1402     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1403         """Yield all json ld objects in the html"""
1404         if default is not NO_DEFAULT:
1405             fatal = False
1406         for mobj in re.finditer(JSON_LD_RE, html):
1407             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1408             for json_ld in variadic(json_ld_item):
1409                 if isinstance(json_ld, dict):
1410                     yield json_ld
1411
1412     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1413         """Search for a video in any json ld in the html"""
1414         if default is not NO_DEFAULT:
1415             fatal = False
1416         info = self._json_ld(
1417             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1418             video_id, fatal=fatal, expected_type=expected_type)
1419         if info:
1420             return info
1421         if default is not NO_DEFAULT:
1422             return default
1423         elif fatal:
1424             raise RegexNotFoundError('Unable to extract JSON-LD')
1425         else:
1426             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1427             return {}
1428
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        json_ld may be a JSON string (parsed with _parse_json), a dict, or a
        list/tuple of entries; empty input or any other type yields {}.

        When expected_type is given, entries whose @type does not match are
        skipped and traversal stops after the first entry that is processed.
        Without it, every entry is processed and later entries may overwrite
        keys set by earlier ones.

        The collected dict is passed through filter_dict before being returned.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps the last path component of an interactionType to the prefix of
        # the '<prefix>_count' info key it feeds
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        # True if the @type of e (a single value or several) contains any of expected_types
        def is_type(e, *expected_types):
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        # interactionType may be given directly or as an object carrying @type
        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        # Fills the '*_count' keys of info from InteractionCounter entries;
        # a count that is already set is never overwritten
        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        # Builds info['chapters'] from the 'Clip' parts listed under hasPart
        def extract_chapter_information(e):
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk (previous, current, next) triples to fill missing offsets from
            # the neighbours. zip() stops at the shortest input, so the last
            # chapter is not visited here; its end time is handled below.
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # info['duration'] is set by extract_video_object before this helper is called
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        # Copies the VideoObject fields into info, then its statistics and chapters
        def extract_video_object(e):
            assert is_type(e, 'VideoObject')
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': url}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in json_ld:
                # Top-level entries must carry an @context
                if at_top_level and '@context' not in e:
                    continue
                # An entry holding nothing but @context and @graph is a wrapper:
                # process the graph members instead and stop
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title if none was set before
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first entry of a video/subjectOf list is considered
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject'):
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any entry type may additionally embed a VideoObject under 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                # Without an expected type all entries are merged; with one,
                # the first processed entry ends the traversal
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        return filter_dict(info)
1593
1594     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1595         return self._parse_json(
1596             self._search_regex(
1597                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1598                 webpage, 'next.js data', fatal=fatal, **kw),
1599             video_id, transform_source=transform_source, fatal=fatal)
1600
1601     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1602         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1603         rectx = re.escape(context_name)
1604         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1605         js, arg_keys, arg_vals = self._search_regex(
1606             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1607             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1608
1609         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1610
1611         for key, val in args.items():
1612             if val in ('undefined', 'void 0'):
1613                 args[key] = 'null'
1614
1615         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1616         return traverse_obj(ret, traverse) or {}
1617
1618     @staticmethod
1619     def _hidden_inputs(html):
1620         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1621         hidden_inputs = {}
1622         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1623             attrs = extract_attributes(input)
1624             if not input:
1625                 continue
1626             if attrs.get('type') not in ('hidden', 'submit'):
1627                 continue
1628             name = attrs.get('name') or attrs.get('id')
1629             value = attrs.get('value')
1630             if name and value is not None:
1631                 hidden_inputs[name] = value
1632         return hidden_inputs
1633
1634     def _form_hidden_inputs(self, form_id, html):
1635         form = self._search_regex(
1636             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1637             html, '%s form' % form_id, group='form')
1638         return self._hidden_inputs(form)
1639
1640     class FormatSort:
        # Syntax of a single sort item: an optional '+' (sets 'reverse'), the
        # field name, and optionally a ':' or '~' separator followed by a limit
        # ('~' requests the 'closest' mode, see evaluate_params).
        # An item consisting only of spaces matches with field=None.
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Sort order that evaluate_params appends after the user's and the
        # extractor's own sort fields
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): not referenced in the visible part of this class;
        # presumably the youtube-dl compatible sort order - confirm at the use site
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')
1649
        # Per-field sort settings, keyed by the field name usable in a sort item.
        # Keys read by the methods of this class (_get_field_setting supplies
        # defaults for missing ones and caches them in these dicts):
        #   type       - 'field' (the default), 'ordered', 'extractor', 'combined',
        #                'multiple', 'boolean' or 'alias'
        #   field      - underlying field name(s); for an alias, the field it stands for
        #   convert    - how a limit is converted by _resolve_field_value
        #   order      - preference list of an 'ordered' field
        #   order_free - used instead of 'order' when 'prefer_free_formats' is set
        #   regex      - the 'order' entries are regexes matched against the value
        #   forced / priority - fields of `default` that evaluate_params puts first
        #   same_limit - a 'combined' field whose sub-fields all share one limit
        #   deprecated - using this alias emits a deprecation warning
        # NOTE(review): 'visible', 'max', 'default', 'function' and 'not_in_list'
        # are consumed outside the visible part of the file.
        # NOTE: evaluate_params also stores 'reverse', 'closest', 'limit_text' and
        # 'limit' in these dicts at runtime.
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # Fields that expand to, or are computed from, several other fields
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1723
1724         def __init__(self, ie, field_preference):
1725             self._order = []
1726             self.ydl = ie._downloader
1727             self.evaluate_params(self.ydl.params, field_preference)
1728             if ie.get_param('verbose'):
1729                 self.print_verbose_info(self.ydl.write_debug)
1730
1731         def _get_field_setting(self, field, key):
1732             if field not in self.settings:
1733                 if key in ('forced', 'priority'):
1734                     return False
1735                 self.ydl.deprecation_warning(
1736                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1737                     'and may be removed in a future version')
1738                 self.settings[field] = {}
1739             propObj = self.settings[field]
1740             if key not in propObj:
1741                 type = propObj.get('type')
1742                 if key == 'field':
1743                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1744                 elif key == 'convert':
1745                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1746                 else:
1747                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1748                 propObj[key] = default
1749             return propObj[key]
1750
1751         def _resolve_field_value(self, field, value, convertNone=False):
1752             if value is None:
1753                 if not convertNone:
1754                     return None
1755             else:
1756                 value = value.lower()
1757             conversion = self._get_field_setting(field, 'convert')
1758             if conversion == 'ignore':
1759                 return None
1760             if conversion == 'string':
1761                 return value
1762             elif conversion == 'float_none':
1763                 return float_or_none(value)
1764             elif conversion == 'bytes':
1765                 return FileDownloader.parse_bytes(value)
1766             elif conversion == 'order':
1767                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1768                 use_regex = self._get_field_setting(field, 'regex')
1769                 list_length = len(order_list)
1770                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1771                 if use_regex and value is not None:
1772                     for i, regex in enumerate(order_list):
1773                         if regex and re.match(regex, value):
1774                             return list_length - i
1775                     return list_length - empty_pos  # not in list
1776                 else:  # not regex or  value = None
1777                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1778             else:
1779                 if value.isnumeric():
1780                     return float(value)
1781                 else:
1782                     self.settings[field]['convert'] = 'string'
1783                     return value
1784
        def evaluate_params(self, params, sort_extractor):
            """
            Build self._order and the per-field limits from all sort sources.

            The sort items are evaluated in this order: the 'forced' fields of
            self.default, its 'priority' fields (left out when the
            'format_sort_force' param is set), the user's 'format_sort' param,
            sort_extractor, and finally all of self.default. A field is only
            registered the first time it is seen.

            Raises ExtractorError for an item that does not match self.regex.
            """
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            # Registers one field in self._order and records its sort data in
            # self.settings; a field that is already registered is left untouched
            def add_item(field, reverse, closest, limit_text):
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    # 'closest' is only meaningful together with a limit
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    # NOTE(review): the message names the extractor although
                    # sort_list also contains the user's items
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                # An item made up only of spaces matches without a field
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    # Replace the alias by the field it stands for
                    alias, field = field, self._get_field_setting(field, 'field')
                    if self._get_field_setting(alias, 'deprecated'):
                        self.ydl.deprecation_warning(
                            f'Format sorting alias {alias} is deprecated '
                            f'and may be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                # A 'combined' field expands to its sub-fields. Its limit is either
                # one value shared by all of them or a ':'-separated list with one
                # entry per sub-field; sub-fields beyond the list get no limit.
                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)
1841
1842         def print_verbose_info(self, write_debug):
1843             if self._sort_user:
1844                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1845             if self._sort_extractor:
1846                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1847             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1848                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1849                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1850                               self._get_field_setting(field, 'limit_text'),
1851                               self._get_field_setting(field, 'limit'))
1852                 if self._get_field_setting(field, 'limit_text') is not None else '')
1853                 for field in self._order if self._get_field_setting(field, 'visible')]))
1854
1855         def _calculate_field_preference_from_value(self, format, field, type, value):
1856             reverse = self._get_field_setting(field, 'reverse')
1857             closest = self._get_field_setting(field, 'closest')
1858             limit = self._get_field_setting(field, 'limit')
1859
1860             if type == 'extractor':
1861                 maximum = self._get_field_setting(field, 'max')
1862                 if value is None or (maximum is not None and value >= maximum):
1863                     value = -1
1864             elif type == 'boolean':
1865                 in_list = self._get_field_setting(field, 'in_list')
1866                 not_in_list = self._get_field_setting(field, 'not_in_list')
1867                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1868             elif type == 'ordered':
1869                 value = self._resolve_field_value(field, value, True)
1870
1871             # try to convert to number
1872             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1873             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1874             if is_num:
1875                 value = val_num
1876
1877             return ((-10, 0) if value is None
1878                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1879                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1880                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1881                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1882                     else (-1, value, 0))
1883
1884         def _calculate_field_preference(self, format, field):
1885             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1886             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1887             if type == 'multiple':
1888                 type = 'field'  # Only 'field' is allowed in multiple for now
1889                 actual_fields = self._get_field_setting(field, 'field')
1890
1891                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1892             else:
1893                 value = get_value(field)
1894             return self._calculate_field_preference_from_value(format, field, type, value)
1895
1896         def calculate_preference(self, format):
1897             # Determine missing protocol
1898             if not format.get('protocol'):
1899                 format['protocol'] = determine_protocol(format)
1900
1901             # Determine missing ext
1902             if not format.get('ext') and 'url' in format:
1903                 format['ext'] = determine_ext(format['url'])
1904             if format.get('vcodec') == 'none':
1905                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1906                 format['video_ext'] = 'none'
1907             else:
1908                 format['video_ext'] = format['ext']
1909                 format['audio_ext'] = 'none'
1910             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1911             #    format['preference'] = -1000
1912
1913             # Determine missing bitrates
1914             if format.get('tbr') is None:
1915                 if format.get('vbr') is not None and format.get('abr') is not None:
1916                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1917             else:
1918                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1919                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1920                 if format.get('acodec') != 'none' and format.get('abr') is None:
1921                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1922
1923             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1924
1925     def _sort_formats(self, formats, field_preference=[]):
1926         if not formats:
1927             return
1928         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1929
1930     def _check_formats(self, formats, video_id):
1931         if formats:
1932             formats[:] = filter(
1933                 lambda f: self._is_valid_url(
1934                     f['url'], video_id,
1935                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1936                 formats)
1937
1938     @staticmethod
1939     def _remove_duplicate_formats(formats):
1940         format_urls = set()
1941         unique_formats = []
1942         for f in formats:
1943             if f['url'] not in format_urls:
1944                 format_urls.add(f['url'])
1945                 unique_formats.append(f)
1946         formats[:] = unique_formats
1947
1948     def _is_valid_url(self, url, video_id, item='video', headers={}):
1949         url = self._proto_relative_url(url, scheme='http:')
1950         # For now assume non HTTP(S) URLs always valid
1951         if not (url.startswith('http://') or url.startswith('https://')):
1952             return True
1953         try:
1954             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1955             return True
1956         except ExtractorError as e:
1957             self.to_screen(
1958                 '%s: %s URL is invalid, skipping: %s'
1959                 % (video_id, item, error_to_compat_str(e.cause)))
1960             return False
1961
1962     def http_scheme(self):
1963         """ Either "http:" or "https:", depending on the user's preferences """
1964         return (
1965             'http:'
1966             if self.get_param('prefer_insecure', False)
1967             else 'https:')
1968
1969     def _proto_relative_url(self, url, scheme=None):
1970         if url is None:
1971             return url
1972         if url.startswith('//'):
1973             if scheme is None:
1974                 scheme = self.http_scheme()
1975             return scheme + url
1976         else:
1977             return url
1978
1979     def _sleep(self, timeout, video_id, msg_template=None):
1980         if msg_template is None:
1981             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1982         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1983         self.to_screen(msg)
1984         time.sleep(timeout)
1985
1986     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1987                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1988                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1989         res = self._download_xml_handle(
1990             manifest_url, video_id, 'Downloading f4m manifest',
1991             'Unable to download f4m manifest',
1992             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1993             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1994             transform_source=transform_source,
1995             fatal=fatal, data=data, headers=headers, query=query)
1996         if res is False:
1997             return []
1998
1999         manifest, urlh = res
2000         manifest_url = urlh.geturl()
2001
2002         return self._parse_f4m_formats(
2003             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2004             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2005
2006     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2007                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2008                            fatal=True, m3u8_id=None):
2009         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2010             return []
2011
2012         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2013         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2014         if akamai_pv is not None and ';' in akamai_pv.text:
2015             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2016             if playerVerificationChallenge.strip() != '':
2017                 return []
2018
2019         formats = []
2020         manifest_version = '1.0'
2021         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2022         if not media_nodes:
2023             manifest_version = '2.0'
2024             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2025         # Remove unsupported DRM protected media from final formats
2026         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2027         media_nodes = remove_encrypted_media(media_nodes)
2028         if not media_nodes:
2029             return formats
2030
2031         manifest_base_url = get_base_url(manifest)
2032
2033         bootstrap_info = xpath_element(
2034             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2035             'bootstrap info', default=None)
2036
2037         vcodec = None
2038         mime_type = xpath_text(
2039             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2040             'base URL', default=None)
2041         if mime_type and mime_type.startswith('audio/'):
2042             vcodec = 'none'
2043
2044         for i, media_el in enumerate(media_nodes):
2045             tbr = int_or_none(media_el.attrib.get('bitrate'))
2046             width = int_or_none(media_el.attrib.get('width'))
2047             height = int_or_none(media_el.attrib.get('height'))
2048             format_id = join_nonempty(f4m_id, tbr or i)
2049             # If <bootstrapInfo> is present, the specified f4m is a
2050             # stream-level manifest, and only set-level manifests may refer to
2051             # external resources.  See section 11.4 and section 4 of F4M spec
2052             if bootstrap_info is None:
2053                 media_url = None
2054                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2055                 if manifest_version == '2.0':
2056                     media_url = media_el.attrib.get('href')
2057                 if media_url is None:
2058                     media_url = media_el.attrib.get('url')
2059                 if not media_url:
2060                     continue
2061                 manifest_url = (
2062                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2063                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2064                 # If media_url is itself a f4m manifest do the recursive extraction
2065                 # since bitrates in parent manifest (this one) and media_url manifest
2066                 # may differ leading to inability to resolve the format by requested
2067                 # bitrate in f4m downloader
2068                 ext = determine_ext(manifest_url)
2069                 if ext == 'f4m':
2070                     f4m_formats = self._extract_f4m_formats(
2071                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2072                         transform_source=transform_source, fatal=fatal)
2073                     # Sometimes stream-level manifest contains single media entry that
2074                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2075                     # At the same time parent's media entry in set-level manifest may
2076                     # contain it. We will copy it from parent in such cases.
2077                     if len(f4m_formats) == 1:
2078                         f = f4m_formats[0]
2079                         f.update({
2080                             'tbr': f.get('tbr') or tbr,
2081                             'width': f.get('width') or width,
2082                             'height': f.get('height') or height,
2083                             'format_id': f.get('format_id') if not tbr else format_id,
2084                             'vcodec': vcodec,
2085                         })
2086                     formats.extend(f4m_formats)
2087                     continue
2088                 elif ext == 'm3u8':
2089                     formats.extend(self._extract_m3u8_formats(
2090                         manifest_url, video_id, 'mp4', preference=preference,
2091                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2092                     continue
2093             formats.append({
2094                 'format_id': format_id,
2095                 'url': manifest_url,
2096                 'manifest_url': manifest_url,
2097                 'ext': 'flv' if bootstrap_info is not None else None,
2098                 'protocol': 'f4m',
2099                 'tbr': tbr,
2100                 'width': width,
2101                 'height': height,
2102                 'vcodec': vcodec,
2103                 'preference': preference,
2104                 'quality': quality,
2105             })
2106         return formats
2107
2108     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2109         return {
2110             'format_id': join_nonempty(m3u8_id, 'meta'),
2111             'url': m3u8_url,
2112             'ext': ext,
2113             'protocol': 'm3u8',
2114             'preference': preference - 100 if preference else -100,
2115             'quality': quality,
2116             'resolution': 'multiple',
2117             'format_note': 'Quality selection URL',
2118         }
2119
2120     def _report_ignoring_subs(self, name):
2121         self.report_warning(bug_reports_message(
2122             f'Ignoring subtitle tracks found in the {name} manifest; '
2123             'if any subtitle tracks are missing,'
2124         ), only_once=True)
2125
2126     def _extract_m3u8_formats(self, *args, **kwargs):
2127         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2128         if subs:
2129             self._report_ignoring_subs('HLS')
2130         return fmts
2131
2132     def _extract_m3u8_formats_and_subtitles(
2133             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2134             preference=None, quality=None, m3u8_id=None, note=None,
2135             errnote=None, fatal=True, live=False, data=None, headers={},
2136             query={}):
2137
2138         res = self._download_webpage_handle(
2139             m3u8_url, video_id,
2140             note='Downloading m3u8 information' if note is None else note,
2141             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2142             fatal=fatal, data=data, headers=headers, query=query)
2143
2144         if res is False:
2145             return [], {}
2146
2147         m3u8_doc, urlh = res
2148         m3u8_url = urlh.geturl()
2149
2150         return self._parse_m3u8_formats_and_subtitles(
2151             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2152             preference=preference, quality=quality, m3u8_id=m3u8_id,
2153             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2154             headers=headers, query=query, video_id=video_id)
2155
2156     def _parse_m3u8_formats_and_subtitles(
2157             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2158             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2159             errnote=None, fatal=True, data=None, headers={}, query={},
2160             video_id=None):
2161         formats, subtitles = [], {}
2162
2163         has_drm = re.search('|'.join([
2164             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2165             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2166         ]), m3u8_doc)
2167
2168         def format_url(url):
2169             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2170
2171         if self.get_param('hls_split_discontinuity', False):
2172             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2173                 if not m3u8_doc:
2174                     if not manifest_url:
2175                         return []
2176                     m3u8_doc = self._download_webpage(
2177                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2178                         note=False, errnote='Failed to download m3u8 playlist information')
2179                     if m3u8_doc is False:
2180                         return []
2181                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2182
2183         else:
2184             def _extract_m3u8_playlist_indices(*args, **kwargs):
2185                 return [None]
2186
2187         # References:
2188         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2189         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2190         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2191
2192         # We should try extracting formats only from master playlists [1, 4.3.4],
2193         # i.e. playlists that describe available qualities. On the other hand
2194         # media playlists [1, 4.3.3] should be returned as is since they contain
2195         # just the media without qualities renditions.
2196         # Fortunately, master playlist can be easily distinguished from media
2197         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2198         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2199         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2200         # media playlist and MUST NOT appear in master playlist thus we can
2201         # clearly detect media playlist with this criterion.
2202
2203         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2204             formats = [{
2205                 'format_id': join_nonempty(m3u8_id, idx),
2206                 'format_index': idx,
2207                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2208                 'ext': ext,
2209                 'protocol': entry_protocol,
2210                 'preference': preference,
2211                 'quality': quality,
2212                 'has_drm': has_drm,
2213             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2214
2215             return formats, subtitles
2216
2217         groups = {}
2218         last_stream_inf = {}
2219
2220         def extract_media(x_media_line):
2221             media = parse_m3u8_attributes(x_media_line)
2222             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2223             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2224             if not (media_type and group_id and name):
2225                 return
2226             groups.setdefault(group_id, []).append(media)
2227             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2228             if media_type == 'SUBTITLES':
2229                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2230                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2231                 # However, lack of URI has been spotted in the wild.
2232                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2233                 if not media.get('URI'):
2234                     return
2235                 url = format_url(media['URI'])
2236                 sub_info = {
2237                     'url': url,
2238                     'ext': determine_ext(url),
2239                 }
2240                 if sub_info['ext'] == 'm3u8':
2241                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2242                     # files may contain is WebVTT:
2243                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2244                     sub_info['ext'] = 'vtt'
2245                     sub_info['protocol'] = 'm3u8_native'
2246                 lang = media.get('LANGUAGE') or 'und'
2247                 subtitles.setdefault(lang, []).append(sub_info)
2248             if media_type not in ('VIDEO', 'AUDIO'):
2249                 return
2250             media_url = media.get('URI')
2251             if media_url:
2252                 manifest_url = format_url(media_url)
2253                 formats.extend({
2254                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2255                     'format_note': name,
2256                     'format_index': idx,
2257                     'url': manifest_url,
2258                     'manifest_url': m3u8_url,
2259                     'language': media.get('LANGUAGE'),
2260                     'ext': ext,
2261                     'protocol': entry_protocol,
2262                     'preference': preference,
2263                     'quality': quality,
2264                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2265                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2266
2267         def build_stream_name():
2268             # Despite specification does not mention NAME attribute for
2269             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2270             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2271             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2272             stream_name = last_stream_inf.get('NAME')
2273             if stream_name:
2274                 return stream_name
2275             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2276             # from corresponding rendition group
2277             stream_group_id = last_stream_inf.get('VIDEO')
2278             if not stream_group_id:
2279                 return
2280             stream_group = groups.get(stream_group_id)
2281             if not stream_group:
2282                 return stream_group_id
2283             rendition = stream_group[0]
2284             return rendition.get('NAME') or stream_group_id
2285
2286         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2287         # chance to detect video only formats when EXT-X-STREAM-INF tags
2288         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2289         for line in m3u8_doc.splitlines():
2290             if line.startswith('#EXT-X-MEDIA:'):
2291                 extract_media(line)
2292
2293         for line in m3u8_doc.splitlines():
2294             if line.startswith('#EXT-X-STREAM-INF:'):
2295                 last_stream_inf = parse_m3u8_attributes(line)
2296             elif line.startswith('#') or not line.strip():
2297                 continue
2298             else:
2299                 tbr = float_or_none(
2300                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2301                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2302                 manifest_url = format_url(line.strip())
2303
2304                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2305                     format_id = [m3u8_id, None, idx]
2306                     # Bandwidth of live streams may differ over time thus making
2307                     # format_id unpredictable. So it's better to keep provided
2308                     # format_id intact.
2309                     if not live:
2310                         stream_name = build_stream_name()
2311                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2312                     f = {
2313                         'format_id': join_nonempty(*format_id),
2314                         'format_index': idx,
2315                         'url': manifest_url,
2316                         'manifest_url': m3u8_url,
2317                         'tbr': tbr,
2318                         'ext': ext,
2319                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2320                         'protocol': entry_protocol,
2321                         'preference': preference,
2322                         'quality': quality,
2323                     }
2324                     resolution = last_stream_inf.get('RESOLUTION')
2325                     if resolution:
2326                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2327                         if mobj:
2328                             f['width'] = int(mobj.group('width'))
2329                             f['height'] = int(mobj.group('height'))
2330                     # Unified Streaming Platform
2331                     mobj = re.search(
2332                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2333                     if mobj:
2334                         abr, vbr = mobj.groups()
2335                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2336                         f.update({
2337                             'vbr': vbr,
2338                             'abr': abr,
2339                         })
2340                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2341                     f.update(codecs)
2342                     audio_group_id = last_stream_inf.get('AUDIO')
2343                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2344                     # references a rendition group MUST have a CODECS attribute.
2345                     # However, this is not always respected, for example, [2]
2346                     # contains EXT-X-STREAM-INF tag which references AUDIO
2347                     # rendition group but does not have CODECS and despite
2348                     # referencing an audio group it represents a complete
2349                     # (with audio and video) format. So, for such cases we will
2350                     # ignore references to rendition groups and treat them
2351                     # as complete formats.
2352                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2353                         audio_group = groups.get(audio_group_id)
2354                         if audio_group and audio_group[0].get('URI'):
2355                             # TODO: update acodec for audio only formats with
2356                             # the same GROUP-ID
2357                             f['acodec'] = 'none'
2358                     if not f.get('ext'):
2359                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2360                     formats.append(f)
2361
2362                     # for DailyMotion
2363                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2364                     if progressive_uri:
2365                         http_f = f.copy()
2366                         del http_f['manifest_url']
2367                         http_f.update({
2368                             'format_id': f['format_id'].replace('hls-', 'http-'),
2369                             'protocol': 'http',
2370                             'url': progressive_uri,
2371                         })
2372                         formats.append(http_f)
2373
2374                 last_stream_inf = {}
2375         return formats, subtitles
2376
2377     def _extract_m3u8_vod_duration(
2378             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2379
2380         m3u8_vod = self._download_webpage(
2381             m3u8_vod_url, video_id,
2382             note='Downloading m3u8 VOD manifest' if note is None else note,
2383             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2384             fatal=False, data=data, headers=headers, query=query)
2385
2386         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2387
2388     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2389         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2390             return None
2391
2392         return int(sum(
2393             float(line[len('#EXTINF:'):].split(',')[0])
2394             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2395
2396     @staticmethod
2397     def _xpath_ns(path, namespace=None):
2398         if not namespace:
2399             return path
2400         out = []
2401         for c in path.split('/'):
2402             if not c or c == '.':
2403                 out.append(c)
2404             else:
2405                 out.append('{%s}%s' % (namespace, c))
2406         return '/'.join(out)
2407
2408     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2409         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2410         if res is False:
2411             assert not fatal
2412             return [], {}
2413
2414         smil, urlh = res
2415         smil_url = urlh.geturl()
2416
2417         namespace = self._parse_smil_namespace(smil)
2418
2419         fmts = self._parse_smil_formats(
2420             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2421         subs = self._parse_smil_subtitles(
2422             smil, namespace=namespace)
2423
2424         return fmts, subs
2425
2426     def _extract_smil_formats(self, *args, **kwargs):
2427         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2428         if subs:
2429             self._report_ignoring_subs('SMIL')
2430         return fmts
2431
2432     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2433         res = self._download_smil(smil_url, video_id, fatal=fatal)
2434         if res is False:
2435             return {}
2436
2437         smil, urlh = res
2438         smil_url = urlh.geturl()
2439
2440         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2441
2442     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2443         return self._download_xml_handle(
2444             smil_url, video_id, 'Downloading SMIL file',
2445             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2446
2447     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2448         namespace = self._parse_smil_namespace(smil)
2449
2450         formats = self._parse_smil_formats(
2451             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2452         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2453
2454         video_id = os.path.splitext(url_basename(smil_url))[0]
2455         title = None
2456         description = None
2457         upload_date = None
2458         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2459             name = meta.attrib.get('name')
2460             content = meta.attrib.get('content')
2461             if not name or not content:
2462                 continue
2463             if not title and name == 'title':
2464                 title = content
2465             elif not description and name in ('description', 'abstract'):
2466                 description = content
2467             elif not upload_date and name == 'date':
2468                 upload_date = unified_strdate(content)
2469
2470         thumbnails = [{
2471             'id': image.get('type'),
2472             'url': image.get('src'),
2473             'width': int_or_none(image.get('width')),
2474             'height': int_or_none(image.get('height')),
2475         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2476
2477         return {
2478             'id': video_id,
2479             'title': title or video_id,
2480             'description': description,
2481             'upload_date': upload_date,
2482             'thumbnails': thumbnails,
2483             'formats': formats,
2484             'subtitles': subtitles,
2485         }
2486
2487     def _parse_smil_namespace(self, smil):
2488         return self._search_regex(
2489             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2490
2491     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2492         base = smil_url
2493         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2494             b = meta.get('base') or meta.get('httpBase')
2495             if b:
2496                 base = b
2497                 break
2498
2499         formats = []
2500         rtmp_count = 0
2501         http_count = 0
2502         m3u8_count = 0
2503         imgs_count = 0
2504
2505         srcs = set()
2506         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2507         for medium in media:
2508             src = medium.get('src')
2509             if not src or src in srcs:
2510                 continue
2511             srcs.add(src)
2512
2513             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2514             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2515             width = int_or_none(medium.get('width'))
2516             height = int_or_none(medium.get('height'))
2517             proto = medium.get('proto')
2518             ext = medium.get('ext')
2519             src_ext = determine_ext(src)
2520             streamer = medium.get('streamer') or base
2521
2522             if proto == 'rtmp' or streamer.startswith('rtmp'):
2523                 rtmp_count += 1
2524                 formats.append({
2525                     'url': streamer,
2526                     'play_path': src,
2527                     'ext': 'flv',
2528                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2529                     'tbr': bitrate,
2530                     'filesize': filesize,
2531                     'width': width,
2532                     'height': height,
2533                 })
2534                 if transform_rtmp_url:
2535                     streamer, src = transform_rtmp_url(streamer, src)
2536                     formats[-1].update({
2537                         'url': streamer,
2538                         'play_path': src,
2539                     })
2540                 continue
2541
2542             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2543             src_url = src_url.strip()
2544
2545             if proto == 'm3u8' or src_ext == 'm3u8':
2546                 m3u8_formats = self._extract_m3u8_formats(
2547                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2548                 if len(m3u8_formats) == 1:
2549                     m3u8_count += 1
2550                     m3u8_formats[0].update({
2551                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2552                         'tbr': bitrate,
2553                         'width': width,
2554                         'height': height,
2555                     })
2556                 formats.extend(m3u8_formats)
2557             elif src_ext == 'f4m':
2558                 f4m_url = src_url
2559                 if not f4m_params:
2560                     f4m_params = {
2561                         'hdcore': '3.2.0',
2562                         'plugin': 'flowplayer-3.2.0.1',
2563                     }
2564                 f4m_url += '&' if '?' in f4m_url else '?'
2565                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2566                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2567             elif src_ext == 'mpd':
2568                 formats.extend(self._extract_mpd_formats(
2569                     src_url, video_id, mpd_id='dash', fatal=False))
2570             elif re.search(r'\.ism/[Mm]anifest', src_url):
2571                 formats.extend(self._extract_ism_formats(
2572                     src_url, video_id, ism_id='mss', fatal=False))
2573             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2574                 http_count += 1
2575                 formats.append({
2576                     'url': src_url,
2577                     'ext': ext or src_ext or 'flv',
2578                     'format_id': 'http-%d' % (bitrate or http_count),
2579                     'tbr': bitrate,
2580                     'filesize': filesize,
2581                     'width': width,
2582                     'height': height,
2583                 })
2584
2585         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2586             src = medium.get('src')
2587             if not src or src in srcs:
2588                 continue
2589             srcs.add(src)
2590
2591             imgs_count += 1
2592             formats.append({
2593                 'format_id': 'imagestream-%d' % (imgs_count),
2594                 'url': src,
2595                 'ext': mimetype2ext(medium.get('type')),
2596                 'acodec': 'none',
2597                 'vcodec': 'none',
2598                 'width': int_or_none(medium.get('width')),
2599                 'height': int_or_none(medium.get('height')),
2600                 'format_note': 'SMIL storyboards',
2601             })
2602
2603         return formats
2604
2605     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2606         urls = []
2607         subtitles = {}
2608         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2609             src = textstream.get('src')
2610             if not src or src in urls:
2611                 continue
2612             urls.append(src)
2613             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2614             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2615             subtitles.setdefault(lang, []).append({
2616                 'url': src,
2617                 'ext': ext,
2618             })
2619         return subtitles
2620
2621     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2622         res = self._download_xml_handle(
2623             xspf_url, playlist_id, 'Downloading xpsf playlist',
2624             'Unable to download xspf manifest', fatal=fatal)
2625         if res is False:
2626             return []
2627
2628         xspf, urlh = res
2629         xspf_url = urlh.geturl()
2630
2631         return self._parse_xspf(
2632             xspf, playlist_id, xspf_url=xspf_url,
2633             xspf_base_url=base_url(xspf_url))
2634
2635     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2636         NS_MAP = {
2637             'xspf': 'http://xspf.org/ns/0/',
2638             's1': 'http://static.streamone.nl/player/ns/0',
2639         }
2640
2641         entries = []
2642         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2643             title = xpath_text(
2644                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2645             description = xpath_text(
2646                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2647             thumbnail = xpath_text(
2648                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2649             duration = float_or_none(
2650                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2651
2652             formats = []
2653             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2654                 format_url = urljoin(xspf_base_url, location.text)
2655                 if not format_url:
2656                     continue
2657                 formats.append({
2658                     'url': format_url,
2659                     'manifest_url': xspf_url,
2660                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2661                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2662                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2663                 })
2664             self._sort_formats(formats)
2665
2666             entries.append({
2667                 'id': playlist_id,
2668                 'title': title,
2669                 'description': description,
2670                 'thumbnail': thumbnail,
2671                 'duration': duration,
2672                 'formats': formats,
2673             })
2674         return entries
2675
2676     def _extract_mpd_formats(self, *args, **kwargs):
2677         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2678         if subs:
2679             self._report_ignoring_subs('DASH')
2680         return fmts
2681
2682     def _extract_mpd_formats_and_subtitles(
2683             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2684             fatal=True, data=None, headers={}, query={}):
2685         res = self._download_xml_handle(
2686             mpd_url, video_id,
2687             note='Downloading MPD manifest' if note is None else note,
2688             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2689             fatal=fatal, data=data, headers=headers, query=query)
2690         if res is False:
2691             return [], {}
2692         mpd_doc, urlh = res
2693         if mpd_doc is None:
2694             return [], {}
2695
2696         # We could have been redirected to a new url when we retrieved our mpd file.
2697         mpd_url = urlh.geturl()
2698         mpd_base_url = base_url(mpd_url)
2699
2700         return self._parse_mpd_formats_and_subtitles(
2701             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2702
2703     def _parse_mpd_formats(self, *args, **kwargs):
2704         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2705         if subs:
2706             self._report_ignoring_subs('DASH')
2707         return fmts
2708
2709     def _parse_mpd_formats_and_subtitles(
2710             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2711         """
2712         Parse formats from MPD manifest.
2713         References:
2714          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2715             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2716          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2717         """
2718         if not self.get_param('dynamic_mpd', True):
2719             if mpd_doc.get('type') == 'dynamic':
2720                 return [], {}
2721
2722         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2723
2724         def _add_ns(path):
2725             return self._xpath_ns(path, namespace)
2726
2727         def is_drm_protected(element):
2728             return element.find(_add_ns('ContentProtection')) is not None
2729
2730         def extract_multisegment_info(element, ms_parent_info):
2731             ms_info = ms_parent_info.copy()
2732
2733             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2734             # common attributes and elements.  We will only extract relevant
2735             # for us.
2736             def extract_common(source):
2737                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2738                 if segment_timeline is not None:
2739                     s_e = segment_timeline.findall(_add_ns('S'))
2740                     if s_e:
2741                         ms_info['total_number'] = 0
2742                         ms_info['s'] = []
2743                         for s in s_e:
2744                             r = int(s.get('r', 0))
2745                             ms_info['total_number'] += 1 + r
2746                             ms_info['s'].append({
2747                                 't': int(s.get('t', 0)),
2748                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2749                                 'd': int(s.attrib['d']),
2750                                 'r': r,
2751                             })
2752                 start_number = source.get('startNumber')
2753                 if start_number:
2754                     ms_info['start_number'] = int(start_number)
2755                 timescale = source.get('timescale')
2756                 if timescale:
2757                     ms_info['timescale'] = int(timescale)
2758                 segment_duration = source.get('duration')
2759                 if segment_duration:
2760                     ms_info['segment_duration'] = float(segment_duration)
2761
2762             def extract_Initialization(source):
2763                 initialization = source.find(_add_ns('Initialization'))
2764                 if initialization is not None:
2765                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2766
2767             segment_list = element.find(_add_ns('SegmentList'))
2768             if segment_list is not None:
2769                 extract_common(segment_list)
2770                 extract_Initialization(segment_list)
2771                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2772                 if segment_urls_e:
2773                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2774             else:
2775                 segment_template = element.find(_add_ns('SegmentTemplate'))
2776                 if segment_template is not None:
2777                     extract_common(segment_template)
2778                     media = segment_template.get('media')
2779                     if media:
2780                         ms_info['media'] = media
2781                     initialization = segment_template.get('initialization')
2782                     if initialization:
2783                         ms_info['initialization'] = initialization
2784                     else:
2785                         extract_Initialization(segment_template)
2786             return ms_info
2787
2788         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2789         formats, subtitles = [], {}
2790         stream_numbers = collections.defaultdict(int)
2791         for period in mpd_doc.findall(_add_ns('Period')):
2792             period_duration = parse_duration(period.get('duration')) or mpd_duration
2793             period_ms_info = extract_multisegment_info(period, {
2794                 'start_number': 1,
2795                 'timescale': 1,
2796             })
2797             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2798                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2799                 for representation in adaptation_set.findall(_add_ns('Representation')):
2800                     representation_attrib = adaptation_set.attrib.copy()
2801                     representation_attrib.update(representation.attrib)
2802                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2803                     mime_type = representation_attrib['mimeType']
2804                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2805
2806                     codec_str = representation_attrib.get('codecs', '')
2807                     # Some kind of binary subtitle found in some youtube livestreams
2808                     if mime_type == 'application/x-rawcc':
2809                         codecs = {'scodec': codec_str}
2810                     else:
2811                         codecs = parse_codecs(codec_str)
2812                     if content_type not in ('video', 'audio', 'text'):
2813                         if mime_type == 'image/jpeg':
2814                             content_type = mime_type
2815                         elif codecs.get('vcodec', 'none') != 'none':
2816                             content_type = 'video'
2817                         elif codecs.get('acodec', 'none') != 'none':
2818                             content_type = 'audio'
2819                         elif codecs.get('scodec', 'none') != 'none':
2820                             content_type = 'text'
2821                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2822                             content_type = 'text'
2823                         else:
2824                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2825                             continue
2826
2827                     base_url = ''
2828                     for element in (representation, adaptation_set, period, mpd_doc):
2829                         base_url_e = element.find(_add_ns('BaseURL'))
2830                         if base_url_e is not None:
2831                             base_url = base_url_e.text + base_url
2832                             if re.match(r'^https?://', base_url):
2833                                 break
2834                     if mpd_base_url and base_url.startswith('/'):
2835                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2836                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2837                         if not mpd_base_url.endswith('/'):
2838                             mpd_base_url += '/'
2839                         base_url = mpd_base_url + base_url
2840                     representation_id = representation_attrib.get('id')
2841                     lang = representation_attrib.get('lang')
2842                     url_el = representation.find(_add_ns('BaseURL'))
2843                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2844                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2845                     if representation_id is not None:
2846                         format_id = representation_id
2847                     else:
2848                         format_id = content_type
2849                     if mpd_id:
2850                         format_id = mpd_id + '-' + format_id
2851                     if content_type in ('video', 'audio'):
2852                         f = {
2853                             'format_id': format_id,
2854                             'manifest_url': mpd_url,
2855                             'ext': mimetype2ext(mime_type),
2856                             'width': int_or_none(representation_attrib.get('width')),
2857                             'height': int_or_none(representation_attrib.get('height')),
2858                             'tbr': float_or_none(bandwidth, 1000),
2859                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2860                             'fps': int_or_none(representation_attrib.get('frameRate')),
2861                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2862                             'format_note': 'DASH %s' % content_type,
2863                             'filesize': filesize,
2864                             'container': mimetype2ext(mime_type) + '_dash',
2865                             **codecs
2866                         }
2867                     elif content_type == 'text':
2868                         f = {
2869                             'ext': mimetype2ext(mime_type),
2870                             'manifest_url': mpd_url,
2871                             'filesize': filesize,
2872                         }
2873                     elif content_type == 'image/jpeg':
2874                         # See test case in VikiIE
2875                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2876                         f = {
2877                             'format_id': format_id,
2878                             'ext': 'mhtml',
2879                             'manifest_url': mpd_url,
2880                             'format_note': 'DASH storyboards (jpeg)',
2881                             'acodec': 'none',
2882                             'vcodec': 'none',
2883                         }
2884                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2885                         f['has_drm'] = True
2886                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2887
2888                     def prepare_template(template_name, identifiers):
2889                         tmpl = representation_ms_info[template_name]
2890                         # First of, % characters outside $...$ templates
2891                         # must be escaped by doubling for proper processing
2892                         # by % operator string formatting used further (see
2893                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2894                         t = ''
2895                         in_template = False
2896                         for c in tmpl:
2897                             t += c
2898                             if c == '$':
2899                                 in_template = not in_template
2900                             elif c == '%' and not in_template:
2901                                 t += c
2902                         # Next, $...$ templates are translated to their
2903                         # %(...) counterparts to be used with % operator
2904                         if representation_id is not None:
2905                             t = t.replace('$RepresentationID$', representation_id)
2906                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2907                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2908                         t.replace('$$', '$')
2909                         return t
2910
2911                     # @initialization is a regular template like @media one
2912                     # so it should be handled just the same way (see
2913                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2914                     if 'initialization' in representation_ms_info:
2915                         initialization_template = prepare_template(
2916                             'initialization',
2917                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2918                             # $Time$ shall not be included for @initialization thus
2919                             # only $Bandwidth$ remains
2920                             ('Bandwidth', ))
2921                         representation_ms_info['initialization_url'] = initialization_template % {
2922                             'Bandwidth': bandwidth,
2923                         }
2924
2925                     def location_key(location):
2926                         return 'url' if re.match(r'^https?://', location) else 'path'
2927
2928                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2929
2930                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2931                         media_location_key = location_key(media_template)
2932
2933                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2934                         # can't be used at the same time
2935                         if '%(Number' in media_template and 's' not in representation_ms_info:
2936                             segment_duration = None
2937                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2938                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2939                                 representation_ms_info['total_number'] = int(math.ceil(
2940                                     float_or_none(period_duration, segment_duration, default=0)))
2941                             representation_ms_info['fragments'] = [{
2942                                 media_location_key: media_template % {
2943                                     'Number': segment_number,
2944                                     'Bandwidth': bandwidth,
2945                                 },
2946                                 'duration': segment_duration,
2947                             } for segment_number in range(
2948                                 representation_ms_info['start_number'],
2949                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2950                         else:
2951                             # $Number*$ or $Time$ in media template with S list available
2952                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2953                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2954                             representation_ms_info['fragments'] = []
2955                             segment_time = 0
2956                             segment_d = None
2957                             segment_number = representation_ms_info['start_number']
2958
2959                             def add_segment_url():
2960                                 segment_url = media_template % {
2961                                     'Time': segment_time,
2962                                     'Bandwidth': bandwidth,
2963                                     'Number': segment_number,
2964                                 }
2965                                 representation_ms_info['fragments'].append({
2966                                     media_location_key: segment_url,
2967                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2968                                 })
2969
2970                             for num, s in enumerate(representation_ms_info['s']):
2971                                 segment_time = s.get('t') or segment_time
2972                                 segment_d = s['d']
2973                                 add_segment_url()
2974                                 segment_number += 1
2975                                 for r in range(s.get('r', 0)):
2976                                     segment_time += segment_d
2977                                     add_segment_url()
2978                                     segment_number += 1
2979                                 segment_time += segment_d
2980                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2981                         # No media template
2982                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2983                         # or any YouTube dashsegments video
2984                         fragments = []
2985                         segment_index = 0
2986                         timescale = representation_ms_info['timescale']
2987                         for s in representation_ms_info['s']:
2988                             duration = float_or_none(s['d'], timescale)
2989                             for r in range(s.get('r', 0) + 1):
2990                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2991                                 fragments.append({
2992                                     location_key(segment_uri): segment_uri,
2993                                     'duration': duration,
2994                                 })
2995                                 segment_index += 1
2996                         representation_ms_info['fragments'] = fragments
2997                     elif 'segment_urls' in representation_ms_info:
2998                         # Segment URLs with no SegmentTimeline
2999                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3000                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3001                         fragments = []
3002                         segment_duration = float_or_none(
3003                             representation_ms_info['segment_duration'],
3004                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3005                         for segment_url in representation_ms_info['segment_urls']:
3006                             fragment = {
3007                                 location_key(segment_url): segment_url,
3008                             }
3009                             if segment_duration:
3010                                 fragment['duration'] = segment_duration
3011                             fragments.append(fragment)
3012                         representation_ms_info['fragments'] = fragments
3013                     # If there is a fragments key available then we correctly recognized fragmented media.
3014                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3015                     # assumption is not necessarily correct since we may simply have no support for
3016                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3017                     if 'fragments' in representation_ms_info:
3018                         f.update({
3019                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3020                             'url': mpd_url or base_url,
3021                             'fragment_base_url': base_url,
3022                             'fragments': [],
3023                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3024                         })
3025                         if 'initialization_url' in representation_ms_info:
3026                             initialization_url = representation_ms_info['initialization_url']
3027                             if not f.get('url'):
3028                                 f['url'] = initialization_url
3029                             f['fragments'].append({location_key(initialization_url): initialization_url})
3030                         f['fragments'].extend(representation_ms_info['fragments'])
3031                         if not period_duration:
3032                             period_duration = try_get(
3033                                 representation_ms_info,
3034                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3035                     else:
3036                         # Assuming direct URL to unfragmented media.
3037                         f['url'] = base_url
3038                     if content_type in ('video', 'audio', 'image/jpeg'):
3039                         f['manifest_stream_number'] = stream_numbers[f['url']]
3040                         stream_numbers[f['url']] += 1
3041                         formats.append(f)
3042                     elif content_type == 'text':
3043                         subtitles.setdefault(lang or 'und', []).append(f)
3044
3045         return formats, subtitles
3046
3047     def _extract_ism_formats(self, *args, **kwargs):
3048         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3049         if subs:
3050             self._report_ignoring_subs('ISM')
3051         return fmts
3052
3053     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3054         res = self._download_xml_handle(
3055             ism_url, video_id,
3056             note='Downloading ISM manifest' if note is None else note,
3057             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3058             fatal=fatal, data=data, headers=headers, query=query)
3059         if res is False:
3060             return [], {}
3061         ism_doc, urlh = res
3062         if ism_doc is None:
3063             return [], {}
3064
3065         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3066
3067     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3068         """
3069         Parse formats from ISM manifest.
3070         References:
3071          1. [MS-SSTR]: Smooth Streaming Protocol,
3072             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3073         """
3074         if ism_doc.get('IsLive') == 'TRUE':
3075             return [], {}
3076
3077         duration = int(ism_doc.attrib['Duration'])
3078         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3079
3080         formats = []
3081         subtitles = {}
3082         for stream in ism_doc.findall('StreamIndex'):
3083             stream_type = stream.get('Type')
3084             if stream_type not in ('video', 'audio', 'text'):
3085                 continue
3086             url_pattern = stream.attrib['Url']
3087             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3088             stream_name = stream.get('Name')
3089             stream_language = stream.get('Language', 'und')
3090             for track in stream.findall('QualityLevel'):
3091                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3092                 # TODO: add support for WVC1 and WMAP
3093                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3094                     self.report_warning('%s is not a supported codec' % fourcc)
3095                     continue
3096                 tbr = int(track.attrib['Bitrate']) // 1000
3097                 # [1] does not mention Width and Height attributes. However,
3098                 # they're often present while MaxWidth and MaxHeight are
3099                 # missing, so should be used as fallbacks
3100                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3101                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3102                 sampling_rate = int_or_none(track.get('SamplingRate'))
3103
3104                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3105                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3106
3107                 fragments = []
3108                 fragment_ctx = {
3109                     'time': 0,
3110                 }
3111                 stream_fragments = stream.findall('c')
3112                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3113                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3114                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3115                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3116                     if not fragment_ctx['duration']:
3117                         try:
3118                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3119                         except IndexError:
3120                             next_fragment_time = duration
3121                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3122                     for _ in range(fragment_repeat):
3123                         fragments.append({
3124                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3125                             'duration': fragment_ctx['duration'] / stream_timescale,
3126                         })
3127                         fragment_ctx['time'] += fragment_ctx['duration']
3128
3129                 if stream_type == 'text':
3130                     subtitles.setdefault(stream_language, []).append({
3131                         'ext': 'ismt',
3132                         'protocol': 'ism',
3133                         'url': ism_url,
3134                         'manifest_url': ism_url,
3135                         'fragments': fragments,
3136                         '_download_params': {
3137                             'stream_type': stream_type,
3138                             'duration': duration,
3139                             'timescale': stream_timescale,
3140                             'fourcc': fourcc,
3141                             'language': stream_language,
3142                             'codec_private_data': track.get('CodecPrivateData'),
3143                         }
3144                     })
3145                 elif stream_type in ('video', 'audio'):
3146                     formats.append({
3147                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3148                         'url': ism_url,
3149                         'manifest_url': ism_url,
3150                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3151                         'width': width,
3152                         'height': height,
3153                         'tbr': tbr,
3154                         'asr': sampling_rate,
3155                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3156                         'acodec': 'none' if stream_type == 'video' else fourcc,
3157                         'protocol': 'ism',
3158                         'fragments': fragments,
3159                         'has_drm': ism_doc.find('Protection') is not None,
3160                         '_download_params': {
3161                             'stream_type': stream_type,
3162                             'duration': duration,
3163                             'timescale': stream_timescale,
3164                             'width': width or 0,
3165                             'height': height or 0,
3166                             'fourcc': fourcc,
3167                             'language': stream_language,
3168                             'codec_private_data': track.get('CodecPrivateData'),
3169                             'sampling_rate': sampling_rate,
3170                             'channels': int_or_none(track.get('Channels', 2)),
3171                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3172                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3173                         },
3174                     })
3175         return formats, subtitles
3176
3177     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3178         def absolute_url(item_url):
3179             return urljoin(base_url, item_url)
3180
3181         def parse_content_type(content_type):
3182             if not content_type:
3183                 return {}
3184             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3185             if ctr:
3186                 mimetype, codecs = ctr.groups()
3187                 f = parse_codecs(codecs)
3188                 f['ext'] = mimetype2ext(mimetype)
3189                 return f
3190             return {}
3191
3192         def _media_formats(src, cur_media_type, type_info=None):
3193             type_info = type_info or {}
3194             full_url = absolute_url(src)
3195             ext = type_info.get('ext') or determine_ext(full_url)
3196             if ext == 'm3u8':
3197                 is_plain_url = False
3198                 formats = self._extract_m3u8_formats(
3199                     full_url, video_id, ext='mp4',
3200                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3201                     preference=preference, quality=quality, fatal=False)
3202             elif ext == 'mpd':
3203                 is_plain_url = False
3204                 formats = self._extract_mpd_formats(
3205                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3206             else:
3207                 is_plain_url = True
3208                 formats = [{
3209                     'url': full_url,
3210                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3211                     'ext': ext,
3212                 }]
3213             return is_plain_url, formats
3214
3215         entries = []
3216         # amp-video and amp-audio are very similar to their HTML5 counterparts
3217         # so we wll include them right here (see
3218         # https://www.ampproject.org/docs/reference/components/amp-video)
3219         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3220         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3221         media_tags = [(media_tag, media_tag_name, media_type, '')
3222                       for media_tag, media_tag_name, media_type
3223                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3224         media_tags.extend(re.findall(
3225             # We only allow video|audio followed by a whitespace or '>'.
3226             # Allowing more characters may end up in significant slow down (see
3227             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3228             # http://www.porntrex.com/maps/videositemap.xml).
3229             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3230         for media_tag, _, media_type, media_content in media_tags:
3231             media_info = {
3232                 'formats': [],
3233                 'subtitles': {},
3234             }
3235             media_attributes = extract_attributes(media_tag)
3236             src = strip_or_none(media_attributes.get('src'))
3237             if src:
3238                 f = parse_content_type(media_attributes.get('type'))
3239                 _, formats = _media_formats(src, media_type, f)
3240                 media_info['formats'].extend(formats)
3241             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3242             if media_content:
3243                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3244                     s_attr = extract_attributes(source_tag)
3245                     # data-video-src and data-src are non standard but seen
3246                     # several times in the wild
3247                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3248                     if not src:
3249                         continue
3250                     f = parse_content_type(s_attr.get('type'))
3251                     is_plain_url, formats = _media_formats(src, media_type, f)
3252                     if is_plain_url:
3253                         # width, height, res, label and title attributes are
3254                         # all not standard but seen several times in the wild
3255                         labels = [
3256                             s_attr.get(lbl)
3257                             for lbl in ('label', 'title')
3258                             if str_or_none(s_attr.get(lbl))
3259                         ]
3260                         width = int_or_none(s_attr.get('width'))
3261                         height = (int_or_none(s_attr.get('height'))
3262                                   or int_or_none(s_attr.get('res')))
3263                         if not width or not height:
3264                             for lbl in labels:
3265                                 resolution = parse_resolution(lbl)
3266                                 if not resolution:
3267                                     continue
3268                                 width = width or resolution.get('width')
3269                                 height = height or resolution.get('height')
3270                         for lbl in labels:
3271                             tbr = parse_bitrate(lbl)
3272                             if tbr:
3273                                 break
3274                         else:
3275                             tbr = None
3276                         f.update({
3277                             'width': width,
3278                             'height': height,
3279                             'tbr': tbr,
3280                             'format_id': s_attr.get('label') or s_attr.get('title'),
3281                         })
3282                         f.update(formats[0])
3283                         media_info['formats'].append(f)
3284                     else:
3285                         media_info['formats'].extend(formats)
3286                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3287                     track_attributes = extract_attributes(track_tag)
3288                     kind = track_attributes.get('kind')
3289                     if not kind or kind in ('subtitles', 'captions'):
3290                         src = strip_or_none(track_attributes.get('src'))
3291                         if not src:
3292                             continue
3293                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3294                         media_info['subtitles'].setdefault(lang, []).append({
3295                             'url': absolute_url(src),
3296                         })
3297             for f in media_info['formats']:
3298                 f.setdefault('http_headers', {})['Referer'] = base_url
3299             if media_info['formats'] or media_info['subtitles']:
3300                 entries.append(media_info)
3301         return entries
3302
3303     def _extract_akamai_formats(self, *args, **kwargs):
3304         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3305         if subs:
3306             self._report_ignoring_subs('akamai')
3307         return fmts
3308
3309     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3310         signed = 'hdnea=' in manifest_url
3311         if not signed:
3312             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3313             manifest_url = re.sub(
3314                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3315                 '', manifest_url).strip('?')
3316
3317         formats = []
3318         subtitles = {}
3319
3320         hdcore_sign = 'hdcore=3.7.0'
3321         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3322         hds_host = hosts.get('hds')
3323         if hds_host:
3324             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3325         if 'hdcore=' not in f4m_url:
3326             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3327         f4m_formats = self._extract_f4m_formats(
3328             f4m_url, video_id, f4m_id='hds', fatal=False)
3329         for entry in f4m_formats:
3330             entry.update({'extra_param_to_segment_url': hdcore_sign})
3331         formats.extend(f4m_formats)
3332
3333         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3334         hls_host = hosts.get('hls')
3335         if hls_host:
3336             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3337         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3338             m3u8_url, video_id, 'mp4', 'm3u8_native',
3339             m3u8_id='hls', fatal=False)
3340         formats.extend(m3u8_formats)
3341         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3342
3343         http_host = hosts.get('http')
3344         if http_host and m3u8_formats and not signed:
3345             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3346             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3347             qualities_length = len(qualities)
3348             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3349                 i = 0
3350                 for f in m3u8_formats:
3351                     if f['vcodec'] != 'none':
3352                         for protocol in ('http', 'https'):
3353                             http_f = f.copy()
3354                             del http_f['manifest_url']
3355                             http_url = re.sub(
3356                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3357                             http_f.update({
3358                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3359                                 'url': http_url,
3360                                 'protocol': protocol,
3361                             })
3362                             formats.append(http_f)
3363                         i += 1
3364
3365         return formats, subtitles
3366
3367     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3368         query = compat_urlparse.urlparse(url).query
3369         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3370         mobj = re.search(
3371             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3372         url_base = mobj.group('url')
3373         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3374         formats = []
3375
3376         def manifest_url(manifest):
3377             m_url = f'{http_base_url}/{manifest}'
3378             if query:
3379                 m_url += '?%s' % query
3380             return m_url
3381
3382         if 'm3u8' not in skip_protocols:
3383             formats.extend(self._extract_m3u8_formats(
3384                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3385                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3386         if 'f4m' not in skip_protocols:
3387             formats.extend(self._extract_f4m_formats(
3388                 manifest_url('manifest.f4m'),
3389                 video_id, f4m_id='hds', fatal=False))
3390         if 'dash' not in skip_protocols:
3391             formats.extend(self._extract_mpd_formats(
3392                 manifest_url('manifest.mpd'),
3393                 video_id, mpd_id='dash', fatal=False))
3394         if re.search(r'(?:/smil:|\.smil)', url_base):
3395             if 'smil' not in skip_protocols:
3396                 rtmp_formats = self._extract_smil_formats(
3397                     manifest_url('jwplayer.smil'),
3398                     video_id, fatal=False)
3399                 for rtmp_format in rtmp_formats:
3400                     rtsp_format = rtmp_format.copy()
3401                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3402                     del rtsp_format['play_path']
3403                     del rtsp_format['ext']
3404                     rtsp_format.update({
3405                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3406                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3407                         'protocol': 'rtsp',
3408                     })
3409                     formats.extend([rtmp_format, rtsp_format])
3410         else:
3411             for protocol in ('rtmp', 'rtsp'):
3412                 if protocol not in skip_protocols:
3413                     formats.append({
3414                         'url': f'{protocol}:{url_base}',
3415                         'format_id': protocol,
3416                         'protocol': protocol,
3417                     })
3418         return formats
3419
3420     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3421         mobj = re.search(
3422             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3423             webpage)
3424         if mobj:
3425             try:
3426                 jwplayer_data = self._parse_json(mobj.group('options'),
3427                                                  video_id=video_id,
3428                                                  transform_source=transform_source)
3429             except ExtractorError:
3430                 pass
3431             else:
3432                 if isinstance(jwplayer_data, dict):
3433                     return jwplayer_data
3434
3435     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3436         jwplayer_data = self._find_jwplayer_data(
3437             webpage, video_id, transform_source=js_to_json)
3438         return self._parse_jwplayer_data(
3439             jwplayer_data, video_id, *args, **kwargs)
3440
3441     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3442                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3443         # JWPlayer backward compatibility: flattened playlists
3444         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3445         if 'playlist' not in jwplayer_data:
3446             jwplayer_data = {'playlist': [jwplayer_data]}
3447
3448         entries = []
3449
3450         # JWPlayer backward compatibility: single playlist item
3451         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3452         if not isinstance(jwplayer_data['playlist'], list):
3453             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3454
3455         for video_data in jwplayer_data['playlist']:
3456             # JWPlayer backward compatibility: flattened sources
3457             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3458             if 'sources' not in video_data:
3459                 video_data['sources'] = [video_data]
3460
3461             this_video_id = video_id or video_data['mediaid']
3462
3463             formats = self._parse_jwplayer_formats(
3464                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3465                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3466
3467             subtitles = {}
3468             tracks = video_data.get('tracks')
3469             if tracks and isinstance(tracks, list):
3470                 for track in tracks:
3471                     if not isinstance(track, dict):
3472                         continue
3473                     track_kind = track.get('kind')
3474                     if not track_kind or not isinstance(track_kind, compat_str):
3475                         continue
3476                     if track_kind.lower() not in ('captions', 'subtitles'):
3477                         continue
3478                     track_url = urljoin(base_url, track.get('file'))
3479                     if not track_url:
3480                         continue
3481                     subtitles.setdefault(track.get('label') or 'en', []).append({
3482                         'url': self._proto_relative_url(track_url)
3483                     })
3484
3485             entry = {
3486                 'id': this_video_id,
3487                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3488                 'description': clean_html(video_data.get('description')),
3489                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3490                 'timestamp': int_or_none(video_data.get('pubdate')),
3491                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3492                 'subtitles': subtitles,
3493             }
3494             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3495             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3496                 entry.update({
3497                     '_type': 'url_transparent',
3498                     'url': formats[0]['url'],
3499                 })
3500             else:
3501                 self._sort_formats(formats)
3502                 entry['formats'] = formats
3503             entries.append(entry)
3504         if len(entries) == 1:
3505             return entries[0]
3506         else:
3507             return self.playlist_result(entries)
3508
3509     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3510                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3511         urls = []
3512         formats = []
3513         for source in jwplayer_sources_data:
3514             if not isinstance(source, dict):
3515                 continue
3516             source_url = urljoin(
3517                 base_url, self._proto_relative_url(source.get('file')))
3518             if not source_url or source_url in urls:
3519                 continue
3520             urls.append(source_url)
3521             source_type = source.get('type') or ''
3522             ext = mimetype2ext(source_type) or determine_ext(source_url)
3523             if source_type == 'hls' or ext == 'm3u8':
3524                 formats.extend(self._extract_m3u8_formats(
3525                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3526                     m3u8_id=m3u8_id, fatal=False))
3527             elif source_type == 'dash' or ext == 'mpd':
3528                 formats.extend(self._extract_mpd_formats(
3529                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3530             elif ext == 'smil':
3531                 formats.extend(self._extract_smil_formats(
3532                     source_url, video_id, fatal=False))
3533             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3534             elif source_type.startswith('audio') or ext in (
3535                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3536                 formats.append({
3537                     'url': source_url,
3538                     'vcodec': 'none',
3539                     'ext': ext,
3540                 })
3541             else:
3542                 height = int_or_none(source.get('height'))
3543                 if height is None:
3544                     # Often no height is provided but there is a label in
3545                     # format like "1080p", "720p SD", or 1080.
3546                     height = int_or_none(self._search_regex(
3547                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3548                         'height', default=None))
3549                 a_format = {
3550                     'url': source_url,
3551                     'width': int_or_none(source.get('width')),
3552                     'height': height,
3553                     'tbr': int_or_none(source.get('bitrate')),
3554                     'ext': ext,
3555                 }
3556                 if source_url.startswith('rtmp'):
3557                     a_format['ext'] = 'flv'
3558                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3559                     # of jwplayer.flash.swf
3560                     rtmp_url_parts = re.split(
3561                         r'((?:mp4|mp3|flv):)', source_url, 1)
3562                     if len(rtmp_url_parts) == 3:
3563                         rtmp_url, prefix, play_path = rtmp_url_parts
3564                         a_format.update({
3565                             'url': rtmp_url,
3566                             'play_path': prefix + play_path,
3567                         })
3568                     if rtmp_params:
3569                         a_format.update(rtmp_params)
3570                 formats.append(a_format)
3571         return formats
3572
3573     def _live_title(self, name):
3574         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3575         return name
3576
3577     def _int(self, v, name, fatal=False, **kwargs):
3578         res = int_or_none(v, **kwargs)
3579         if res is None:
3580             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3581             if fatal:
3582                 raise ExtractorError(msg)
3583             else:
3584                 self.report_warning(msg)
3585         return res
3586
3587     def _float(self, v, name, fatal=False, **kwargs):
3588         res = float_or_none(v, **kwargs)
3589         if res is None:
3590             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3591             if fatal:
3592                 raise ExtractorError(msg)
3593             else:
3594                 self.report_warning(msg)
3595         return res
3596
3597     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3598                     path='/', secure=False, discard=False, rest={}, **kwargs):
3599         cookie = http.cookiejar.Cookie(
3600             0, name, value, port, port is not None, domain, True,
3601             domain.startswith('.'), path, True, secure, expire_time,
3602             discard, None, None, rest)
3603         self.cookiejar.set_cookie(cookie)
3604
3605     def _get_cookies(self, url):
3606         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3607         return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
3608
3609     def _apply_first_set_cookie_header(self, url_handle, cookie):
3610         """
3611         Apply first Set-Cookie header instead of the last. Experimental.
3612
3613         Some sites (e.g. [1-3]) may serve two cookies under the same name
3614         in Set-Cookie header and expect the first (old) one to be set rather
3615         than second (new). However, as of RFC6265 the newer one cookie
3616         should be set into cookie store what actually happens.
3617         We will workaround this issue by resetting the cookie to
3618         the first one manually.
3619         1. https://new.vk.com/
3620         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3621         3. https://learning.oreilly.com/
3622         """
3623         for header, cookies in url_handle.headers.items():
3624             if header.lower() != 'set-cookie':
3625                 continue
3626             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3627             cookie_value = re.search(
3628                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3629             if cookie_value:
3630                 value, domain = cookie_value.groups()
3631                 self._set_cookie(domain, cookie, value)
3632                 break
3633
3634     @classmethod
3635     def get_testcases(cls, include_onlymatching=False):
3636         t = getattr(cls, '_TEST', None)
3637         if t:
3638             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3639             tests = [t]
3640         else:
3641             tests = getattr(cls, '_TESTS', [])
3642         for t in tests:
3643             if not include_onlymatching and t.get('only_matching', False):
3644                 continue
3645             t['name'] = cls.ie_key()
3646             yield t
3647
3648     @classproperty
3649     def age_limit(cls):
3650         """Get age limit from the testcases"""
3651         return max(traverse_obj(
3652             tuple(cls.get_testcases(include_onlymatching=False)),
3653             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3654
3655     @classmethod
3656     def is_suitable(cls, age_limit):
3657         """Test whether the extractor is generally suitable for the given age limit"""
3658         return not age_restricted(cls.age_limit, age_limit)
3659
3660     @classmethod
3661     def description(cls, *, markdown=True, search_examples=None):
3662         """Description of the extractor"""
3663         desc = ''
3664         if cls._NETRC_MACHINE:
3665             if markdown:
3666                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3667             else:
3668                 desc += f' [{cls._NETRC_MACHINE}]'
3669         if cls.IE_DESC is False:
3670             desc += ' [HIDDEN]'
3671         elif cls.IE_DESC:
3672             desc += f' {cls.IE_DESC}'
3673         if cls.SEARCH_KEY:
3674             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3675             if search_examples:
3676                 _COUNTS = ('', '5', '10', 'all')
3677                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3678         if not cls.working():
3679             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3680
3681         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3682         return f'{name}:{desc}' if desc else name
3683
3684     def extract_subtitles(self, *args, **kwargs):
3685         if (self.get_param('writesubtitles', False)
3686                 or self.get_param('listsubtitles')):
3687             return self._get_subtitles(*args, **kwargs)
3688         return {}
3689
3690     def _get_subtitles(self, *args, **kwargs):
3691         raise NotImplementedError('This method must be implemented by subclasses')
3692
3693     def extract_comments(self, *args, **kwargs):
3694         if not self.get_param('getcomments'):
3695             return None
3696         generator = self._get_comments(*args, **kwargs)
3697
3698         def extractor():
3699             comments = []
3700             interrupted = True
3701             try:
3702                 while True:
3703                     comments.append(next(generator))
3704             except StopIteration:
3705                 interrupted = False
3706             except KeyboardInterrupt:
3707                 self.to_screen('Interrupted by user')
3708             except Exception as e:
3709                 if self.get_param('ignoreerrors') is not True:
3710                     raise
3711                 self._downloader.report_error(e)
3712             comment_count = len(comments)
3713             self.to_screen(f'Extracted {comment_count} comments')
3714             return {
3715                 'comments': comments,
3716                 'comment_count': None if interrupted else comment_count
3717             }
3718         return extractor
3719
3720     def _get_comments(self, *args, **kwargs):
3721         raise NotImplementedError('This method must be implemented by subclasses')
3722
3723     @staticmethod
3724     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3725         """ Merge subtitle items for one language. Items with duplicated URLs/data
3726         will be dropped. """
3727         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3728         ret = list(subtitle_list1)
3729         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3730         return ret
3731
3732     @classmethod
3733     def _merge_subtitles(cls, *dicts, target=None):
3734         """ Merge subtitle dictionaries, language by language. """
3735         if target is None:
3736             target = {}
3737         for d in dicts:
3738             for lang, subs in d.items():
3739                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3740         return target
3741
3742     def extract_automatic_captions(self, *args, **kwargs):
3743         if (self.get_param('writeautomaticsub', False)
3744                 or self.get_param('listsubtitles')):
3745             return self._get_automatic_captions(*args, **kwargs)
3746         return {}
3747
3748     def _get_automatic_captions(self, *args, **kwargs):
3749         raise NotImplementedError('This method must be implemented by subclasses')
3750
3751     @functools.cached_property
3752     def _cookies_passed(self):
3753         """Whether cookies have been passed to YoutubeDL"""
3754         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3755
3756     def mark_watched(self, *args, **kwargs):
3757         if not self.get_param('mark_watched', False):
3758             return
3759         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3760             self._mark_watched(*args, **kwargs)
3761
3762     def _mark_watched(self, *args, **kwargs):
3763         raise NotImplementedError('This method must be implemented by subclasses')
3764
3765     def geo_verification_headers(self):
3766         headers = {}
3767         geo_verification_proxy = self.get_param('geo_verification_proxy')
3768         if geo_verification_proxy:
3769             headers['Ytdl-request-proxy'] = geo_verification_proxy
3770         return headers
3771
3772     def _generic_id(self, url):
3773         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3774
3775     def _generic_title(self, url):
3776         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3777
3778     @staticmethod
3779     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3780         all_known = all(map(
3781             lambda x: x is not None,
3782             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3783         return (
3784             'private' if is_private
3785             else 'premium_only' if needs_premium
3786             else 'subscriber_only' if needs_subscription
3787             else 'needs_auth' if needs_auth
3788             else 'unlisted' if is_unlisted
3789             else 'public' if all_known
3790             else None)
3791
3792     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3793         '''
3794         @returns            A list of values for the extractor argument given by "key"
3795                             or "default" if no such key is present
3796         @param default      The default value to return when the key is not present (default: [])
3797         @param casesense    When false, the values are converted to lower case
3798         '''
3799         val = traverse_obj(
3800             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3801         if val is None:
3802             return [] if default is NO_DEFAULT else default
3803         return list(val) if casesense else [x.lower() for x in val]
3804
3805     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3806         if not playlist_id or not video_id:
3807             return not video_id
3808
3809         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3810         if no_playlist is not None:
3811             return not no_playlist
3812
3813         video_id = '' if video_id is True else f' {video_id}'
3814         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3815         if self.get_param('noplaylist'):
3816             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3817             return False
3818         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3819         return True
3820
3821
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept queries of the form _SEARCH_KEY[N|all]:{query}, where N is a
    positive number of results; without N or "all", one result is fetched.
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Build the regex matching this extractor's search queries"""
        return rf'{cls._SEARCH_KEY}(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'

    def _real_extract(self, query):
        """Split the query into its count prefix and search terms and fetch the results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Clamp the request to what the site can deliver
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # An infinite count means "do not truncate"
        stop = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, stop)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError(
            'This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for _SEARCH_KEY"""
        return cls._SEARCH_KEY