yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import itertools
   9 import json
  10 import math
  11 import netrc
  12 import os
  13 import random
  14 import re
  15 import sys
  16 import time
  17 import types
  18 import urllib.parse
  19 import urllib.request
  20 import xml.etree.ElementTree
  21
  22 from ..compat import functools  # isort: split
  23 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  24 from ..downloader import FileDownloader
  25 from ..downloader.f4m import get_base_url, remove_encrypted_media
  26 from ..utils import (
  27     IDENTITY,
  28     JSON_LD_RE,
  29     NO_DEFAULT,
  30     ExtractorError,
  31     GeoRestrictedError,
  32     GeoUtils,
  33     LenientJSONDecoder,
  34     RegexNotFoundError,
  35     UnsupportedError,
  36     age_restricted,
  37     base_url,
  38     bug_reports_message,
  39     classproperty,
  40     clean_html,
  41     determine_ext,
  42     determine_protocol,
  43     dict_get,
  44     encode_data_uri,
  45     error_to_compat_str,
  46     extract_attributes,
  47     filter_dict,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     format_field,
  51     int_or_none,
  52     join_nonempty,
  53     js_to_json,
  54     mimetype2ext,
  55     network_exceptions,
  56     orderedSet,
  57     parse_bitrate,
  58     parse_codecs,
  59     parse_duration,
  60     parse_iso8601,
  61     parse_m3u8_attributes,
  62     parse_resolution,
  63     sanitize_filename,
  64     sanitize_url,
  65     sanitized_Request,
  66     str_or_none,
  67     str_to_int,
  68     strip_or_none,
  69     traverse_obj,
  70     try_call,
  71     try_get,
  72     unescapeHTML,
  73     unified_strdate,
  74     unified_timestamp,
  75     update_Request,
  76     update_url_query,
  77     url_basename,
  78     url_or_none,
  79     urljoin,
  80     variadic,
  81     xpath_element,
  82     xpath_text,
  83     xpath_with_ns,
  84 )
  85
  86
  87 class InfoExtractor:
  88     """Information Extractor class.
  89
  90     Information extractors are the classes that, given a URL, extract
  91     information about the video (or videos) the URL refers to. This
  92     information includes the real video URL, the video title, author and
  93     others. The information is stored in a dictionary which is then
  94     passed to the YoutubeDL. The YoutubeDL processes this
  95     information possibly downloading the video to the file system, among
  96     other possible outcomes.
  97
  98     The type field determines the type of the result.
  99     By far the most common value (and the default if _type is missing) is
 100     "video", which indicates a single video.
 101
 102     For a video, the dictionaries must include the following fields:
 103
 104     id:             Video identifier.
 105     title:          Video title, unescaped. Set to an empty string if video has
 106                     no title as opposed to "None" which signifies that the
 107                     extractor failed to obtain a title
 108
 109     Additionally, it must contain either a formats entry or a url one:
 110
 111     formats:        A list of dictionaries for each format available, ordered
 112                     from worst to best quality.
 113
 114                     Potential fields:
 115                     * url        The mandatory URL representing the media:
 116                                    for plain file media - HTTP URL of this file,
 117                                    for RTMP - RTMP URL,
 118                                    for HLS - URL of the M3U8 media playlist,
 119                                    for HDS - URL of the F4M manifest,
 120                                    for DASH
 121                                      - HTTP URL to plain file media (in case of
 122                                        unfragmented media)
 123                                      - URL of the MPD manifest or base URL
 124                                        representing the media if MPD manifest
 125                                        is parsed from a string (in case of
 126                                        fragmented media)
 127                                    for MSS - URL of the ISM manifest.
 128                     * manifest_url
 129                                  The URL of the manifest file in case of
 130                                  fragmented media:
 131                                    for HLS - URL of the M3U8 master playlist,
 132                                    for HDS - URL of the F4M manifest,
 133                                    for DASH - URL of the MPD manifest,
 134                                    for MSS - URL of the ISM manifest.
 135                     * manifest_stream_number  (For internal use only)
 136                                  The index of the stream in the manifest file
 137                     * ext        Will be calculated from URL if missing
 138                     * format     A human-readable description of the format
 139                                  ("mp4 container with h264/opus").
 140                                  Calculated from the format_id, width, height.
 141                                  and format_note fields if missing.
 142                     * format_id  A short description of the format
 143                                  ("mp4_h264_opus" or "19").
 144                                 Technically optional, but strongly recommended.
 145                     * format_note Additional info about the format
 146                                  ("3D" or "DASH video")
 147                     * width      Width of the video, if known
 148                     * height     Height of the video, if known
 149                     * resolution Textual description of width and height
 150                     * dynamic_range The dynamic range of the video. One of:
 151                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 152                     * tbr        Average bitrate of audio and video in KBit/s
 153                     * abr        Average audio bitrate in KBit/s
 154                     * acodec     Name of the audio codec in use
 155                     * asr        Audio sampling rate in Hertz
 156                     * vbr        Average video bitrate in KBit/s
 157                     * fps        Frame rate
 158                     * vcodec     Name of the video codec in use
 159                     * container  Name of the container format
 160                     * filesize   The number of bytes, if known in advance
 161                     * filesize_approx  An estimate for the number of bytes
 162                     * player_url SWF Player URL (used for rtmpdump).
 163                     * protocol   The protocol that will be used for the actual
 164                                  download, lower-case. One of "http", "https" or
 165                                  one of the protocols defined in downloader.PROTOCOL_MAP
 166                     * fragment_base_url
 167                                  Base URL for fragments. Each fragment's path
 168                                  value (if present) will be relative to
 169                                  this URL.
 170                     * fragments  A list of fragments of a fragmented media.
 171                                  Each fragment entry must contain either an url
 172                                  or a path. If an url is present it should be
 173                                  considered by a client. Otherwise both path and
 174                                  fragment_base_url must be present. Here is
 175                                  the list of all potential fields:
 176                                  * "url" - fragment's URL
 177                                  * "path" - fragment's path relative to
 178                                             fragment_base_url
 179                                  * "duration" (optional, int or float)
 180                                  * "filesize" (optional, int)
 181                     * is_from_start  Is a live format that can be downloaded
 182                                 from the start. Boolean
 183                     * preference Order number of this format. If this field is
 184                                  present and not None, the formats get sorted
 185                                  by this field, regardless of all other values.
 186                                  -1 for default (order by other properties),
 187                                  -2 or smaller for less than default.
 188                                  < -1000 to hide the format (if there is
 189                                     another one which is strictly better)
 190                     * language   Language code, e.g. "de" or "en-US".
 191                     * language_preference  Is this in the language mentioned in
 192                                  the URL?
 193                                  10 if it's what the URL is about,
 194                                  -1 for default (don't know),
 195                                  -10 otherwise, other values reserved for now.
 196                     * quality    Order number of the video quality of this
 197                                  format, irrespective of the file format.
 198                                  -1 for default (order by other properties),
 199                                  -2 or smaller for less than default.
 200                     * source_preference  Order number for this video source
 201                                   (quality takes higher priority)
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                     * http_headers  A dictionary of additional HTTP headers
 205                                  to add to the request.
 206                     * stretched_ratio  If given and not 1, indicates that the
 207                                  video's pixels are not square.
 208                                  width : height ratio as float.
 209                     * no_resume  The server does not support resuming the
 210                                  (HTTP or RTMP) download. Boolean.
 211                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 212                     * downloader_options  A dictionary of downloader options
 213                                  (For internal use only)
 214                                  * http_chunk_size Chunk size for HTTP downloads
 215                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 216                     RTMP formats can also have the additional fields: page_url,
 217                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 218                     rtmp_protocol, rtmp_real_time
 219
 220     url:            Final video URL.
 221     ext:            Video filename extension.
 222     format:         The video format, defaults to ext (used for --get-format)
 223     player_url:     SWF Player URL (used for rtmpdump).
 224
 225     The following fields are optional:
 226
 227     direct:         True if a direct video file was given (must only be set by GenericIE)
 228     alt_title:      A secondary title of the video.
 229     display_id      An alternative identifier for the video, not necessarily
 230                     unique, but available before title. Typically, id is
 231                     something like "4234987", title "Dancing naked mole rats",
 232                     and display_id "dancing-naked-mole-rats"
 233     thumbnails:     A list of dictionaries, with the following entries:
 234                         * "id" (optional, string) - Thumbnail format ID
 235                         * "url"
 236                         * "preference" (optional, int) - quality of the image
 237                         * "width" (optional, int)
 238                         * "height" (optional, int)
 239                         * "resolution" (optional, string "{width}x{height}",
 240                                         deprecated)
 241                         * "filesize" (optional, int)
 242                         * "http_headers" (dict) - HTTP headers for the request
 243     thumbnail:      Full URL to a video thumbnail image.
 244     description:    Full video description.
 245     uploader:       Full name of the video uploader.
 246     license:        License name the video is licensed under.
 247     creator:        The creator of the video.
 248     timestamp:      UNIX timestamp of the moment the video was uploaded
 249     upload_date:    Video upload date in UTC (YYYYMMDD).
 250                     If not explicitly set, calculated from timestamp
 251     release_timestamp: UNIX timestamp of the moment the video was released.
 252                     If it is not clear whether to use timestamp or this, use the former
 253     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 254                     If not explicitly set, calculated from release_timestamp
 255     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 256     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 257                     If not explicitly set, calculated from modified_timestamp
 258     uploader_id:    Nickname or id of the video uploader.
 259     uploader_url:   Full URL to a personal webpage of the video uploader.
 260     channel:        Full name of the channel the video is uploaded on.
 261                     Note that channel fields may or may not repeat uploader
 262                     fields. This depends on a particular extractor.
 263     channel_id:     Id of the channel.
 264     channel_url:    Full URL to a channel webpage.
 265     channel_follower_count: Number of followers of the channel.
 266     location:       Physical location where the video was filmed.
 267     subtitles:      The available subtitles as a dictionary in the format
 268                     {tag: subformats}. "tag" is usually a language code, and
 269                     "subformats" is a list sorted from lower to higher
 270                     preference, each element is a dictionary with the "ext"
 271                     entry and one of:
 272                         * "data": The subtitles file contents
 273                         * "url": A URL pointing to the subtitles file
 274                     It can optionally also have:
 275                         * "name": Name or description of the subtitles
 276                         * "http_headers": A dictionary of additional HTTP headers
 277                                   to add to the request.
 278                     "ext" will be calculated from URL if missing
 279     automatic_captions: Like 'subtitles'; contains automatically generated
 280                     captions instead of normal subtitles
 281     duration:       Length of the video in seconds, as an integer or float.
 282     view_count:     How many users have watched the video on the platform.
 283     like_count:     Number of positive ratings of the video
 284     dislike_count:  Number of negative ratings of the video
 285     repost_count:   Number of reposts of the video
 286     average_rating: Average rating give by users, the scale used depends on the webpage
 287     comment_count:  Number of comments on the video
 288     comments:       A list of comments, each with one or more of the following
 289                     properties (all but one of text or html optional):
 290                         * "author" - human-readable name of the comment author
 291                         * "author_id" - user ID of the comment author
 292                         * "author_thumbnail" - The thumbnail of the comment author
 293                         * "id" - Comment ID
 294                         * "html" - Comment as HTML
 295                         * "text" - Plain text of the comment
 296                         * "timestamp" - UNIX timestamp of comment
 297                         * "parent" - ID of the comment this one is replying to.
 298                                      Set to "root" to indicate that this is a
 299                                      comment to the original video.
 300                         * "like_count" - Number of positive ratings of the comment
 301                         * "dislike_count" - Number of negative ratings of the comment
 302                         * "is_favorited" - Whether the comment is marked as
 303                                            favorite by the video uploader
 304                         * "author_is_uploader" - Whether the comment is made by
 305                                                  the video uploader
 306     age_limit:      Age restriction for the video, as an integer (years)
 307     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 308                     should allow to get the same result again. (It will be set
 309                     by YoutubeDL if it's missing)
 310     categories:     A list of categories that the video falls in, for example
 311                     ["Sports", "Berlin"]
 312     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 313     cast:           A list of the video cast
 314     is_live:        True, False, or None (=unknown). Whether this video is a
 315                     live stream that goes on instead of a fixed-length video.
 316     was_live:       True, False, or None (=unknown). Whether this video was
 317                     originally a live stream.
 318     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live'
 319                     or 'post_live' (was live, but VOD is not yet processed)
 320                     If absent, automatically set from is_live, was_live
 321     start_time:     Time in seconds where the reproduction should start, as
 322                     specified in the URL.
 323     end_time:       Time in seconds where the reproduction should end, as
 324                     specified in the URL.
 325     chapters:       A list of dictionaries, with the following entries:
 326                         * "start_time" - The start time of the chapter in seconds
 327                         * "end_time" - The end time of the chapter in seconds
 328                         * "title" (optional, string)
 329     playable_in_embed: Whether this video is allowed to play in embedded
 330                     players on other sites. Can be True (=always allowed),
 331                     False (=never allowed), None (=unknown), or a string
 332                     specifying the criteria for embedability (Eg: 'whitelist')
 333     availability:   Under what condition the video is available. One of
 334                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 335                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 336                     to set it
 337     _old_archive_ids: A list of old archive ids needed for backward compatibility
 338     __post_extractor: A function to be called just before the metadata is
 339                     written to either disk, logger or console. The function
 340                     must return a dict which will be added to the info_dict.
 341                     This is usefull for additional information that is
 342                     time-consuming to extract. Note that the fields thus
 343                     extracted will not be available to output template and
 344                     match_filter. So, only "comments" and "comment_count" are
 345                     currently allowed to be extracted via this method.
 346
 347     The following fields should only be used when the video belongs to some logical
 348     chapter or section:
 349
 350     chapter:        Name or title of the chapter the video belongs to.
 351     chapter_number: Number of the chapter the video belongs to, as an integer.
 352     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 353
 354     The following fields should only be used when the video is an episode of some
 355     series, programme or podcast:
 356
 357     series:         Title of the series or programme the video episode belongs to.
 358     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 359     season:         Title of the season the video episode belongs to.
 360     season_number:  Number of the season the video episode belongs to, as an integer.
 361     season_id:      Id of the season the video episode belongs to, as a unicode string.
 362     episode:        Title of the video episode. Unlike mandatory video title field,
 363                     this field should denote the exact title of the video episode
 364                     without any kind of decoration.
 365     episode_number: Number of the video episode within a season, as an integer.
 366     episode_id:     Id of the video episode, as a unicode string.
 367
 368     The following fields should only be used when the media is a track or a part of
 369     a music album:
 370
 371     track:          Title of the track.
 372     track_number:   Number of the track within an album or a disc, as an integer.
 373     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 374                     as a unicode string.
 375     artist:         Artist(s) of the track.
 376     genre:          Genre(s) of the track.
 377     album:          Title of the album the track belongs to.
 378     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 379     album_artist:   List of all artists appeared on the album (e.g.
 380                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 381                     and compilations).
 382     disc_number:    Number of the disc or other physical medium the track belongs to,
 383                     as an integer.
 384     release_year:   Year (YYYY) when the album was released.
 385     composer:       Composer of the piece
 386
 387     The following fields should only be set for clips that should be cut from the original video:
 388
 389     section_start:  Start time of the section in seconds
 390     section_end:    End time of the section in seconds
 391
 392     The following fields should only be set for storyboards:
 393     rows:           Number of rows in each storyboard fragment, as an integer
 394     columns:        Number of columns in each storyboard fragment, as an integer
 395
 396     Unless mentioned otherwise, the fields should be Unicode strings.
 397
 398     Unless mentioned otherwise, None is equivalent to absence of information.
 399
 400
 401     _type "playlist" indicates multiple videos.
 402     There must be a key "entries", which is a list, an iterable, or a PagedList
 403     object, each element of which is a valid dictionary by this specification.
 404
 405     Additionally, playlists can have "id", "title", and any other relevant
 406     attributes with the same semantics as videos (see above).
 407
 408     It can also have the following optional fields:
 409
 410     playlist_count: The total number of videos in a playlist. If not given,
 411                     YoutubeDL tries to calculate it from "entries"
 412
 413
 414     _type "multi_video" indicates that there are multiple videos that
 415     form a single show, for examples multiple acts of an opera or TV episode.
 416     It must have an entries key like a playlist and contain all the keys
 417     required for a video at the same time.
 418
 419
 420     _type "url" indicates that the video must be extracted from another
 421     location, possibly by a different extractor. Its only required key is:
 422     "url" - the next URL to extract.
 423     The key "ie_key" can be set to the class name (minus the trailing "IE",
 424     e.g. "Youtube") if the extractor class is known in advance.
 425     Additionally, the dictionary may have any properties of the resolved entity
 426     known in advance, for example "title" if the title of the referred video is
 427     known ahead of time.
 428
 429
 430     _type "url_transparent" entities have the same specification as "url", but
 431     indicate that the given additional information is more precise than the one
 432     associated with the resolved URL.
 433     This is useful when a site employs a video service that hosts the video and
 434     its technical metadata, but that video service does not embed a useful
 435     title, description etc.
 436
 437
 438     Subclasses of this should also be added to the list of extractors and
 439     should define a _VALID_URL regexp and, re-define the _real_extract() and
 440     (optionally) _real_initialize() methods.
 441
 442     Subclasses may also override suitable() if necessary, but ensure the function
 443     signature is preserved and that this function imports everything it needs
 444     (except other extractors), so that lazy_extractors works correctly.
 445
 446     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 447     the HTML of Generic webpages. It may also override _extract_embed_urls
 448     or _extract_from_webpage as necessary. While these are normally classmethods,
 449     _extract_from_webpage is allowed to be an instance method.
 450
 451     _extract_from_webpage may raise self.StopExtraction() to stop further
 452     processing of the webpage and obtain exclusive rights to it. This is useful
 453     when the extractor cannot reliably be matched using just the URL.
 454     Eg: invidious/peertube instances
 455
 456     Embed-only extractors can be defined by setting _VALID_URL = False.
 457
 458     To support username + password (or netrc) login, the extractor must define a
 459     _NETRC_MACHINE and re-define _perform_login(username, password) and
 460     (optionally) _initialize_pre_login() methods. The _perform_login method will
 461     be called between _initialize_pre_login and _real_initialize if credentials
 462     are passed by the user. In cases where it is necessary to have the login
 463     process as part of the extraction rather than initialization, _perform_login
 464     can be left undefined.
 465
 466     _GEO_BYPASS attribute may be set to False in order to disable
 467     geo restriction bypass mechanisms for a particular extractor.
 468     Though it won't disable explicit geo restriction bypass based on
 469     country code provided with geo_bypass_country.
 470
 471     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 472     countries for this extractor. One of these countries will be used by
 473     geo restriction bypass mechanism right away in order to bypass
 474     geo restriction, of course, if the mechanism is not disabled.
 475
 476     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 477     IP blocks in CIDR notation for this extractor. One of these IP blocks
 478     will be used by geo restriction bypass mechanism similarly
 479     to _GEO_COUNTRIES.
 480
 481     The _WORKING attribute should be set to False for broken IEs
 482     in order to warn the users and skip the tests.
 483     """
 484
 485     _ready = False
 486     _downloader = None
 487     _x_forwarded_for_ip = None
 488     _GEO_BYPASS = True
 489     _GEO_COUNTRIES = None
 490     _GEO_IP_BLOCKS = None
 491     _WORKING = True
 492     _NETRC_MACHINE = None
 493     IE_DESC = None
 494     SEARCH_KEY = None
 495     _VALID_URL = None
 496     _EMBED_REGEX = []
 497
 498     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 499         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 500         return {
 501             None: '',
 502             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 503             'password': f'Use {password_hint}',
 504             'cookies': (
 505                 'Use --cookies-from-browser or --cookies for the authentication. '
 506                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 507         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 508
 509     def __init__(self, downloader=None):
 510         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 511         If a downloader is not passed during initialization,
 512         it must be set using "set_downloader()" before "extract()" is called"""
 513         self._ready = False
 514         self._x_forwarded_for_ip = None
 515         self._printed_messages = set()
 516         self.set_downloader(downloader)
 517
 518     @classmethod
 519     def _match_valid_url(cls, url):
 520         if cls._VALID_URL is False:
 521             return None
 522         # This does not use has/getattr intentionally - we want to know whether
 523         # we have cached the regexp for *this* class, whereas getattr would also
 524         # match the superclass
 525         if '_VALID_URL_RE' not in cls.__dict__:
 526             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 527         return cls._VALID_URL_RE.match(url)
 528
 529     @classmethod
 530     def suitable(cls, url):
 531         """Receives a URL and returns True if suitable for this IE."""
 532         # This function must import everything it needs (except other extractors),
 533         # so that lazy_extractors works correctly
 534         return cls._match_valid_url(url) is not None
 535
 536     @classmethod
 537     def _match_id(cls, url):
 538         return cls._match_valid_url(url).group('id')
 539
 540     @classmethod
 541     def get_temp_id(cls, url):
 542         try:
 543             return cls._match_id(url)
 544         except (IndexError, AttributeError):
 545             return None
 546
 547     @classmethod
 548     def working(cls):
 549         """Getter method for _WORKING."""
 550         return cls._WORKING
 551
 552     @classmethod
 553     def supports_login(cls):
 554         return bool(cls._NETRC_MACHINE)
 555
 556     def initialize(self):
 557         """Initializes an instance (authentication, etc)."""
 558         self._printed_messages = set()
 559         self._initialize_geo_bypass({
 560             'countries': self._GEO_COUNTRIES,
 561             'ip_blocks': self._GEO_IP_BLOCKS,
 562         })
 563         if not self._ready:
 564             self._initialize_pre_login()
 565             if self.supports_login():
 566                 username, password = self._get_login_info()
 567                 if username:
 568                     self._perform_login(username, password)
 569             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 570                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 571             self._real_initialize()
 572             self._ready = True
 573
 574     def _initialize_geo_bypass(self, geo_bypass_context):
 575         """
 576         Initialize geo restriction bypass mechanism.
 577
 578         This method is used to initialize geo bypass mechanism based on faking
 579         X-Forwarded-For HTTP header. A random country from provided country list
 580         is selected and a random IP belonging to this country is generated. This
 581         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 582         HTTP requests.
 583
 584         This method will be used for initial geo bypass mechanism initialization
 585         during the instance initialization with _GEO_COUNTRIES and
 586         _GEO_IP_BLOCKS.
 587
 588         You may also manually call it from extractor's code if geo bypass
 589         information is not available beforehand (e.g. obtained during
 590         extraction) or due to some other reason. In this case you should pass
 591         this information in geo bypass context passed as first argument. It may
 592         contain following fields:
 593
 594         countries:  List of geo unrestricted countries (similar
 595                     to _GEO_COUNTRIES)
 596         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 597                     (similar to _GEO_IP_BLOCKS)
 598
 599         """
 600         if not self._x_forwarded_for_ip:
 601
 602             # Geo bypass mechanism is explicitly disabled by user
 603             if not self.get_param('geo_bypass', True):
 604                 return
 605
 606             if not geo_bypass_context:
 607                 geo_bypass_context = {}
 608
 609             # Backward compatibility: previously _initialize_geo_bypass
 610             # expected a list of countries, some 3rd party code may still use
 611             # it this way
 612             if isinstance(geo_bypass_context, (list, tuple)):
 613                 geo_bypass_context = {
 614                     'countries': geo_bypass_context,
 615                 }
 616
 617             # The whole point of geo bypass mechanism is to fake IP
 618             # as X-Forwarded-For HTTP header based on some IP block or
 619             # country code.
 620
 621             # Path 1: bypassing based on IP block in CIDR notation
 622
 623             # Explicit IP block specified by user, use it right away
 624             # regardless of whether extractor is geo bypassable or not
 625             ip_block = self.get_param('geo_bypass_ip_block', None)
 626
 627             # Otherwise use random IP block from geo bypass context but only
 628             # if extractor is known as geo bypassable
 629             if not ip_block:
 630                 ip_blocks = geo_bypass_context.get('ip_blocks')
 631                 if self._GEO_BYPASS and ip_blocks:
 632                     ip_block = random.choice(ip_blocks)
 633
 634             if ip_block:
 635                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 636                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 637                 return
 638
 639             # Path 2: bypassing based on country code
 640
 641             # Explicit country code specified by user, use it right away
 642             # regardless of whether extractor is geo bypassable or not
 643             country = self.get_param('geo_bypass_country', None)
 644
 645             # Otherwise use random country code from geo bypass context but
 646             # only if extractor is known as geo bypassable
 647             if not country:
 648                 countries = geo_bypass_context.get('countries')
 649                 if self._GEO_BYPASS and countries:
 650                     country = random.choice(countries)
 651
 652             if country:
 653                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 654                 self._downloader.write_debug(
 655                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 656
 657     def extract(self, url):
 658         """Extracts URL information and returns it in list of dicts."""
 659         try:
 660             for _ in range(2):
 661                 try:
 662                     self.initialize()
 663                     self.write_debug('Extracting URL: %s' % url)
 664                     ie_result = self._real_extract(url)
 665                     if ie_result is None:
 666                         return None
 667                     if self._x_forwarded_for_ip:
 668                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 669                     subtitles = ie_result.get('subtitles') or {}
 670                     if 'no-live-chat' in self.get_param('compat_opts'):
 671                         for lang in ('live_chat', 'comments', 'danmaku'):
 672                             subtitles.pop(lang, None)
 673                     return ie_result
 674                 except GeoRestrictedError as e:
 675                     if self.__maybe_fake_ip_and_retry(e.countries):
 676                         continue
 677                     raise
 678         except UnsupportedError:
 679             raise
 680         except ExtractorError as e:
 681             kwargs = {
 682                 'video_id': e.video_id or self.get_temp_id(url),
 683                 'ie': self.IE_NAME,
 684                 'tb': e.traceback or sys.exc_info()[2],
 685                 'expected': e.expected,
 686                 'cause': e.cause
 687             }
 688             if hasattr(e, 'countries'):
 689                 kwargs['countries'] = e.countries
 690             raise type(e)(e.orig_msg, **kwargs)
 691         except http.client.IncompleteRead as e:
 692             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 693         except (KeyError, StopIteration) as e:
 694             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 695
 696     def __maybe_fake_ip_and_retry(self, countries):
 697         if (not self.get_param('geo_bypass_country', None)
 698                 and self._GEO_BYPASS
 699                 and self.get_param('geo_bypass', True)
 700                 and not self._x_forwarded_for_ip
 701                 and countries):
 702             country_code = random.choice(countries)
 703             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 704             if self._x_forwarded_for_ip:
 705                 self.report_warning(
 706                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 707                     % (self._x_forwarded_for_ip, country_code.upper()))
 708                 return True
 709         return False
 710
 711     def set_downloader(self, downloader):
 712         """Sets a YoutubeDL instance as the downloader for this IE."""
 713         self._downloader = downloader
 714
 715     @property
 716     def cache(self):
 717         return self._downloader.cache
 718
 719     @property
 720     def cookiejar(self):
 721         return self._downloader.cookiejar
 722
 723     def _initialize_pre_login(self):
 724         """ Initialization before login. Redefine in subclasses."""
 725         pass
 726
 727     def _perform_login(self, username, password):
 728         """ Login with username and password. Redefine in subclasses."""
 729         pass
 730
 731     def _real_initialize(self):
 732         """Real initialization process. Redefine in subclasses."""
 733         pass
 734
 735     def _real_extract(self, url):
 736         """Real extraction process. Redefine in subclasses."""
 737         raise NotImplementedError('This method must be implemented by subclasses')
 738
 739     @classmethod
 740     def ie_key(cls):
 741         """A string for getting the InfoExtractor with get_info_extractor"""
 742         return cls.__name__[:-2]
 743
 744     @classproperty
 745     def IE_NAME(cls):
 746         return cls.__name__[:-2]
 747
 748     @staticmethod
 749     def __can_accept_status_code(err, expected_status):
 750         assert isinstance(err, urllib.error.HTTPError)
 751         if expected_status is None:
 752             return False
 753         elif callable(expected_status):
 754             return expected_status(err.code) is True
 755         else:
 756             return err.code in variadic(expected_status)
 757
 758     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 759         if isinstance(url_or_request, urllib.request.Request):
 760             return update_Request(url_or_request, data=data, headers=headers, query=query)
 761         if query:
 762             url_or_request = update_url_query(url_or_request, query)
 763         return sanitized_Request(url_or_request, data, headers or {})
 764
 765     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 766         """
 767         Return the response handle.
 768
 769         See _download_webpage docstring for arguments specification.
 770         """
 771         if not self._downloader._first_webpage_request:
 772             sleep_interval = self.get_param('sleep_interval_requests') or 0
 773             if sleep_interval > 0:
 774                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 775                 time.sleep(sleep_interval)
 776         else:
 777             self._downloader._first_webpage_request = False
 778
 779         if note is None:
 780             self.report_download_webpage(video_id)
 781         elif note is not False:
 782             if video_id is None:
 783                 self.to_screen(str(note))
 784             else:
 785                 self.to_screen(f'{video_id}: {note}')
 786
 787         # Some sites check X-Forwarded-For HTTP header in order to figure out
 788         # the origin of the client behind proxy. This allows bypassing geo
 789         # restriction by faking this header's value to IP that belongs to some
 790         # geo unrestricted country. We will do so once we encounter any
 791         # geo restriction error.
 792         if self._x_forwarded_for_ip:
 793             headers = (headers or {}).copy()
 794             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 795
 796         try:
 797             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 798         except network_exceptions as err:
 799             if isinstance(err, urllib.error.HTTPError):
 800                 if self.__can_accept_status_code(err, expected_status):
 801                     # Retain reference to error to prevent file object from
 802                     # being closed before it can be read. Works around the
 803                     # effects of <https://bugs.python.org/issue15002>
 804                     # introduced in Python 3.4.1.
 805                     err.fp._error = err
 806                     return err.fp
 807
 808             if errnote is False:
 809                 return False
 810             if errnote is None:
 811                 errnote = 'Unable to download webpage'
 812
 813             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 814             if fatal:
 815                 raise ExtractorError(errmsg, cause=err)
 816             else:
 817                 self.report_warning(errmsg)
 818                 return False
 819
 820     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 821                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 822         """
 823         Return a tuple (page content as string, URL handle).
 824
 825         Arguments:
 826         url_or_request -- plain text URL as a string or
 827             a urllib.request.Request object
 828         video_id -- Video/playlist/item identifier (string)
 829
 830         Keyword arguments:
 831         note -- note printed before downloading (string)
 832         errnote -- note printed in case of an error (string)
 833         fatal -- flag denoting whether error should be considered fatal,
 834             i.e. whether it should cause ExtractionError to be raised,
 835             otherwise a warning will be reported and extraction continued
 836         encoding -- encoding for a page content decoding, guessed automatically
 837             when not explicitly specified
 838         data -- POST data (bytes)
 839         headers -- HTTP headers (dict)
 840         query -- URL query (dict)
 841         expected_status -- allows to accept failed HTTP requests (non 2xx
 842             status code) by explicitly specifying a set of accepted status
 843             codes. Can be any of the following entities:
 844                 - an integer type specifying an exact failed status code to
 845                   accept
 846                 - a list or a tuple of integer types specifying a list of
 847                   failed status codes to accept
 848                 - a callable accepting an actual failed status code and
 849                   returning True if it should be accepted
 850             Note that this argument does not affect success status codes (2xx)
 851             which are always accepted.
 852         """
 853
 854         # Strip hashes from the URL (#1038)
 855         if isinstance(url_or_request, str):
 856             url_or_request = url_or_request.partition('#')[0]
 857
 858         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 859         if urlh is False:
 860             assert not fatal
 861             return False
 862         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 863         return (content, urlh)
 864
 865     @staticmethod
 866     def _guess_encoding_from_content(content_type, webpage_bytes):
 867         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 868         if m:
 869             encoding = m.group(1)
 870         else:
 871             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 872                           webpage_bytes[:1024])
 873             if m:
 874                 encoding = m.group(1).decode('ascii')
 875             elif webpage_bytes.startswith(b'\xff\xfe'):
 876                 encoding = 'utf-16'
 877             else:
 878                 encoding = 'utf-8'
 879
 880         return encoding
 881
 882     def __check_blocked(self, content):
 883         first_block = content[:512]
 884         if ('<title>Access to this site is blocked</title>' in content
 885                 and 'Websense' in first_block):
 886             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 887             blocked_iframe = self._html_search_regex(
 888                 r'<iframe src="([^"]+)"', content,
 889                 'Websense information URL', default=None)
 890             if blocked_iframe:
 891                 msg += ' Visit %s for more details' % blocked_iframe
 892             raise ExtractorError(msg, expected=True)
 893         if '<title>The URL you requested has been blocked</title>' in first_block:
 894             msg = (
 895                 'Access to this webpage has been blocked by Indian censorship. '
 896                 'Use a VPN or proxy server (with --proxy) to route around it.')
 897             block_msg = self._html_search_regex(
 898                 r'</h1><p>(.*?)</p>',
 899                 content, 'block message', default=None)
 900             if block_msg:
 901                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 902             raise ExtractorError(msg, expected=True)
 903         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 904                 and 'blocklist.rkn.gov.ru' in content):
 905             raise ExtractorError(
 906                 'Access to this webpage has been blocked by decision of the Russian government. '
 907                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 908                 expected=True)
 909
 910     def _request_dump_filename(self, url, video_id):
 911         basen = f'{video_id}_{url}'
 912         trim_length = self.get_param('trim_file_name') or 240
 913         if len(basen) > trim_length:
 914             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 915             basen = basen[:trim_length - len(h)] + h
 916         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 917         # Working around MAX_PATH limitation on Windows (see
 918         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 919         if compat_os_name == 'nt':
 920             absfilepath = os.path.abspath(filename)
 921             if len(absfilepath) > 259:
 922                 filename = fR'\\?\{absfilepath}'
 923         return filename
 924
 925     def __decode_webpage(self, webpage_bytes, encoding, headers):
 926         if not encoding:
 927             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 928         try:
 929             return webpage_bytes.decode(encoding, 'replace')
 930         except LookupError:
 931             return webpage_bytes.decode('utf-8', 'replace')
 932
 933     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 934         webpage_bytes = urlh.read()
 935         if prefix is not None:
 936             webpage_bytes = prefix + webpage_bytes
 937         if self.get_param('dump_intermediate_pages', False):
 938             self.to_screen('Dumping request to ' + urlh.geturl())
 939             dump = base64.b64encode(webpage_bytes).decode('ascii')
 940             self._downloader.to_screen(dump)
 941         if self.get_param('write_pages'):
 942             filename = self._request_dump_filename(urlh.geturl(), video_id)
 943             self.to_screen(f'Saving request to {filename}')
 944             with open(filename, 'wb') as outf:
 945                 outf.write(webpage_bytes)
 946
 947         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 948         self.__check_blocked(content)
 949
 950         return content
 951
 952     def __print_error(self, errnote, fatal, video_id, err):
 953         if fatal:
 954             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 955         elif errnote:
 956             self.report_warning(f'{video_id}: {errnote}: {err}')
 957
 958     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 959         if transform_source:
 960             xml_string = transform_source(xml_string)
 961         try:
 962             return compat_etree_fromstring(xml_string.encode('utf-8'))
 963         except xml.etree.ElementTree.ParseError as ve:
 964             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 965
 966     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 967         try:
 968             return json.loads(
 969                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 970         except ValueError as ve:
 971             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 972
 973     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 974         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 975
 976     def __create_download_methods(name, parser, note, errnote, return_value):
 977
 978         def parse(ie, content, *args, errnote=errnote, **kwargs):
 979             if parser is None:
 980                 return content
 981             if errnote is False:
 982                 kwargs['errnote'] = errnote
 983             # parser is fetched by name so subclasses can override it
 984             return getattr(ie, parser)(content, *args, **kwargs)
 985
 986         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
 987                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 988             res = self._download_webpage_handle(
 989                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
 990                 data=data, headers=headers, query=query, expected_status=expected_status)
 991             if res is False:
 992                 return res
 993             content, urlh = res
 994             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
 995
 996         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
 997                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 998             if self.get_param('load_pages'):
 999                 url_or_request = self._create_request(url_or_request, data, headers, query)
1000                 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1001                 self.to_screen(f'Loading request from {filename}')
1002                 try:
1003                     with open(filename, 'rb') as dumpf:
1004                         webpage_bytes = dumpf.read()
1005                 except OSError as e:
1006                     self.report_warning(f'Unable to load request from disk: {e}')
1007                 else:
1008                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1009                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1010             kwargs = {
1011                 'note': note,
1012                 'errnote': errnote,
1013                 'transform_source': transform_source,
1014                 'fatal': fatal,
1015                 'encoding': encoding,
1016                 'data': data,
1017                 'headers': headers,
1018                 'query': query,
1019                 'expected_status': expected_status,
1020             }
1021             if parser is None:
1022                 kwargs.pop('transform_source')
1023             # The method is fetched by name so subclasses can override _download_..._handle
1024             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1025             return res if res is False else res[0]
1026
1027         def impersonate(func, name, return_value):
1028             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1029             func.__doc__ = f'''
1030                 @param transform_source     Apply this transformation before parsing
1031                 @returns                    {return_value}
1032
1033                 See _download_webpage_handle docstring for other arguments specification
1034             '''
1035
1036         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1037         impersonate(download_content, f'_download_{name}', f'{return_value}')
1038         return download_handle, download_content
1039
1040     _download_xml_handle, _download_xml = __create_download_methods(
1041         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1042     _download_json_handle, _download_json = __create_download_methods(
1043         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1044     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1045         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1046     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1047
1048     def _download_webpage(
1049             self, url_or_request, video_id, note=None, errnote=None,
1050             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1051         """
1052         Return the data of the page as a string.
1053
1054         Keyword arguments:
1055         tries -- number of tries
1056         timeout -- sleep interval between tries
1057
1058         See _download_webpage_handle docstring for other arguments specification.
1059         """
1060
1061         R''' # NB: These are unused; should they be deprecated?
1062         if tries != 1:
1063             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1064         if timeout is NO_DEFAULT:
1065             timeout = 5
1066         else:
1067             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1068         '''
1069
1070         try_count = 0
1071         while True:
1072             try:
1073                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1074             except http.client.IncompleteRead as e:
1075                 try_count += 1
1076                 if try_count >= tries:
1077                     raise e
1078                 self._sleep(timeout, video_id)
1079
1080     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1081         idstr = format_field(video_id, None, '%s: ')
1082         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1083         if only_once:
1084             if f'WARNING: {msg}' in self._printed_messages:
1085                 return
1086             self._printed_messages.add(f'WARNING: {msg}')
1087         self._downloader.report_warning(msg, *args, **kwargs)
1088
1089     def to_screen(self, msg, *args, **kwargs):
1090         """Print msg to screen, prefixing it with '[ie_name]'"""
1091         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1092
1093     def write_debug(self, msg, *args, **kwargs):
1094         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1095
1096     def get_param(self, name, default=None, *args, **kwargs):
1097         if self._downloader:
1098             return self._downloader.params.get(name, default, *args, **kwargs)
1099         return default
1100
1101     def report_drm(self, video_id, partial=False):
1102         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1103
1104     def report_extraction(self, id_or_name):
1105         """Report information extraction."""
1106         self.to_screen('%s: Extracting information' % id_or_name)
1107
1108     def report_download_webpage(self, video_id):
1109         """Report webpage download."""
1110         self.to_screen('%s: Downloading webpage' % video_id)
1111
1112     def report_age_confirmation(self):
1113         """Report attempt to confirm age."""
1114         self.to_screen('Confirming age')
1115
1116     def report_login(self):
1117         """Report attempt to log in."""
1118         self.to_screen('Logging in')
1119
1120     def raise_login_required(
1121             self, msg='This video is only available for registered users',
1122             metadata_available=False, method=NO_DEFAULT):
1123         if metadata_available and (
1124                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1125             self.report_warning(msg)
1126             return
1127         msg += format_field(self._login_hint(method), None, '. %s')
1128         raise ExtractorError(msg, expected=True)
1129
1130     def raise_geo_restricted(
1131             self, msg='This video is not available from your location due to geo restriction',
1132             countries=None, metadata_available=False):
1133         if metadata_available and (
1134                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1135             self.report_warning(msg)
1136         else:
1137             raise GeoRestrictedError(msg, countries=countries)
1138
1139     def raise_no_formats(self, msg, expected=False, video_id=None):
1140         if expected and (
1141                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1142             self.report_warning(msg, video_id)
1143         elif isinstance(msg, ExtractorError):
1144             raise msg
1145         else:
1146             raise ExtractorError(msg, expected=expected, video_id=video_id)
1147
1148     # Methods for following #608
1149     @staticmethod
1150     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1151         """Returns a URL that points to a page that should be processed"""
1152         if ie is not None:
1153             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1154         if video_id is not None:
1155             kwargs['id'] = video_id
1156         if video_title is not None:
1157             kwargs['title'] = video_title
1158         return {
1159             **kwargs,
1160             '_type': 'url_transparent' if url_transparent else 'url',
1161             'url': url,
1162         }
1163
1164     @classmethod
1165     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1166                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1167         return cls.playlist_result(
1168             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1169             playlist_id, playlist_title, **kwargs)
1170
1171     @staticmethod
1172     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1173         """Returns a playlist"""
1174         if playlist_id:
1175             kwargs['id'] = playlist_id
1176         if playlist_title:
1177             kwargs['title'] = playlist_title
1178         if playlist_description is not None:
1179             kwargs['description'] = playlist_description
1180         return {
1181             **kwargs,
1182             '_type': 'multi_video' if multi_video else 'playlist',
1183             'entries': entries,
1184         }
1185
1186     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1187         """
1188         Perform a regex search on the given string, using a single or a list of
1189         patterns returning the first matching group.
1190         In case of failure return a default value or raise a WARNING or a
1191         RegexNotFoundError, depending on fatal, specifying the field name.
1192         """
1193         if string is None:
1194             mobj = None
1195         elif isinstance(pattern, (str, re.Pattern)):
1196             mobj = re.search(pattern, string, flags)
1197         else:
1198             for p in pattern:
1199                 mobj = re.search(p, string, flags)
1200                 if mobj:
1201                     break
1202
1203         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1204
1205         if mobj:
1206             if group is None:
1207                 # return the first matching group
1208                 return next(g for g in mobj.groups() if g is not None)
1209             elif isinstance(group, (list, tuple)):
1210                 return tuple(mobj.group(g) for g in group)
1211             else:
1212                 return mobj.group(group)
1213         elif default is not NO_DEFAULT:
1214             return default
1215         elif fatal:
1216             raise RegexNotFoundError('Unable to extract %s' % _name)
1217         else:
1218             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1219             return None
1220
1221     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1222                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1223         """Searches string for the JSON object specified by start_pattern"""
1224         # NB: end_pattern is only used to reduce the size of the initial match
1225         if default is NO_DEFAULT:
1226             default, has_default = {}, False
1227         else:
1228             fatal, has_default = False, True
1229
1230         json_string = self._search_regex(
1231             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1232             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1233         if not json_string:
1234             return default
1235
1236         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1237         try:
1238             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1239         except ExtractorError as e:
1240             if fatal:
1241                 raise ExtractorError(
1242                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1243             elif not has_default:
1244                 self.report_warning(
1245                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1246         return default
1247
1248     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1249         """
1250         Like _search_regex, but strips HTML tags and unescapes entities.
1251         """
1252         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1253         if res:
1254             return clean_html(res).strip()
1255         else:
1256             return res
1257
1258     def _get_netrc_login_info(self, netrc_machine=None):
1259         username = None
1260         password = None
1261         netrc_machine = netrc_machine or self._NETRC_MACHINE
1262
1263         if self.get_param('usenetrc', False):
1264             try:
1265                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1266                 if os.path.isdir(netrc_file):
1267                     netrc_file = os.path.join(netrc_file, '.netrc')
1268                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1269                 if info is not None:
1270                     username = info[0]
1271                     password = info[2]
1272                 else:
1273                     raise netrc.NetrcParseError(
1274                         'No authenticators for %s' % netrc_machine)
1275             except (OSError, netrc.NetrcParseError) as err:
1276                 self.report_warning(
1277                     'parsing .netrc: %s' % error_to_compat_str(err))
1278
1279         return username, password
1280
1281     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1282         """
1283         Get the login info as (username, password)
1284         First look for the manually specified credentials using username_option
1285         and password_option as keys in params dictionary. If no such credentials
1286         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1287         value.
1288         If there's no info available, return (None, None)
1289         """
1290
1291         # Attempt to use provided username and password or .netrc data
1292         username = self.get_param(username_option)
1293         if username is not None:
1294             password = self.get_param(password_option)
1295         else:
1296             username, password = self._get_netrc_login_info(netrc_machine)
1297
1298         return username, password
1299
1300     def _get_tfa_info(self, note='two-factor verification code'):
1301         """
1302         Get the two-factor authentication info
1303         TODO - asking the user will be required for sms/phone verify
1304         currently just uses the command line option
1305         If there's no info available, return None
1306         """
1307
1308         tfa = self.get_param('twofactor')
1309         if tfa is not None:
1310             return tfa
1311
1312         return getpass.getpass('Type %s and press [Return]: ' % note)
1313
1314     # Helper functions for extracting OpenGraph info
1315     @staticmethod
1316     def _og_regexes(prop):
1317         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1318         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1319                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1320         template = r'<meta[^>]+?%s[^>]+?%s'
1321         return [
1322             template % (property_re, content_re),
1323             template % (content_re, property_re),
1324         ]
1325
1326     @staticmethod
1327     def _meta_regex(prop):
1328         return r'''(?isx)<meta
1329                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1330                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1331
1332     def _og_search_property(self, prop, html, name=None, **kargs):
1333         prop = variadic(prop)
1334         if name is None:
1335             name = 'OpenGraph %s' % prop[0]
1336         og_regexes = []
1337         for p in prop:
1338             og_regexes.extend(self._og_regexes(p))
1339         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1340         if escaped is None:
1341             return None
1342         return unescapeHTML(escaped)
1343
1344     def _og_search_thumbnail(self, html, **kargs):
1345         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1346
1347     def _og_search_description(self, html, **kargs):
1348         return self._og_search_property('description', html, fatal=False, **kargs)
1349
1350     def _og_search_title(self, html, *, fatal=False, **kargs):
1351         return self._og_search_property('title', html, fatal=fatal, **kargs)
1352
1353     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1354         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1355         if secure:
1356             regexes = self._og_regexes('video:secure_url') + regexes
1357         return self._html_search_regex(regexes, html, name, **kargs)
1358
1359     def _og_search_url(self, html, **kargs):
1360         return self._og_search_property('url', html, **kargs)
1361
1362     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1363         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1364
1365     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1366         name = variadic(name)
1367         if display_name is None:
1368             display_name = name[0]
1369         return self._html_search_regex(
1370             [self._meta_regex(n) for n in name],
1371             html, display_name, fatal=fatal, group='content', **kwargs)
1372
1373     def _dc_search_uploader(self, html):
1374         return self._html_search_meta('dc.creator', html, 'uploader')
1375
1376     @staticmethod
1377     def _rta_search(html):
1378         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1379         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1380                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1381                      html):
1382             return 18
1383
1384         # And then there are the jokers who advertise that they use RTA, but actually don't.
1385         AGE_LIMIT_MARKERS = [
1386             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1387         ]
1388         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1389             return 18
1390         return 0
1391
1392     def _media_rating_search(self, html):
1393         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1394         rating = self._html_search_meta('rating', html)
1395
1396         if not rating:
1397             return None
1398
1399         RATING_TABLE = {
1400             'safe for kids': 0,
1401             'general': 8,
1402             '14 years': 14,
1403             'mature': 17,
1404             'restricted': 19,
1405         }
1406         return RATING_TABLE.get(rating.lower())
1407
1408     def _family_friendly_search(self, html):
1409         # See http://schema.org/VideoObject
1410         family_friendly = self._html_search_meta(
1411             'isFamilyFriendly', html, default=None)
1412
1413         if not family_friendly:
1414             return None
1415
1416         RATING_TABLE = {
1417             '1': 0,
1418             'true': 0,
1419             '0': 18,
1420             'false': 18,
1421         }
1422         return RATING_TABLE.get(family_friendly.lower())
1423
1424     def _twitter_search_player(self, html):
1425         return self._html_search_meta('twitter:player', html,
1426                                       'twitter card player')
1427
1428     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1429         """Yield all json ld objects in the html"""
1430         if default is not NO_DEFAULT:
1431             fatal = False
1432         for mobj in re.finditer(JSON_LD_RE, html):
1433             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1434             for json_ld in variadic(json_ld_item):
1435                 if isinstance(json_ld, dict):
1436                     yield json_ld
1437
1438     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1439         """Search for a video in any json ld in the html"""
1440         if default is not NO_DEFAULT:
1441             fatal = False
1442         info = self._json_ld(
1443             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1444             video_id, fatal=fatal, expected_type=expected_type)
1445         if info:
1446             return info
1447         if default is not NO_DEFAULT:
1448             return default
1449         elif fatal:
1450             raise RegexNotFoundError('Unable to extract JSON-LD')
1451         else:
1452             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1453             return {}
1454
1455     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1456         if isinstance(json_ld, str):
1457             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1458         if not json_ld:
1459             return {}
1460         info = {}
1461         if not isinstance(json_ld, (list, tuple, dict)):
1462             return info
1463         if isinstance(json_ld, dict):
1464             json_ld = [json_ld]
1465
1466         INTERACTION_TYPE_MAP = {
1467             'CommentAction': 'comment',
1468             'AgreeAction': 'like',
1469             'DisagreeAction': 'dislike',
1470             'LikeAction': 'like',
1471             'DislikeAction': 'dislike',
1472             'ListenAction': 'view',
1473             'WatchAction': 'view',
1474             'ViewAction': 'view',
1475         }
1476
1477         def is_type(e, *expected_types):
1478             type = variadic(traverse_obj(e, '@type'))
1479             return any(x in type for x in expected_types)
1480
1481         def extract_interaction_type(e):
1482             interaction_type = e.get('interactionType')
1483             if isinstance(interaction_type, dict):
1484                 interaction_type = interaction_type.get('@type')
1485             return str_or_none(interaction_type)
1486
1487         def extract_interaction_statistic(e):
1488             interaction_statistic = e.get('interactionStatistic')
1489             if isinstance(interaction_statistic, dict):
1490                 interaction_statistic = [interaction_statistic]
1491             if not isinstance(interaction_statistic, list):
1492                 return
1493             for is_e in interaction_statistic:
1494                 if not is_type(is_e, 'InteractionCounter'):
1495                     continue
1496                 interaction_type = extract_interaction_type(is_e)
1497                 if not interaction_type:
1498                     continue
1499                 # For interaction count some sites provide string instead of
1500                 # an integer (as per spec) with non digit characters (e.g. ",")
1501                 # so extracting count with more relaxed str_to_int
1502                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1503                 if interaction_count is None:
1504                     continue
1505                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1506                 if not count_kind:
1507                     continue
1508                 count_key = '%s_count' % count_kind
1509                 if info.get(count_key) is not None:
1510                     continue
1511                 info[count_key] = interaction_count
1512
1513         def extract_chapter_information(e):
1514             chapters = [{
1515                 'title': part.get('name'),
1516                 'start_time': part.get('startOffset'),
1517                 'end_time': part.get('endOffset'),
1518             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1519             for idx, (last_c, current_c, next_c) in enumerate(zip(
1520                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1521                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1522                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1523                 if None in current_c.values():
1524                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1525                     return
1526             if chapters:
1527                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1528                 info['chapters'] = chapters
1529
1530         def extract_video_object(e):
1531             assert is_type(e, 'VideoObject')
1532             author = e.get('author')
1533             info.update({
1534                 'url': url_or_none(e.get('contentUrl')),
1535                 'title': unescapeHTML(e.get('name')),
1536                 'description': unescapeHTML(e.get('description')),
1537                 'thumbnails': [{'url': unescapeHTML(url)}
1538                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1539                                if url_or_none(url)],
1540                 'duration': parse_duration(e.get('duration')),
1541                 'timestamp': unified_timestamp(e.get('uploadDate')),
1542                 # author can be an instance of 'Organization' or 'Person' types.
1543                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1544                 # however some websites are using 'Text' type instead.
1545                 # 1. https://schema.org/VideoObject
1546                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1547                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1548                 'tbr': int_or_none(e.get('bitrate')),
1549                 'width': int_or_none(e.get('width')),
1550                 'height': int_or_none(e.get('height')),
1551                 'view_count': int_or_none(e.get('interactionCount')),
1552             })
1553             extract_interaction_statistic(e)
1554             extract_chapter_information(e)
1555
1556         def traverse_json_ld(json_ld, at_top_level=True):
1557             for e in json_ld:
1558                 if at_top_level and '@context' not in e:
1559                     continue
1560                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1561                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1562                     break
1563                 if expected_type is not None and not is_type(e, expected_type):
1564                     continue
1565                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1566                 if rating is not None:
1567                     info['average_rating'] = rating
1568                 if is_type(e, 'TVEpisode', 'Episode'):
1569                     episode_name = unescapeHTML(e.get('name'))
1570                     info.update({
1571                         'episode': episode_name,
1572                         'episode_number': int_or_none(e.get('episodeNumber')),
1573                         'description': unescapeHTML(e.get('description')),
1574                     })
1575                     if not info.get('title') and episode_name:
1576                         info['title'] = episode_name
1577                     part_of_season = e.get('partOfSeason')
1578                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1579                         info.update({
1580                             'season': unescapeHTML(part_of_season.get('name')),
1581                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1582                         })
1583                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1584                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1585                         info['series'] = unescapeHTML(part_of_series.get('name'))
1586                 elif is_type(e, 'Movie'):
1587                     info.update({
1588                         'title': unescapeHTML(e.get('name')),
1589                         'description': unescapeHTML(e.get('description')),
1590                         'duration': parse_duration(e.get('duration')),
1591                         'timestamp': unified_timestamp(e.get('dateCreated')),
1592                     })
1593                 elif is_type(e, 'Article', 'NewsArticle'):
1594                     info.update({
1595                         'timestamp': parse_iso8601(e.get('datePublished')),
1596                         'title': unescapeHTML(e.get('headline')),
1597                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1598                     })
1599                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1600                         extract_video_object(e['video'][0])
1601                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1602                         extract_video_object(e['subjectOf'][0])
1603                 elif is_type(e, 'VideoObject'):
1604                     extract_video_object(e)
1605                     if expected_type is None:
1606                         continue
1607                     else:
1608                         break
1609                 video = e.get('video')
1610                 if is_type(video, 'VideoObject'):
1611                     extract_video_object(video)
1612                 if expected_type is None:
1613                     continue
1614                 else:
1615                     break
1616         traverse_json_ld(json_ld)
1617
1618         return filter_dict(info)
1619
1620     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1621         return self._parse_json(
1622             self._search_regex(
1623                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1624                 webpage, 'next.js data', fatal=fatal, **kw),
1625             video_id, transform_source=transform_source, fatal=fatal)
1626
1627     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1628         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1629         rectx = re.escape(context_name)
1630         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1631         js, arg_keys, arg_vals = self._search_regex(
1632             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1633             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1634
1635         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1636
1637         for key, val in args.items():
1638             if val in ('undefined', 'void 0'):
1639                 args[key] = 'null'
1640
1641         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1642         return traverse_obj(ret, traverse) or {}
1643
1644     @staticmethod
1645     def _hidden_inputs(html):
1646         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1647         hidden_inputs = {}
1648         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1649             attrs = extract_attributes(input)
1650             if not input:
1651                 continue
1652             if attrs.get('type') not in ('hidden', 'submit'):
1653                 continue
1654             name = attrs.get('name') or attrs.get('id')
1655             value = attrs.get('value')
1656             if name and value is not None:
1657                 hidden_inputs[name] = value
1658         return hidden_inputs
1659
1660     def _form_hidden_inputs(self, form_id, html):
1661         form = self._search_regex(
1662             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1663             html, '%s form' % form_id, group='form')
1664         return self._hidden_inputs(form)
1665
1666     class FormatSort:
1667         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1668
1669         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1670                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1671                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1672         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1673                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1674                         'fps', 'fs_approx', 'source', 'id')
1675
1676         settings = {
1677             'vcodec': {'type': 'ordered', 'regex': True,
1678                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1679             'acodec': {'type': 'ordered', 'regex': True,
1680                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1681             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1682                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1683             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1684                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1685             'vext': {'type': 'ordered', 'field': 'video_ext',
1686                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1687                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1688             'aext': {'type': 'ordered', 'field': 'audio_ext',
1689                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1690                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1691             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1692             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1693                            'field': ('vcodec', 'acodec'),
1694                            'function': lambda it: int(any(v != 'none' for v in it))},
1695             'ie_pref': {'priority': True, 'type': 'extractor'},
1696             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1697             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1698             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1699             'quality': {'convert': 'float', 'default': -1},
1700             'filesize': {'convert': 'bytes'},
1701             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1702             'id': {'convert': 'string', 'field': 'format_id'},
1703             'height': {'convert': 'float_none'},
1704             'width': {'convert': 'float_none'},
1705             'fps': {'convert': 'float_none'},
1706             'tbr': {'convert': 'float_none'},
1707             'vbr': {'convert': 'float_none'},
1708             'abr': {'convert': 'float_none'},
1709             'asr': {'convert': 'float_none'},
1710             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1711
1712             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1713             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1714             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1715             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1716             'res': {'type': 'multiple', 'field': ('height', 'width'),
1717                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1718
1719             # For compatibility with youtube-dl
1720             'format_id': {'type': 'alias', 'field': 'id'},
1721             'preference': {'type': 'alias', 'field': 'ie_pref'},
1722             'language_preference': {'type': 'alias', 'field': 'lang'},
1723             'source_preference': {'type': 'alias', 'field': 'source'},
1724             'protocol': {'type': 'alias', 'field': 'proto'},
1725             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1726
1727             # Deprecated
1728             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1729             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1730             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1731             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1732             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1733             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1734             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1735             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1736             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1737             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1738             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1739             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1740             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1741             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1742             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1743             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1744             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1745             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1746             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1747             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1748         }
1749
1750         def __init__(self, ie, field_preference):
1751             self._order = []
1752             self.ydl = ie._downloader
1753             self.evaluate_params(self.ydl.params, field_preference)
1754             if ie.get_param('verbose'):
1755                 self.print_verbose_info(self.ydl.write_debug)
1756
1757         def _get_field_setting(self, field, key):
1758             if field not in self.settings:
1759                 if key in ('forced', 'priority'):
1760                     return False
1761                 self.ydl.deprecation_warning(
1762                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1763                     'and may be removed in a future version')
1764                 self.settings[field] = {}
1765             propObj = self.settings[field]
1766             if key not in propObj:
1767                 type = propObj.get('type')
1768                 if key == 'field':
1769                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1770                 elif key == 'convert':
1771                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1772                 else:
1773                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1774                 propObj[key] = default
1775             return propObj[key]
1776
1777         def _resolve_field_value(self, field, value, convertNone=False):
1778             if value is None:
1779                 if not convertNone:
1780                     return None
1781             else:
1782                 value = value.lower()
1783             conversion = self._get_field_setting(field, 'convert')
1784             if conversion == 'ignore':
1785                 return None
1786             if conversion == 'string':
1787                 return value
1788             elif conversion == 'float_none':
1789                 return float_or_none(value)
1790             elif conversion == 'bytes':
1791                 return FileDownloader.parse_bytes(value)
1792             elif conversion == 'order':
1793                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1794                 use_regex = self._get_field_setting(field, 'regex')
1795                 list_length = len(order_list)
1796                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1797                 if use_regex and value is not None:
1798                     for i, regex in enumerate(order_list):
1799                         if regex and re.match(regex, value):
1800                             return list_length - i
1801                     return list_length - empty_pos  # not in list
1802                 else:  # not regex or  value = None
1803                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1804             else:
1805                 if value.isnumeric():
1806                     return float(value)
1807                 else:
1808                     self.settings[field]['convert'] = 'string'
1809                     return value
1810
1811         def evaluate_params(self, params, sort_extractor):
1812             self._use_free_order = params.get('prefer_free_formats', False)
1813             self._sort_user = params.get('format_sort', [])
1814             self._sort_extractor = sort_extractor
1815
1816             def add_item(field, reverse, closest, limit_text):
1817                 field = field.lower()
1818                 if field in self._order:
1819                     return
1820                 self._order.append(field)
1821                 limit = self._resolve_field_value(field, limit_text)
1822                 data = {
1823                     'reverse': reverse,
1824                     'closest': False if limit is None else closest,
1825                     'limit_text': limit_text,
1826                     'limit': limit}
1827                 if field in self.settings:
1828                     self.settings[field].update(data)
1829                 else:
1830                     self.settings[field] = data
1831
1832             sort_list = (
1833                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1834                 + (tuple() if params.get('format_sort_force', False)
1835                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1836                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1837
1838             for item in sort_list:
1839                 match = re.match(self.regex, item)
1840                 if match is None:
1841                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1842                 field = match.group('field')
1843                 if field is None:
1844                     continue
1845                 if self._get_field_setting(field, 'type') == 'alias':
1846                     alias, field = field, self._get_field_setting(field, 'field')
1847                     if self._get_field_setting(alias, 'deprecated'):
1848                         self.ydl.deprecation_warning(
1849                             f'Format sorting alias {alias} is deprecated '
1850                             f'and may be removed in a future version. Please use {field} instead')
1851                 reverse = match.group('reverse') is not None
1852                 closest = match.group('separator') == '~'
1853                 limit_text = match.group('limit')
1854
1855                 has_limit = limit_text is not None
1856                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1857                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1858
1859                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1860                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1861                 limit_count = len(limits)
1862                 for (i, f) in enumerate(fields):
1863                     add_item(f, reverse, closest,
1864                              limits[i] if i < limit_count
1865                              else limits[0] if has_limit and not has_multiple_limits
1866                              else None)
1867
1868         def print_verbose_info(self, write_debug):
1869             if self._sort_user:
1870                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1871             if self._sort_extractor:
1872                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1873             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1874                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1875                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1876                               self._get_field_setting(field, 'limit_text'),
1877                               self._get_field_setting(field, 'limit'))
1878                 if self._get_field_setting(field, 'limit_text') is not None else '')
1879                 for field in self._order if self._get_field_setting(field, 'visible')]))
1880
1881         def _calculate_field_preference_from_value(self, format, field, type, value):
1882             reverse = self._get_field_setting(field, 'reverse')
1883             closest = self._get_field_setting(field, 'closest')
1884             limit = self._get_field_setting(field, 'limit')
1885
1886             if type == 'extractor':
1887                 maximum = self._get_field_setting(field, 'max')
1888                 if value is None or (maximum is not None and value >= maximum):
1889                     value = -1
1890             elif type == 'boolean':
1891                 in_list = self._get_field_setting(field, 'in_list')
1892                 not_in_list = self._get_field_setting(field, 'not_in_list')
1893                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1894             elif type == 'ordered':
1895                 value = self._resolve_field_value(field, value, True)
1896
1897             # try to convert to number
1898             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1899             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1900             if is_num:
1901                 value = val_num
1902
1903             return ((-10, 0) if value is None
1904                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1905                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1906                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1907                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1908                     else (-1, value, 0))
1909
1910         def _calculate_field_preference(self, format, field):
1911             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1912             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1913             if type == 'multiple':
1914                 type = 'field'  # Only 'field' is allowed in multiple for now
1915                 actual_fields = self._get_field_setting(field, 'field')
1916
1917                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1918             else:
1919                 value = get_value(field)
1920             return self._calculate_field_preference_from_value(format, field, type, value)
1921
1922         def calculate_preference(self, format):
1923             # Determine missing protocol
1924             if not format.get('protocol'):
1925                 format['protocol'] = determine_protocol(format)
1926
1927             # Determine missing ext
1928             if not format.get('ext') and 'url' in format:
1929                 format['ext'] = determine_ext(format['url'])
1930             if format.get('vcodec') == 'none':
1931                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1932                 format['video_ext'] = 'none'
1933             else:
1934                 format['video_ext'] = format['ext']
1935                 format['audio_ext'] = 'none'
1936             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1937             #    format['preference'] = -1000
1938
1939             # Determine missing bitrates
1940             if format.get('tbr') is None:
1941                 if format.get('vbr') is not None and format.get('abr') is not None:
1942                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1943             else:
1944                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1945                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1946                 if format.get('acodec') != 'none' and format.get('abr') is None:
1947                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1948
1949             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1950
1951     def _sort_formats(self, formats, field_preference=[]):
1952         if not formats:
1953             return
1954         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1955
1956     def _check_formats(self, formats, video_id):
1957         if formats:
1958             formats[:] = filter(
1959                 lambda f: self._is_valid_url(
1960                     f['url'], video_id,
1961                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1962                 formats)
1963
1964     @staticmethod
1965     def _remove_duplicate_formats(formats):
1966         format_urls = set()
1967         unique_formats = []
1968         for f in formats:
1969             if f['url'] not in format_urls:
1970                 format_urls.add(f['url'])
1971                 unique_formats.append(f)
1972         formats[:] = unique_formats
1973
1974     def _is_valid_url(self, url, video_id, item='video', headers={}):
1975         url = self._proto_relative_url(url, scheme='http:')
1976         # For now assume non HTTP(S) URLs always valid
1977         if not (url.startswith('http://') or url.startswith('https://')):
1978             return True
1979         try:
1980             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1981             return True
1982         except ExtractorError as e:
1983             self.to_screen(
1984                 '%s: %s URL is invalid, skipping: %s'
1985                 % (video_id, item, error_to_compat_str(e.cause)))
1986             return False
1987
1988     def http_scheme(self):
1989         """ Either "http:" or "https:", depending on the user's preferences """
1990         return (
1991             'http:'
1992             if self.get_param('prefer_insecure', False)
1993             else 'https:')
1994
1995     def _proto_relative_url(self, url, scheme=None):
1996         scheme = scheme or self.http_scheme()
1997         assert scheme.endswith(':')
1998         return sanitize_url(url, scheme=scheme[:-1])
1999
2000     def _sleep(self, timeout, video_id, msg_template=None):
2001         if msg_template is None:
2002             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2003         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2004         self.to_screen(msg)
2005         time.sleep(timeout)
2006
2007     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2008                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2009                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2010         res = self._download_xml_handle(
2011             manifest_url, video_id, 'Downloading f4m manifest',
2012             'Unable to download f4m manifest',
2013             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2014             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2015             transform_source=transform_source,
2016             fatal=fatal, data=data, headers=headers, query=query)
2017         if res is False:
2018             return []
2019
2020         manifest, urlh = res
2021         manifest_url = urlh.geturl()
2022
2023         return self._parse_f4m_formats(
2024             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2025             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2026
2027     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2028                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2029                            fatal=True, m3u8_id=None):
2030         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2031             return []
2032
2033         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2034         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2035         if akamai_pv is not None and ';' in akamai_pv.text:
2036             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2037             if playerVerificationChallenge.strip() != '':
2038                 return []
2039
2040         formats = []
2041         manifest_version = '1.0'
2042         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2043         if not media_nodes:
2044             manifest_version = '2.0'
2045             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2046         # Remove unsupported DRM protected media from final formats
2047         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2048         media_nodes = remove_encrypted_media(media_nodes)
2049         if not media_nodes:
2050             return formats
2051
2052         manifest_base_url = get_base_url(manifest)
2053
2054         bootstrap_info = xpath_element(
2055             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2056             'bootstrap info', default=None)
2057
2058         vcodec = None
2059         mime_type = xpath_text(
2060             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2061             'base URL', default=None)
2062         if mime_type and mime_type.startswith('audio/'):
2063             vcodec = 'none'
2064
2065         for i, media_el in enumerate(media_nodes):
2066             tbr = int_or_none(media_el.attrib.get('bitrate'))
2067             width = int_or_none(media_el.attrib.get('width'))
2068             height = int_or_none(media_el.attrib.get('height'))
2069             format_id = join_nonempty(f4m_id, tbr or i)
2070             # If <bootstrapInfo> is present, the specified f4m is a
2071             # stream-level manifest, and only set-level manifests may refer to
2072             # external resources.  See section 11.4 and section 4 of F4M spec
2073             if bootstrap_info is None:
2074                 media_url = None
2075                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2076                 if manifest_version == '2.0':
2077                     media_url = media_el.attrib.get('href')
2078                 if media_url is None:
2079                     media_url = media_el.attrib.get('url')
2080                 if not media_url:
2081                     continue
2082                 manifest_url = (
2083                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2084                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2085                 # If media_url is itself a f4m manifest do the recursive extraction
2086                 # since bitrates in parent manifest (this one) and media_url manifest
2087                 # may differ leading to inability to resolve the format by requested
2088                 # bitrate in f4m downloader
2089                 ext = determine_ext(manifest_url)
2090                 if ext == 'f4m':
2091                     f4m_formats = self._extract_f4m_formats(
2092                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2093                         transform_source=transform_source, fatal=fatal)
2094                     # Sometimes stream-level manifest contains single media entry that
2095                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2096                     # At the same time parent's media entry in set-level manifest may
2097                     # contain it. We will copy it from parent in such cases.
2098                     if len(f4m_formats) == 1:
2099                         f = f4m_formats[0]
2100                         f.update({
2101                             'tbr': f.get('tbr') or tbr,
2102                             'width': f.get('width') or width,
2103                             'height': f.get('height') or height,
2104                             'format_id': f.get('format_id') if not tbr else format_id,
2105                             'vcodec': vcodec,
2106                         })
2107                     formats.extend(f4m_formats)
2108                     continue
2109                 elif ext == 'm3u8':
2110                     formats.extend(self._extract_m3u8_formats(
2111                         manifest_url, video_id, 'mp4', preference=preference,
2112                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2113                     continue
2114             formats.append({
2115                 'format_id': format_id,
2116                 'url': manifest_url,
2117                 'manifest_url': manifest_url,
2118                 'ext': 'flv' if bootstrap_info is not None else None,
2119                 'protocol': 'f4m',
2120                 'tbr': tbr,
2121                 'width': width,
2122                 'height': height,
2123                 'vcodec': vcodec,
2124                 'preference': preference,
2125                 'quality': quality,
2126             })
2127         return formats
2128
2129     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2130         return {
2131             'format_id': join_nonempty(m3u8_id, 'meta'),
2132             'url': m3u8_url,
2133             'ext': ext,
2134             'protocol': 'm3u8',
2135             'preference': preference - 100 if preference else -100,
2136             'quality': quality,
2137             'resolution': 'multiple',
2138             'format_note': 'Quality selection URL',
2139         }
2140
2141     def _report_ignoring_subs(self, name):
2142         self.report_warning(bug_reports_message(
2143             f'Ignoring subtitle tracks found in the {name} manifest; '
2144             'if any subtitle tracks are missing,'
2145         ), only_once=True)
2146
2147     def _extract_m3u8_formats(self, *args, **kwargs):
2148         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2149         if subs:
2150             self._report_ignoring_subs('HLS')
2151         return fmts
2152
2153     def _extract_m3u8_formats_and_subtitles(
2154             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2155             preference=None, quality=None, m3u8_id=None, note=None,
2156             errnote=None, fatal=True, live=False, data=None, headers={},
2157             query={}):
2158
2159         res = self._download_webpage_handle(
2160             m3u8_url, video_id,
2161             note='Downloading m3u8 information' if note is None else note,
2162             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2163             fatal=fatal, data=data, headers=headers, query=query)
2164
2165         if res is False:
2166             return [], {}
2167
2168         m3u8_doc, urlh = res
2169         m3u8_url = urlh.geturl()
2170
2171         return self._parse_m3u8_formats_and_subtitles(
2172             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2173             preference=preference, quality=quality, m3u8_id=m3u8_id,
2174             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2175             headers=headers, query=query, video_id=video_id)
2176
2177     def _parse_m3u8_formats_and_subtitles(
2178             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2179             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2180             errnote=None, fatal=True, data=None, headers={}, query={},
2181             video_id=None):
2182         formats, subtitles = [], {}
2183
2184         has_drm = re.search('|'.join([
2185             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2186             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2187         ]), m3u8_doc)
2188
2189         def format_url(url):
2190             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2191
2192         if self.get_param('hls_split_discontinuity', False):
2193             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2194                 if not m3u8_doc:
2195                     if not manifest_url:
2196                         return []
2197                     m3u8_doc = self._download_webpage(
2198                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2199                         note=False, errnote='Failed to download m3u8 playlist information')
2200                     if m3u8_doc is False:
2201                         return []
2202                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2203
2204         else:
2205             def _extract_m3u8_playlist_indices(*args, **kwargs):
2206                 return [None]
2207
2208         # References:
2209         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2210         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2211         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2212
2213         # We should try extracting formats only from master playlists [1, 4.3.4],
2214         # i.e. playlists that describe available qualities. On the other hand
2215         # media playlists [1, 4.3.3] should be returned as is since they contain
2216         # just the media without qualities renditions.
2217         # Fortunately, master playlist can be easily distinguished from media
2218         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2219         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2220         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2221         # media playlist and MUST NOT appear in master playlist thus we can
2222         # clearly detect media playlist with this criterion.
2223
2224         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2225             formats = [{
2226                 'format_id': join_nonempty(m3u8_id, idx),
2227                 'format_index': idx,
2228                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2229                 'ext': ext,
2230                 'protocol': entry_protocol,
2231                 'preference': preference,
2232                 'quality': quality,
2233                 'has_drm': has_drm,
2234             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2235
2236             return formats, subtitles
2237
2238         groups = {}
2239         last_stream_inf = {}
2240
2241         def extract_media(x_media_line):
2242             media = parse_m3u8_attributes(x_media_line)
2243             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2244             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2245             if not (media_type and group_id and name):
2246                 return
2247             groups.setdefault(group_id, []).append(media)
2248             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2249             if media_type == 'SUBTITLES':
2250                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2251                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2252                 # However, lack of URI has been spotted in the wild.
2253                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2254                 if not media.get('URI'):
2255                     return
2256                 url = format_url(media['URI'])
2257                 sub_info = {
2258                     'url': url,
2259                     'ext': determine_ext(url),
2260                 }
2261                 if sub_info['ext'] == 'm3u8':
2262                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2263                     # files may contain is WebVTT:
2264                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2265                     sub_info['ext'] = 'vtt'
2266                     sub_info['protocol'] = 'm3u8_native'
2267                 lang = media.get('LANGUAGE') or 'und'
2268                 subtitles.setdefault(lang, []).append(sub_info)
2269             if media_type not in ('VIDEO', 'AUDIO'):
2270                 return
2271             media_url = media.get('URI')
2272             if media_url:
2273                 manifest_url = format_url(media_url)
2274                 formats.extend({
2275                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2276                     'format_note': name,
2277                     'format_index': idx,
2278                     'url': manifest_url,
2279                     'manifest_url': m3u8_url,
2280                     'language': media.get('LANGUAGE'),
2281                     'ext': ext,
2282                     'protocol': entry_protocol,
2283                     'preference': preference,
2284                     'quality': quality,
2285                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2286                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2287
2288         def build_stream_name():
2289             # Despite specification does not mention NAME attribute for
2290             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2291             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2292             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2293             stream_name = last_stream_inf.get('NAME')
2294             if stream_name:
2295                 return stream_name
2296             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2297             # from corresponding rendition group
2298             stream_group_id = last_stream_inf.get('VIDEO')
2299             if not stream_group_id:
2300                 return
2301             stream_group = groups.get(stream_group_id)
2302             if not stream_group:
2303                 return stream_group_id
2304             rendition = stream_group[0]
2305             return rendition.get('NAME') or stream_group_id
2306
2307         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2308         # chance to detect video only formats when EXT-X-STREAM-INF tags
2309         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2310         for line in m3u8_doc.splitlines():
2311             if line.startswith('#EXT-X-MEDIA:'):
2312                 extract_media(line)
2313
2314         for line in m3u8_doc.splitlines():
2315             if line.startswith('#EXT-X-STREAM-INF:'):
2316                 last_stream_inf = parse_m3u8_attributes(line)
2317             elif line.startswith('#') or not line.strip():
2318                 continue
2319             else:
2320                 tbr = float_or_none(
2321                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2322                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2323                 manifest_url = format_url(line.strip())
2324
2325                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2326                     format_id = [m3u8_id, None, idx]
2327                     # Bandwidth of live streams may differ over time thus making
2328                     # format_id unpredictable. So it's better to keep provided
2329                     # format_id intact.
2330                     if not live:
2331                         stream_name = build_stream_name()
2332                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2333                     f = {
2334                         'format_id': join_nonempty(*format_id),
2335                         'format_index': idx,
2336                         'url': manifest_url,
2337                         'manifest_url': m3u8_url,
2338                         'tbr': tbr,
2339                         'ext': ext,
2340                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2341                         'protocol': entry_protocol,
2342                         'preference': preference,
2343                         'quality': quality,
2344                     }
2345                     resolution = last_stream_inf.get('RESOLUTION')
2346                     if resolution:
2347                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2348                         if mobj:
2349                             f['width'] = int(mobj.group('width'))
2350                             f['height'] = int(mobj.group('height'))
2351                     # Unified Streaming Platform
2352                     mobj = re.search(
2353                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2354                     if mobj:
2355                         abr, vbr = mobj.groups()
2356                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2357                         f.update({
2358                             'vbr': vbr,
2359                             'abr': abr,
2360                         })
2361                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2362                     f.update(codecs)
2363                     audio_group_id = last_stream_inf.get('AUDIO')
2364                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2365                     # references a rendition group MUST have a CODECS attribute.
2366                     # However, this is not always respected, for example, [2]
2367                     # contains EXT-X-STREAM-INF tag which references AUDIO
2368                     # rendition group but does not have CODECS and despite
2369                     # referencing an audio group it represents a complete
2370                     # (with audio and video) format. So, for such cases we will
2371                     # ignore references to rendition groups and treat them
2372                     # as complete formats.
2373                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2374                         audio_group = groups.get(audio_group_id)
2375                         if audio_group and audio_group[0].get('URI'):
2376                             # TODO: update acodec for audio only formats with
2377                             # the same GROUP-ID
2378                             f['acodec'] = 'none'
2379                     if not f.get('ext'):
2380                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2381                     formats.append(f)
2382
2383                     # for DailyMotion
2384                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2385                     if progressive_uri:
2386                         http_f = f.copy()
2387                         del http_f['manifest_url']
2388                         http_f.update({
2389                             'format_id': f['format_id'].replace('hls-', 'http-'),
2390                             'protocol': 'http',
2391                             'url': progressive_uri,
2392                         })
2393                         formats.append(http_f)
2394
2395                 last_stream_inf = {}
2396         return formats, subtitles
2397
2398     def _extract_m3u8_vod_duration(
2399             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2400
2401         m3u8_vod = self._download_webpage(
2402             m3u8_vod_url, video_id,
2403             note='Downloading m3u8 VOD manifest' if note is None else note,
2404             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2405             fatal=False, data=data, headers=headers, query=query)
2406
2407         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2408
2409     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2410         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2411             return None
2412
2413         return int(sum(
2414             float(line[len('#EXTINF:'):].split(',')[0])
2415             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2416
2417     @staticmethod
2418     def _xpath_ns(path, namespace=None):
2419         if not namespace:
2420             return path
2421         out = []
2422         for c in path.split('/'):
2423             if not c or c == '.':
2424                 out.append(c)
2425             else:
2426                 out.append('{%s}%s' % (namespace, c))
2427         return '/'.join(out)
2428
2429     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2430         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2431         if res is False:
2432             assert not fatal
2433             return [], {}
2434
2435         smil, urlh = res
2436         smil_url = urlh.geturl()
2437
2438         namespace = self._parse_smil_namespace(smil)
2439
2440         fmts = self._parse_smil_formats(
2441             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2442         subs = self._parse_smil_subtitles(
2443             smil, namespace=namespace)
2444
2445         return fmts, subs
2446
2447     def _extract_smil_formats(self, *args, **kwargs):
2448         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2449         if subs:
2450             self._report_ignoring_subs('SMIL')
2451         return fmts
2452
2453     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2454         res = self._download_smil(smil_url, video_id, fatal=fatal)
2455         if res is False:
2456             return {}
2457
2458         smil, urlh = res
2459         smil_url = urlh.geturl()
2460
2461         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2462
2463     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2464         return self._download_xml_handle(
2465             smil_url, video_id, 'Downloading SMIL file',
2466             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2467
2468     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2469         namespace = self._parse_smil_namespace(smil)
2470
2471         formats = self._parse_smil_formats(
2472             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2473         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2474
2475         video_id = os.path.splitext(url_basename(smil_url))[0]
2476         title = None
2477         description = None
2478         upload_date = None
2479         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2480             name = meta.attrib.get('name')
2481             content = meta.attrib.get('content')
2482             if not name or not content:
2483                 continue
2484             if not title and name == 'title':
2485                 title = content
2486             elif not description and name in ('description', 'abstract'):
2487                 description = content
2488             elif not upload_date and name == 'date':
2489                 upload_date = unified_strdate(content)
2490
2491         thumbnails = [{
2492             'id': image.get('type'),
2493             'url': image.get('src'),
2494             'width': int_or_none(image.get('width')),
2495             'height': int_or_none(image.get('height')),
2496         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2497
2498         return {
2499             'id': video_id,
2500             'title': title or video_id,
2501             'description': description,
2502             'upload_date': upload_date,
2503             'thumbnails': thumbnails,
2504             'formats': formats,
2505             'subtitles': subtitles,
2506         }
2507
2508     def _parse_smil_namespace(self, smil):
2509         return self._search_regex(
2510             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2511
2512     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2513         base = smil_url
2514         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2515             b = meta.get('base') or meta.get('httpBase')
2516             if b:
2517                 base = b
2518                 break
2519
2520         formats = []
2521         rtmp_count = 0
2522         http_count = 0
2523         m3u8_count = 0
2524         imgs_count = 0
2525
2526         srcs = set()
2527         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2528         for medium in media:
2529             src = medium.get('src')
2530             if not src or src in srcs:
2531                 continue
2532             srcs.add(src)
2533
2534             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2535             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2536             width = int_or_none(medium.get('width'))
2537             height = int_or_none(medium.get('height'))
2538             proto = medium.get('proto')
2539             ext = medium.get('ext')
2540             src_ext = determine_ext(src)
2541             streamer = medium.get('streamer') or base
2542
2543             if proto == 'rtmp' or streamer.startswith('rtmp'):
2544                 rtmp_count += 1
2545                 formats.append({
2546                     'url': streamer,
2547                     'play_path': src,
2548                     'ext': 'flv',
2549                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2550                     'tbr': bitrate,
2551                     'filesize': filesize,
2552                     'width': width,
2553                     'height': height,
2554                 })
2555                 if transform_rtmp_url:
2556                     streamer, src = transform_rtmp_url(streamer, src)
2557                     formats[-1].update({
2558                         'url': streamer,
2559                         'play_path': src,
2560                     })
2561                 continue
2562
2563             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2564             src_url = src_url.strip()
2565
2566             if proto == 'm3u8' or src_ext == 'm3u8':
2567                 m3u8_formats = self._extract_m3u8_formats(
2568                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2569                 if len(m3u8_formats) == 1:
2570                     m3u8_count += 1
2571                     m3u8_formats[0].update({
2572                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2573                         'tbr': bitrate,
2574                         'width': width,
2575                         'height': height,
2576                     })
2577                 formats.extend(m3u8_formats)
2578             elif src_ext == 'f4m':
2579                 f4m_url = src_url
2580                 if not f4m_params:
2581                     f4m_params = {
2582                         'hdcore': '3.2.0',
2583                         'plugin': 'flowplayer-3.2.0.1',
2584                     }
2585                 f4m_url += '&' if '?' in f4m_url else '?'
2586                 f4m_url += urllib.parse.urlencode(f4m_params)
2587                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2588             elif src_ext == 'mpd':
2589                 formats.extend(self._extract_mpd_formats(
2590                     src_url, video_id, mpd_id='dash', fatal=False))
2591             elif re.search(r'\.ism/[Mm]anifest', src_url):
2592                 formats.extend(self._extract_ism_formats(
2593                     src_url, video_id, ism_id='mss', fatal=False))
2594             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2595                 http_count += 1
2596                 formats.append({
2597                     'url': src_url,
2598                     'ext': ext or src_ext or 'flv',
2599                     'format_id': 'http-%d' % (bitrate or http_count),
2600                     'tbr': bitrate,
2601                     'filesize': filesize,
2602                     'width': width,
2603                     'height': height,
2604                 })
2605
2606         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2607             src = medium.get('src')
2608             if not src or src in srcs:
2609                 continue
2610             srcs.add(src)
2611
2612             imgs_count += 1
2613             formats.append({
2614                 'format_id': 'imagestream-%d' % (imgs_count),
2615                 'url': src,
2616                 'ext': mimetype2ext(medium.get('type')),
2617                 'acodec': 'none',
2618                 'vcodec': 'none',
2619                 'width': int_or_none(medium.get('width')),
2620                 'height': int_or_none(medium.get('height')),
2621                 'format_note': 'SMIL storyboards',
2622             })
2623
2624         return formats
2625
2626     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2627         urls = []
2628         subtitles = {}
2629         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2630             src = textstream.get('src')
2631             if not src or src in urls:
2632                 continue
2633             urls.append(src)
2634             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2635             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2636             subtitles.setdefault(lang, []).append({
2637                 'url': src,
2638                 'ext': ext,
2639             })
2640         return subtitles
2641
2642     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2643         res = self._download_xml_handle(
2644             xspf_url, playlist_id, 'Downloading xpsf playlist',
2645             'Unable to download xspf manifest', fatal=fatal)
2646         if res is False:
2647             return []
2648
2649         xspf, urlh = res
2650         xspf_url = urlh.geturl()
2651
2652         return self._parse_xspf(
2653             xspf, playlist_id, xspf_url=xspf_url,
2654             xspf_base_url=base_url(xspf_url))
2655
2656     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2657         NS_MAP = {
2658             'xspf': 'http://xspf.org/ns/0/',
2659             's1': 'http://static.streamone.nl/player/ns/0',
2660         }
2661
2662         entries = []
2663         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2664             title = xpath_text(
2665                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2666             description = xpath_text(
2667                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2668             thumbnail = xpath_text(
2669                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2670             duration = float_or_none(
2671                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2672
2673             formats = []
2674             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2675                 format_url = urljoin(xspf_base_url, location.text)
2676                 if not format_url:
2677                     continue
2678                 formats.append({
2679                     'url': format_url,
2680                     'manifest_url': xspf_url,
2681                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2682                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2683                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2684                 })
2685             self._sort_formats(formats)
2686
2687             entries.append({
2688                 'id': playlist_id,
2689                 'title': title,
2690                 'description': description,
2691                 'thumbnail': thumbnail,
2692                 'duration': duration,
2693                 'formats': formats,
2694             })
2695         return entries
2696
2697     def _extract_mpd_formats(self, *args, **kwargs):
2698         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2699         if subs:
2700             self._report_ignoring_subs('DASH')
2701         return fmts
2702
2703     def _extract_mpd_formats_and_subtitles(
2704             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2705             fatal=True, data=None, headers={}, query={}):
2706         res = self._download_xml_handle(
2707             mpd_url, video_id,
2708             note='Downloading MPD manifest' if note is None else note,
2709             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2710             fatal=fatal, data=data, headers=headers, query=query)
2711         if res is False:
2712             return [], {}
2713         mpd_doc, urlh = res
2714         if mpd_doc is None:
2715             return [], {}
2716
2717         # We could have been redirected to a new url when we retrieved our mpd file.
2718         mpd_url = urlh.geturl()
2719         mpd_base_url = base_url(mpd_url)
2720
2721         return self._parse_mpd_formats_and_subtitles(
2722             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2723
2724     def _parse_mpd_formats(self, *args, **kwargs):
2725         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2726         if subs:
2727             self._report_ignoring_subs('DASH')
2728         return fmts
2729
2730     def _parse_mpd_formats_and_subtitles(
2731             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2732         """
2733         Parse formats from MPD manifest.
2734         References:
2735          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2736             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2737          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2738         """
2739         if not self.get_param('dynamic_mpd', True):
2740             if mpd_doc.get('type') == 'dynamic':
2741                 return [], {}
2742
2743         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2744
2745         def _add_ns(path):
2746             return self._xpath_ns(path, namespace)
2747
2748         def is_drm_protected(element):
2749             return element.find(_add_ns('ContentProtection')) is not None
2750
2751         def extract_multisegment_info(element, ms_parent_info):
2752             ms_info = ms_parent_info.copy()
2753
2754             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2755             # common attributes and elements.  We will only extract relevant
2756             # for us.
2757             def extract_common(source):
2758                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2759                 if segment_timeline is not None:
2760                     s_e = segment_timeline.findall(_add_ns('S'))
2761                     if s_e:
2762                         ms_info['total_number'] = 0
2763                         ms_info['s'] = []
2764                         for s in s_e:
2765                             r = int(s.get('r', 0))
2766                             ms_info['total_number'] += 1 + r
2767                             ms_info['s'].append({
2768                                 't': int(s.get('t', 0)),
2769                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2770                                 'd': int(s.attrib['d']),
2771                                 'r': r,
2772                             })
2773                 start_number = source.get('startNumber')
2774                 if start_number:
2775                     ms_info['start_number'] = int(start_number)
2776                 timescale = source.get('timescale')
2777                 if timescale:
2778                     ms_info['timescale'] = int(timescale)
2779                 segment_duration = source.get('duration')
2780                 if segment_duration:
2781                     ms_info['segment_duration'] = float(segment_duration)
2782
2783             def extract_Initialization(source):
2784                 initialization = source.find(_add_ns('Initialization'))
2785                 if initialization is not None:
2786                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2787
2788             segment_list = element.find(_add_ns('SegmentList'))
2789             if segment_list is not None:
2790                 extract_common(segment_list)
2791                 extract_Initialization(segment_list)
2792                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2793                 if segment_urls_e:
2794                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2795             else:
2796                 segment_template = element.find(_add_ns('SegmentTemplate'))
2797                 if segment_template is not None:
2798                     extract_common(segment_template)
2799                     media = segment_template.get('media')
2800                     if media:
2801                         ms_info['media'] = media
2802                     initialization = segment_template.get('initialization')
2803                     if initialization:
2804                         ms_info['initialization'] = initialization
2805                     else:
2806                         extract_Initialization(segment_template)
2807             return ms_info
2808
2809         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2810         formats, subtitles = [], {}
2811         stream_numbers = collections.defaultdict(int)
2812         for period in mpd_doc.findall(_add_ns('Period')):
2813             period_duration = parse_duration(period.get('duration')) or mpd_duration
2814             period_ms_info = extract_multisegment_info(period, {
2815                 'start_number': 1,
2816                 'timescale': 1,
2817             })
2818             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2819                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2820                 for representation in adaptation_set.findall(_add_ns('Representation')):
2821                     representation_attrib = adaptation_set.attrib.copy()
2822                     representation_attrib.update(representation.attrib)
2823                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2824                     mime_type = representation_attrib['mimeType']
2825                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2826
2827                     codec_str = representation_attrib.get('codecs', '')
2828                     # Some kind of binary subtitle found in some youtube livestreams
2829                     if mime_type == 'application/x-rawcc':
2830                         codecs = {'scodec': codec_str}
2831                     else:
2832                         codecs = parse_codecs(codec_str)
2833                     if content_type not in ('video', 'audio', 'text'):
2834                         if mime_type == 'image/jpeg':
2835                             content_type = mime_type
2836                         elif codecs.get('vcodec', 'none') != 'none':
2837                             content_type = 'video'
2838                         elif codecs.get('acodec', 'none') != 'none':
2839                             content_type = 'audio'
2840                         elif codecs.get('scodec', 'none') != 'none':
2841                             content_type = 'text'
2842                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2843                             content_type = 'text'
2844                         else:
2845                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2846                             continue
2847
2848                     base_url = ''
2849                     for element in (representation, adaptation_set, period, mpd_doc):
2850                         base_url_e = element.find(_add_ns('BaseURL'))
2851                         if try_call(lambda: base_url_e.text) is not None:
2852                             base_url = base_url_e.text + base_url
2853                             if re.match(r'^https?://', base_url):
2854                                 break
2855                     if mpd_base_url and base_url.startswith('/'):
2856                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2857                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2858                         if not mpd_base_url.endswith('/'):
2859                             mpd_base_url += '/'
2860                         base_url = mpd_base_url + base_url
2861                     representation_id = representation_attrib.get('id')
2862                     lang = representation_attrib.get('lang')
2863                     url_el = representation.find(_add_ns('BaseURL'))
2864                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2865                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2866                     if representation_id is not None:
2867                         format_id = representation_id
2868                     else:
2869                         format_id = content_type
2870                     if mpd_id:
2871                         format_id = mpd_id + '-' + format_id
2872                     if content_type in ('video', 'audio'):
2873                         f = {
2874                             'format_id': format_id,
2875                             'manifest_url': mpd_url,
2876                             'ext': mimetype2ext(mime_type),
2877                             'width': int_or_none(representation_attrib.get('width')),
2878                             'height': int_or_none(representation_attrib.get('height')),
2879                             'tbr': float_or_none(bandwidth, 1000),
2880                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2881                             'fps': int_or_none(representation_attrib.get('frameRate')),
2882                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2883                             'format_note': 'DASH %s' % content_type,
2884                             'filesize': filesize,
2885                             'container': mimetype2ext(mime_type) + '_dash',
2886                             **codecs
2887                         }
2888                     elif content_type == 'text':
2889                         f = {
2890                             'ext': mimetype2ext(mime_type),
2891                             'manifest_url': mpd_url,
2892                             'filesize': filesize,
2893                         }
2894                     elif content_type == 'image/jpeg':
2895                         # See test case in VikiIE
2896                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2897                         f = {
2898                             'format_id': format_id,
2899                             'ext': 'mhtml',
2900                             'manifest_url': mpd_url,
2901                             'format_note': 'DASH storyboards (jpeg)',
2902                             'acodec': 'none',
2903                             'vcodec': 'none',
2904                         }
2905                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2906                         f['has_drm'] = True
2907                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2908
2909                     def prepare_template(template_name, identifiers):
2910                         tmpl = representation_ms_info[template_name]
2911                         # First of, % characters outside $...$ templates
2912                         # must be escaped by doubling for proper processing
2913                         # by % operator string formatting used further (see
2914                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2915                         t = ''
2916                         in_template = False
2917                         for c in tmpl:
2918                             t += c
2919                             if c == '$':
2920                                 in_template = not in_template
2921                             elif c == '%' and not in_template:
2922                                 t += c
2923                         # Next, $...$ templates are translated to their
2924                         # %(...) counterparts to be used with % operator
2925                         if representation_id is not None:
2926                             t = t.replace('$RepresentationID$', representation_id)
2927                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2928                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2929                         t.replace('$$', '$')
2930                         return t
2931
2932                     # @initialization is a regular template like @media one
2933                     # so it should be handled just the same way (see
2934                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2935                     if 'initialization' in representation_ms_info:
2936                         initialization_template = prepare_template(
2937                             'initialization',
2938                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2939                             # $Time$ shall not be included for @initialization thus
2940                             # only $Bandwidth$ remains
2941                             ('Bandwidth', ))
2942                         representation_ms_info['initialization_url'] = initialization_template % {
2943                             'Bandwidth': bandwidth,
2944                         }
2945
2946                     def location_key(location):
2947                         return 'url' if re.match(r'^https?://', location) else 'path'
2948
2949                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2950
2951                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2952                         media_location_key = location_key(media_template)
2953
2954                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2955                         # can't be used at the same time
2956                         if '%(Number' in media_template and 's' not in representation_ms_info:
2957                             segment_duration = None
2958                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2959                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2960                                 representation_ms_info['total_number'] = int(math.ceil(
2961                                     float_or_none(period_duration, segment_duration, default=0)))
2962                             representation_ms_info['fragments'] = [{
2963                                 media_location_key: media_template % {
2964                                     'Number': segment_number,
2965                                     'Bandwidth': bandwidth,
2966                                 },
2967                                 'duration': segment_duration,
2968                             } for segment_number in range(
2969                                 representation_ms_info['start_number'],
2970                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2971                         else:
2972                             # $Number*$ or $Time$ in media template with S list available
2973                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2974                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2975                             representation_ms_info['fragments'] = []
2976                             segment_time = 0
2977                             segment_d = None
2978                             segment_number = representation_ms_info['start_number']
2979
2980                             def add_segment_url():
2981                                 segment_url = media_template % {
2982                                     'Time': segment_time,
2983                                     'Bandwidth': bandwidth,
2984                                     'Number': segment_number,
2985                                 }
2986                                 representation_ms_info['fragments'].append({
2987                                     media_location_key: segment_url,
2988                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2989                                 })
2990
2991                             for num, s in enumerate(representation_ms_info['s']):
2992                                 segment_time = s.get('t') or segment_time
2993                                 segment_d = s['d']
2994                                 add_segment_url()
2995                                 segment_number += 1
2996                                 for r in range(s.get('r', 0)):
2997                                     segment_time += segment_d
2998                                     add_segment_url()
2999                                     segment_number += 1
3000                                 segment_time += segment_d
3001                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3002                         # No media template
3003                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
3004                         # or any YouTube dashsegments video
3005                         fragments = []
3006                         segment_index = 0
3007                         timescale = representation_ms_info['timescale']
3008                         for s in representation_ms_info['s']:
3009                             duration = float_or_none(s['d'], timescale)
3010                             for r in range(s.get('r', 0) + 1):
3011                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3012                                 fragments.append({
3013                                     location_key(segment_uri): segment_uri,
3014                                     'duration': duration,
3015                                 })
3016                                 segment_index += 1
3017                         representation_ms_info['fragments'] = fragments
3018                     elif 'segment_urls' in representation_ms_info:
3019                         # Segment URLs with no SegmentTimeline
3020                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3021                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3022                         fragments = []
3023                         segment_duration = float_or_none(
3024                             representation_ms_info['segment_duration'],
3025                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3026                         for segment_url in representation_ms_info['segment_urls']:
3027                             fragment = {
3028                                 location_key(segment_url): segment_url,
3029                             }
3030                             if segment_duration:
3031                                 fragment['duration'] = segment_duration
3032                             fragments.append(fragment)
3033                         representation_ms_info['fragments'] = fragments
3034                     # If there is a fragments key available then we correctly recognized fragmented media.
3035                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3036                     # assumption is not necessarily correct since we may simply have no support for
3037                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3038                     if 'fragments' in representation_ms_info:
3039                         f.update({
3040                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3041                             'url': mpd_url or base_url,
3042                             'fragment_base_url': base_url,
3043                             'fragments': [],
3044                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3045                         })
3046                         if 'initialization_url' in representation_ms_info:
3047                             initialization_url = representation_ms_info['initialization_url']
3048                             if not f.get('url'):
3049                                 f['url'] = initialization_url
3050                             f['fragments'].append({location_key(initialization_url): initialization_url})
3051                         f['fragments'].extend(representation_ms_info['fragments'])
3052                         if not period_duration:
3053                             period_duration = try_get(
3054                                 representation_ms_info,
3055                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3056                     else:
3057                         # Assuming direct URL to unfragmented media.
3058                         f['url'] = base_url
3059                     if content_type in ('video', 'audio', 'image/jpeg'):
3060                         f['manifest_stream_number'] = stream_numbers[f['url']]
3061                         stream_numbers[f['url']] += 1
3062                         formats.append(f)
3063                     elif content_type == 'text':
3064                         subtitles.setdefault(lang or 'und', []).append(f)
3065
3066         return formats, subtitles
3067
3068     def _extract_ism_formats(self, *args, **kwargs):
3069         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3070         if subs:
3071             self._report_ignoring_subs('ISM')
3072         return fmts
3073
3074     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3075         res = self._download_xml_handle(
3076             ism_url, video_id,
3077             note='Downloading ISM manifest' if note is None else note,
3078             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3079             fatal=fatal, data=data, headers=headers, query=query)
3080         if res is False:
3081             return [], {}
3082         ism_doc, urlh = res
3083         if ism_doc is None:
3084             return [], {}
3085
3086         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3087
3088     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3089         """
3090         Parse formats from ISM manifest.
3091         References:
3092          1. [MS-SSTR]: Smooth Streaming Protocol,
3093             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3094         """
3095         if ism_doc.get('IsLive') == 'TRUE':
3096             return [], {}
3097
3098         duration = int(ism_doc.attrib['Duration'])
3099         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3100
3101         formats = []
3102         subtitles = {}
3103         for stream in ism_doc.findall('StreamIndex'):
3104             stream_type = stream.get('Type')
3105             if stream_type not in ('video', 'audio', 'text'):
3106                 continue
3107             url_pattern = stream.attrib['Url']
3108             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3109             stream_name = stream.get('Name')
3110             stream_language = stream.get('Language', 'und')
3111             for track in stream.findall('QualityLevel'):
3112                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3113                 # TODO: add support for WVC1 and WMAP
3114                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3115                     self.report_warning('%s is not a supported codec' % fourcc)
3116                     continue
3117                 tbr = int(track.attrib['Bitrate']) // 1000
3118                 # [1] does not mention Width and Height attributes. However,
3119                 # they're often present while MaxWidth and MaxHeight are
3120                 # missing, so should be used as fallbacks
3121                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3122                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3123                 sampling_rate = int_or_none(track.get('SamplingRate'))
3124
3125                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3126                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3127
3128                 fragments = []
3129                 fragment_ctx = {
3130                     'time': 0,
3131                 }
3132                 stream_fragments = stream.findall('c')
3133                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3134                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3135                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3136                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3137                     if not fragment_ctx['duration']:
3138                         try:
3139                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3140                         except IndexError:
3141                             next_fragment_time = duration
3142                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3143                     for _ in range(fragment_repeat):
3144                         fragments.append({
3145                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3146                             'duration': fragment_ctx['duration'] / stream_timescale,
3147                         })
3148                         fragment_ctx['time'] += fragment_ctx['duration']
3149
3150                 if stream_type == 'text':
3151                     subtitles.setdefault(stream_language, []).append({
3152                         'ext': 'ismt',
3153                         'protocol': 'ism',
3154                         'url': ism_url,
3155                         'manifest_url': ism_url,
3156                         'fragments': fragments,
3157                         '_download_params': {
3158                             'stream_type': stream_type,
3159                             'duration': duration,
3160                             'timescale': stream_timescale,
3161                             'fourcc': fourcc,
3162                             'language': stream_language,
3163                             'codec_private_data': track.get('CodecPrivateData'),
3164                         }
3165                     })
3166                 elif stream_type in ('video', 'audio'):
3167                     formats.append({
3168                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3169                         'url': ism_url,
3170                         'manifest_url': ism_url,
3171                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3172                         'width': width,
3173                         'height': height,
3174                         'tbr': tbr,
3175                         'asr': sampling_rate,
3176                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3177                         'acodec': 'none' if stream_type == 'video' else fourcc,
3178                         'protocol': 'ism',
3179                         'fragments': fragments,
3180                         'has_drm': ism_doc.find('Protection') is not None,
3181                         '_download_params': {
3182                             'stream_type': stream_type,
3183                             'duration': duration,
3184                             'timescale': stream_timescale,
3185                             'width': width or 0,
3186                             'height': height or 0,
3187                             'fourcc': fourcc,
3188                             'language': stream_language,
3189                             'codec_private_data': track.get('CodecPrivateData'),
3190                             'sampling_rate': sampling_rate,
3191                             'channels': int_or_none(track.get('Channels', 2)),
3192                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3193                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3194                         },
3195                     })
3196         return formats, subtitles
3197
3198     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3199         def absolute_url(item_url):
3200             return urljoin(base_url, item_url)
3201
3202         def parse_content_type(content_type):
3203             if not content_type:
3204                 return {}
3205             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3206             if ctr:
3207                 mimetype, codecs = ctr.groups()
3208                 f = parse_codecs(codecs)
3209                 f['ext'] = mimetype2ext(mimetype)
3210                 return f
3211             return {}
3212
3213         def _media_formats(src, cur_media_type, type_info=None):
3214             type_info = type_info or {}
3215             full_url = absolute_url(src)
3216             ext = type_info.get('ext') or determine_ext(full_url)
3217             if ext == 'm3u8':
3218                 is_plain_url = False
3219                 formats = self._extract_m3u8_formats(
3220                     full_url, video_id, ext='mp4',
3221                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3222                     preference=preference, quality=quality, fatal=False)
3223             elif ext == 'mpd':
3224                 is_plain_url = False
3225                 formats = self._extract_mpd_formats(
3226                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3227             else:
3228                 is_plain_url = True
3229                 formats = [{
3230                     'url': full_url,
3231                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3232                     'ext': ext,
3233                 }]
3234             return is_plain_url, formats
3235
3236         entries = []
3237         # amp-video and amp-audio are very similar to their HTML5 counterparts
3238         # so we will include them right here (see
3239         # https://www.ampproject.org/docs/reference/components/amp-video)
3240         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3241         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3242         media_tags = [(media_tag, media_tag_name, media_type, '')
3243                       for media_tag, media_tag_name, media_type
3244                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3245         media_tags.extend(re.findall(
3246             # We only allow video|audio followed by a whitespace or '>'.
3247             # Allowing more characters may end up in significant slow down (see
3248             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3249             # http://www.porntrex.com/maps/videositemap.xml).
3250             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3251         for media_tag, _, media_type, media_content in media_tags:
3252             media_info = {
3253                 'formats': [],
3254                 'subtitles': {},
3255             }
3256             media_attributes = extract_attributes(media_tag)
3257             src = strip_or_none(media_attributes.get('src'))
3258             if src:
3259                 f = parse_content_type(media_attributes.get('type'))
3260                 _, formats = _media_formats(src, media_type, f)
3261                 media_info['formats'].extend(formats)
3262             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3263             if media_content:
3264                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3265                     s_attr = extract_attributes(source_tag)
3266                     # data-video-src and data-src are non standard but seen
3267                     # several times in the wild
3268                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3269                     if not src:
3270                         continue
3271                     f = parse_content_type(s_attr.get('type'))
3272                     is_plain_url, formats = _media_formats(src, media_type, f)
3273                     if is_plain_url:
3274                         # width, height, res, label and title attributes are
3275                         # all not standard but seen several times in the wild
3276                         labels = [
3277                             s_attr.get(lbl)
3278                             for lbl in ('label', 'title')
3279                             if str_or_none(s_attr.get(lbl))
3280                         ]
3281                         width = int_or_none(s_attr.get('width'))
3282                         height = (int_or_none(s_attr.get('height'))
3283                                   or int_or_none(s_attr.get('res')))
3284                         if not width or not height:
3285                             for lbl in labels:
3286                                 resolution = parse_resolution(lbl)
3287                                 if not resolution:
3288                                     continue
3289                                 width = width or resolution.get('width')
3290                                 height = height or resolution.get('height')
3291                         for lbl in labels:
3292                             tbr = parse_bitrate(lbl)
3293                             if tbr:
3294                                 break
3295                         else:
3296                             tbr = None
3297                         f.update({
3298                             'width': width,
3299                             'height': height,
3300                             'tbr': tbr,
3301                             'format_id': s_attr.get('label') or s_attr.get('title'),
3302                         })
3303                         f.update(formats[0])
3304                         media_info['formats'].append(f)
3305                     else:
3306                         media_info['formats'].extend(formats)
3307                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3308                     track_attributes = extract_attributes(track_tag)
3309                     kind = track_attributes.get('kind')
3310                     if not kind or kind in ('subtitles', 'captions'):
3311                         src = strip_or_none(track_attributes.get('src'))
3312                         if not src:
3313                             continue
3314                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3315                         media_info['subtitles'].setdefault(lang, []).append({
3316                             'url': absolute_url(src),
3317                         })
3318             for f in media_info['formats']:
3319                 f.setdefault('http_headers', {})['Referer'] = base_url
3320             if media_info['formats'] or media_info['subtitles']:
3321                 entries.append(media_info)
3322         return entries
3323
3324     def _extract_akamai_formats(self, *args, **kwargs):
3325         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3326         if subs:
3327             self._report_ignoring_subs('akamai')
3328         return fmts
3329
3330     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3331         signed = 'hdnea=' in manifest_url
3332         if not signed:
3333             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3334             manifest_url = re.sub(
3335                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3336                 '', manifest_url).strip('?')
3337
3338         formats = []
3339         subtitles = {}
3340
3341         hdcore_sign = 'hdcore=3.7.0'
3342         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3343         hds_host = hosts.get('hds')
3344         if hds_host:
3345             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3346         if 'hdcore=' not in f4m_url:
3347             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3348         f4m_formats = self._extract_f4m_formats(
3349             f4m_url, video_id, f4m_id='hds', fatal=False)
3350         for entry in f4m_formats:
3351             entry.update({'extra_param_to_segment_url': hdcore_sign})
3352         formats.extend(f4m_formats)
3353
3354         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3355         hls_host = hosts.get('hls')
3356         if hls_host:
3357             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3358         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3359             m3u8_url, video_id, 'mp4', 'm3u8_native',
3360             m3u8_id='hls', fatal=False)
3361         formats.extend(m3u8_formats)
3362         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3363
3364         http_host = hosts.get('http')
3365         if http_host and m3u8_formats and not signed:
3366             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3367             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3368             qualities_length = len(qualities)
3369             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3370                 i = 0
3371                 for f in m3u8_formats:
3372                     if f['vcodec'] != 'none':
3373                         for protocol in ('http', 'https'):
3374                             http_f = f.copy()
3375                             del http_f['manifest_url']
3376                             http_url = re.sub(
3377                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3378                             http_f.update({
3379                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3380                                 'url': http_url,
3381                                 'protocol': protocol,
3382                             })
3383                             formats.append(http_f)
3384                         i += 1
3385
3386         return formats, subtitles
3387
3388     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3389         query = urllib.parse.urlparse(url).query
3390         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3391         mobj = re.search(
3392             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3393         url_base = mobj.group('url')
3394         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3395         formats = []
3396
3397         def manifest_url(manifest):
3398             m_url = f'{http_base_url}/{manifest}'
3399             if query:
3400                 m_url += '?%s' % query
3401             return m_url
3402
3403         if 'm3u8' not in skip_protocols:
3404             formats.extend(self._extract_m3u8_formats(
3405                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3406                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3407         if 'f4m' not in skip_protocols:
3408             formats.extend(self._extract_f4m_formats(
3409                 manifest_url('manifest.f4m'),
3410                 video_id, f4m_id='hds', fatal=False))
3411         if 'dash' not in skip_protocols:
3412             formats.extend(self._extract_mpd_formats(
3413                 manifest_url('manifest.mpd'),
3414                 video_id, mpd_id='dash', fatal=False))
3415         if re.search(r'(?:/smil:|\.smil)', url_base):
3416             if 'smil' not in skip_protocols:
3417                 rtmp_formats = self._extract_smil_formats(
3418                     manifest_url('jwplayer.smil'),
3419                     video_id, fatal=False)
3420                 for rtmp_format in rtmp_formats:
3421                     rtsp_format = rtmp_format.copy()
3422                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3423                     del rtsp_format['play_path']
3424                     del rtsp_format['ext']
3425                     rtsp_format.update({
3426                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3427                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3428                         'protocol': 'rtsp',
3429                     })
3430                     formats.extend([rtmp_format, rtsp_format])
3431         else:
3432             for protocol in ('rtmp', 'rtsp'):
3433                 if protocol not in skip_protocols:
3434                     formats.append({
3435                         'url': f'{protocol}:{url_base}',
3436                         'format_id': protocol,
3437                         'protocol': protocol,
3438                     })
3439         return formats
3440
3441     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3442         mobj = re.search(
3443             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3444             webpage)
3445         if mobj:
3446             try:
3447                 jwplayer_data = self._parse_json(mobj.group('options'),
3448                                                  video_id=video_id,
3449                                                  transform_source=transform_source)
3450             except ExtractorError:
3451                 pass
3452             else:
3453                 if isinstance(jwplayer_data, dict):
3454                     return jwplayer_data
3455
3456     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3457         jwplayer_data = self._find_jwplayer_data(
3458             webpage, video_id, transform_source=js_to_json)
3459         return self._parse_jwplayer_data(
3460             jwplayer_data, video_id, *args, **kwargs)
3461
3462     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3463                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3464         # JWPlayer backward compatibility: flattened playlists
3465         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3466         if 'playlist' not in jwplayer_data:
3467             jwplayer_data = {'playlist': [jwplayer_data]}
3468
3469         entries = []
3470
3471         # JWPlayer backward compatibility: single playlist item
3472         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3473         if not isinstance(jwplayer_data['playlist'], list):
3474             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3475
3476         for video_data in jwplayer_data['playlist']:
3477             # JWPlayer backward compatibility: flattened sources
3478             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3479             if 'sources' not in video_data:
3480                 video_data['sources'] = [video_data]
3481
3482             this_video_id = video_id or video_data['mediaid']
3483
3484             formats = self._parse_jwplayer_formats(
3485                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3486                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3487
3488             subtitles = {}
3489             tracks = video_data.get('tracks')
3490             if tracks and isinstance(tracks, list):
3491                 for track in tracks:
3492                     if not isinstance(track, dict):
3493                         continue
3494                     track_kind = track.get('kind')
3495                     if not track_kind or not isinstance(track_kind, str):
3496                         continue
3497                     if track_kind.lower() not in ('captions', 'subtitles'):
3498                         continue
3499                     track_url = urljoin(base_url, track.get('file'))
3500                     if not track_url:
3501                         continue
3502                     subtitles.setdefault(track.get('label') or 'en', []).append({
3503                         'url': self._proto_relative_url(track_url)
3504                     })
3505
3506             entry = {
3507                 'id': this_video_id,
3508                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3509                 'description': clean_html(video_data.get('description')),
3510                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3511                 'timestamp': int_or_none(video_data.get('pubdate')),
3512                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3513                 'subtitles': subtitles,
3514             }
3515             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3516             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3517                 entry.update({
3518                     '_type': 'url_transparent',
3519                     'url': formats[0]['url'],
3520                 })
3521             else:
3522                 self._sort_formats(formats)
3523                 entry['formats'] = formats
3524             entries.append(entry)
3525         if len(entries) == 1:
3526             return entries[0]
3527         else:
3528             return self.playlist_result(entries)
3529
3530     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3531                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3532         urls = []
3533         formats = []
3534         for source in jwplayer_sources_data:
3535             if not isinstance(source, dict):
3536                 continue
3537             source_url = urljoin(
3538                 base_url, self._proto_relative_url(source.get('file')))
3539             if not source_url or source_url in urls:
3540                 continue
3541             urls.append(source_url)
3542             source_type = source.get('type') or ''
3543             ext = mimetype2ext(source_type) or determine_ext(source_url)
3544             if source_type == 'hls' or ext == 'm3u8':
3545                 formats.extend(self._extract_m3u8_formats(
3546                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3547                     m3u8_id=m3u8_id, fatal=False))
3548             elif source_type == 'dash' or ext == 'mpd':
3549                 formats.extend(self._extract_mpd_formats(
3550                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3551             elif ext == 'smil':
3552                 formats.extend(self._extract_smil_formats(
3553                     source_url, video_id, fatal=False))
3554             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3555             elif source_type.startswith('audio') or ext in (
3556                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3557                 formats.append({
3558                     'url': source_url,
3559                     'vcodec': 'none',
3560                     'ext': ext,
3561                 })
3562             else:
3563                 height = int_or_none(source.get('height'))
3564                 if height is None:
3565                     # Often no height is provided but there is a label in
3566                     # format like "1080p", "720p SD", or 1080.
3567                     height = int_or_none(self._search_regex(
3568                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3569                         'height', default=None))
3570                 a_format = {
3571                     'url': source_url,
3572                     'width': int_or_none(source.get('width')),
3573                     'height': height,
3574                     'tbr': int_or_none(source.get('bitrate')),
3575                     'ext': ext,
3576                 }
3577                 if source_url.startswith('rtmp'):
3578                     a_format['ext'] = 'flv'
3579                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3580                     # of jwplayer.flash.swf
3581                     rtmp_url_parts = re.split(
3582                         r'((?:mp4|mp3|flv):)', source_url, 1)
3583                     if len(rtmp_url_parts) == 3:
3584                         rtmp_url, prefix, play_path = rtmp_url_parts
3585                         a_format.update({
3586                             'url': rtmp_url,
3587                             'play_path': prefix + play_path,
3588                         })
3589                     if rtmp_params:
3590                         a_format.update(rtmp_params)
3591                 formats.append(a_format)
3592         return formats
3593
3594     def _live_title(self, name):
3595         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3596         return name
3597
3598     def _int(self, v, name, fatal=False, **kwargs):
3599         res = int_or_none(v, **kwargs)
3600         if res is None:
3601             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3602             if fatal:
3603                 raise ExtractorError(msg)
3604             else:
3605                 self.report_warning(msg)
3606         return res
3607
3608     def _float(self, v, name, fatal=False, **kwargs):
3609         res = float_or_none(v, **kwargs)
3610         if res is None:
3611             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3612             if fatal:
3613                 raise ExtractorError(msg)
3614             else:
3615                 self.report_warning(msg)
3616         return res
3617
3618     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3619                     path='/', secure=False, discard=False, rest={}, **kwargs):
3620         cookie = http.cookiejar.Cookie(
3621             0, name, value, port, port is not None, domain, True,
3622             domain.startswith('.'), path, True, secure, expire_time,
3623             discard, None, None, rest)
3624         self.cookiejar.set_cookie(cookie)
3625
3626     def _get_cookies(self, url):
3627         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3628         return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
3629
3630     def _apply_first_set_cookie_header(self, url_handle, cookie):
3631         """
3632         Apply first Set-Cookie header instead of the last. Experimental.
3633
3634         Some sites (e.g. [1-3]) may serve two cookies under the same name
3635         in Set-Cookie header and expect the first (old) one to be set rather
3636         than second (new). However, as of RFC6265 the newer one cookie
3637         should be set into cookie store what actually happens.
3638         We will workaround this issue by resetting the cookie to
3639         the first one manually.
3640         1. https://new.vk.com/
3641         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3642         3. https://learning.oreilly.com/
3643         """
3644         for header, cookies in url_handle.headers.items():
3645             if header.lower() != 'set-cookie':
3646                 continue
3647             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3648             cookie_value = re.search(
3649                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3650             if cookie_value:
3651                 value, domain = cookie_value.groups()
3652                 self._set_cookie(domain, cookie, value)
3653                 break
3654
3655     @classmethod
3656     def get_testcases(cls, include_onlymatching=False):
3657         t = getattr(cls, '_TEST', None)
3658         if t:
3659             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3660             tests = [t]
3661         else:
3662             tests = getattr(cls, '_TESTS', [])
3663         for t in tests:
3664             if not include_onlymatching and t.get('only_matching', False):
3665                 continue
3666             t['name'] = cls.ie_key()
3667             yield t
3668
3669     @classmethod
3670     def get_webpage_testcases(cls):
3671         tests = getattr(cls, '_WEBPAGE_TESTS', [])
3672         for t in tests:
3673             t['name'] = cls.ie_key()
3674         return tests
3675
3676     @classproperty
3677     def age_limit(cls):
3678         """Get age limit from the testcases"""
3679         return max(traverse_obj(
3680             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3681             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3682
3683     @classmethod
3684     def is_suitable(cls, age_limit):
3685         """Test whether the extractor is generally suitable for the given age limit"""
3686         return not age_restricted(cls.age_limit, age_limit)
3687
3688     @classmethod
3689     def description(cls, *, markdown=True, search_examples=None):
3690         """Description of the extractor"""
3691         desc = ''
3692         if cls._NETRC_MACHINE:
3693             if markdown:
3694                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3695             else:
3696                 desc += f' [{cls._NETRC_MACHINE}]'
3697         if cls.IE_DESC is False:
3698             desc += ' [HIDDEN]'
3699         elif cls.IE_DESC:
3700             desc += f' {cls.IE_DESC}'
3701         if cls.SEARCH_KEY:
3702             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3703             if search_examples:
3704                 _COUNTS = ('', '5', '10', 'all')
3705                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3706         if not cls.working():
3707             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3708
3709         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3710         return f'{name}:{desc}' if desc else name
3711
3712     def extract_subtitles(self, *args, **kwargs):
3713         if (self.get_param('writesubtitles', False)
3714                 or self.get_param('listsubtitles')):
3715             return self._get_subtitles(*args, **kwargs)
3716         return {}
3717
3718     def _get_subtitles(self, *args, **kwargs):
3719         raise NotImplementedError('This method must be implemented by subclasses')
3720
3721     def extract_comments(self, *args, **kwargs):
3722         if not self.get_param('getcomments'):
3723             return None
3724         generator = self._get_comments(*args, **kwargs)
3725
3726         def extractor():
3727             comments = []
3728             interrupted = True
3729             try:
3730                 while True:
3731                     comments.append(next(generator))
3732             except StopIteration:
3733                 interrupted = False
3734             except KeyboardInterrupt:
3735                 self.to_screen('Interrupted by user')
3736             except Exception as e:
3737                 if self.get_param('ignoreerrors') is not True:
3738                     raise
3739                 self._downloader.report_error(e)
3740             comment_count = len(comments)
3741             self.to_screen(f'Extracted {comment_count} comments')
3742             return {
3743                 'comments': comments,
3744                 'comment_count': None if interrupted else comment_count
3745             }
3746         return extractor
3747
3748     def _get_comments(self, *args, **kwargs):
3749         raise NotImplementedError('This method must be implemented by subclasses')
3750
3751     @staticmethod
3752     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3753         """ Merge subtitle items for one language. Items with duplicated URLs/data
3754         will be dropped. """
3755         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3756         ret = list(subtitle_list1)
3757         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3758         return ret
3759
3760     @classmethod
3761     def _merge_subtitles(cls, *dicts, target=None):
3762         """ Merge subtitle dictionaries, language by language. """
3763         if target is None:
3764             target = {}
3765         for d in dicts:
3766             for lang, subs in d.items():
3767                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3768         return target
3769
3770     def extract_automatic_captions(self, *args, **kwargs):
3771         if (self.get_param('writeautomaticsub', False)
3772                 or self.get_param('listsubtitles')):
3773             return self._get_automatic_captions(*args, **kwargs)
3774         return {}
3775
3776     def _get_automatic_captions(self, *args, **kwargs):
3777         raise NotImplementedError('This method must be implemented by subclasses')
3778
3779     @functools.cached_property
3780     def _cookies_passed(self):
3781         """Whether cookies have been passed to YoutubeDL"""
3782         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3783
3784     def mark_watched(self, *args, **kwargs):
3785         if not self.get_param('mark_watched', False):
3786             return
3787         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3788             self._mark_watched(*args, **kwargs)
3789
3790     def _mark_watched(self, *args, **kwargs):
3791         raise NotImplementedError('This method must be implemented by subclasses')
3792
3793     def geo_verification_headers(self):
3794         headers = {}
3795         geo_verification_proxy = self.get_param('geo_verification_proxy')
3796         if geo_verification_proxy:
3797             headers['Ytdl-request-proxy'] = geo_verification_proxy
3798         return headers
3799
3800     @staticmethod
3801     def _generic_id(url):
3802         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3803
3804     @staticmethod
3805     def _generic_title(url):
3806         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3807
3808     @staticmethod
3809     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3810         all_known = all(map(
3811             lambda x: x is not None,
3812             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3813         return (
3814             'private' if is_private
3815             else 'premium_only' if needs_premium
3816             else 'subscriber_only' if needs_subscription
3817             else 'needs_auth' if needs_auth
3818             else 'unlisted' if is_unlisted
3819             else 'public' if all_known
3820             else None)
3821
3822     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3823         '''
3824         @returns            A list of values for the extractor argument given by "key"
3825                             or "default" if no such key is present
3826         @param default      The default value to return when the key is not present (default: [])
3827         @param casesense    When false, the values are converted to lower case
3828         '''
3829         val = traverse_obj(
3830             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3831         if val is None:
3832             return [] if default is NO_DEFAULT else default
3833         return list(val) if casesense else [x.lower() for x in val]
3834
3835     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3836         if not playlist_id or not video_id:
3837             return not video_id
3838
3839         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3840         if no_playlist is not None:
3841             return not no_playlist
3842
3843         video_id = '' if video_id is True else f' {video_id}'
3844         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3845         if self.get_param('noplaylist'):
3846             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3847             return False
3848         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3849         return True
3850
3851     @classmethod
3852     def extract_from_webpage(cls, ydl, url, webpage):
3853         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3854               else ydl.get_info_extractor(cls.ie_key()))
3855         for info in ie._extract_from_webpage(url, webpage) or []:
3856             # url = None since we do not want to set (webpage/original)_url
3857             ydl.add_default_extra_info(info, ie, None)
3858             yield info
3859
3860     @classmethod
3861     def _extract_from_webpage(cls, url, webpage):
3862         for embed_url in orderedSet(
3863                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3864             yield cls.url_result(embed_url, cls)
3865
3866     @classmethod
3867     def _extract_embed_urls(cls, url, webpage):
3868         """@returns all the embed urls on the webpage"""
3869         if '_EMBED_URL_RE' not in cls.__dict__:
3870             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3871             for idx, regex in enumerate(cls._EMBED_REGEX):
3872                 assert regex.count('(?P<url>') == 1, \
3873                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3874             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3875
3876         for regex in cls._EMBED_URL_RE:
3877             for mobj in regex.finditer(webpage):
3878                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3879                 if cls._VALID_URL is False or cls.suitable(embed_url):
3880                     yield embed_url
3881
3882     class StopExtraction(Exception):
3883         pass
3884
3885     @classmethod
3886     def _extract_url(cls, webpage):  # TODO: Remove
3887         """Only for compatibility with some older extractors"""
3888         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3889
3890
3891 class SearchInfoExtractor(InfoExtractor):
3892     """
3893     Base class for paged search queries extractors.
3894     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3895     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3896     """
3897
3898     _MAX_RESULTS = float('inf')
3899
3900     @classproperty
3901     def _VALID_URL(cls):
3902         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3903
3904     def _real_extract(self, query):
3905         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3906         if prefix == '':
3907             return self._get_n_results(query, 1)
3908         elif prefix == 'all':
3909             return self._get_n_results(query, self._MAX_RESULTS)
3910         else:
3911             n = int(prefix)
3912             if n <= 0:
3913                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3914             elif n > self._MAX_RESULTS:
3915                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3916                 n = self._MAX_RESULTS
3917             return self._get_n_results(query, n)
3918
3919     def _get_n_results(self, query, n):
3920         """Get a specified number of results for a query.
3921         Either this function or _search_results must be overridden by subclasses """
3922         return self.playlist_result(
3923             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3924             query, query)
3925
3926     def _search_results(self, query):
3927         """Returns an iterator of search results"""
3928         raise NotImplementedError('This method must be implemented by subclasses')
3929
3930     @classproperty
3931     def SEARCH_KEY(cls):
3932         return cls._SEARCH_KEY