yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import itertools
   9 import json
  10 import math
  11 import netrc
  12 import os
  13 import random
  14 import re
  15 import sys
  16 import time
  17 import types
  18 import urllib.parse
  19 import urllib.request
  20 import xml.etree.ElementTree
  21
  22 from ..compat import functools  # isort: split
  23 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  24 from ..downloader import FileDownloader
  25 from ..downloader.f4m import get_base_url, remove_encrypted_media
  26 from ..utils import (
  27     IDENTITY,
  28     JSON_LD_RE,
  29     NO_DEFAULT,
  30     ExtractorError,
  31     GeoRestrictedError,
  32     GeoUtils,
  33     LenientJSONDecoder,
  34     RegexNotFoundError,
  35     UnsupportedError,
  36     age_restricted,
  37     base_url,
  38     bug_reports_message,
  39     classproperty,
  40     clean_html,
  41     determine_ext,
  42     determine_protocol,
  43     dict_get,
  44     encode_data_uri,
  45     error_to_compat_str,
  46     extract_attributes,
  47     filter_dict,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     format_field,
  51     int_or_none,
  52     join_nonempty,
  53     js_to_json,
  54     mimetype2ext,
  55     network_exceptions,
  56     orderedSet,
  57     parse_bitrate,
  58     parse_codecs,
  59     parse_duration,
  60     parse_iso8601,
  61     parse_m3u8_attributes,
  62     parse_resolution,
  63     sanitize_filename,
  64     sanitize_url,
  65     sanitized_Request,
  66     str_or_none,
  67     str_to_int,
  68     strip_or_none,
  69     traverse_obj,
  70     try_call,
  71     try_get,
  72     unescapeHTML,
  73     unified_strdate,
  74     unified_timestamp,
  75     update_Request,
  76     update_url_query,
  77     url_basename,
  78     url_or_none,
  79     urljoin,
  80     variadic,
  81     xpath_element,
  82     xpath_text,
  83     xpath_with_ns,
  84 )
  85
  86
  87 class InfoExtractor:
  88     """Information Extractor class.
  89
  90     Information extractors are the classes that, given a URL, extract
  91     information about the video (or videos) the URL refers to. This
  92     information includes the real video URL, the video title, author and
  93     others. The information is stored in a dictionary which is then
  94     passed to the YoutubeDL. The YoutubeDL processes this
  95     information possibly downloading the video to the file system, among
  96     other possible outcomes.
  97
  98     The type field determines the type of the result.
  99     By far the most common value (and the default if _type is missing) is
 100     "video", which indicates a single video.
 101
 102     For a video, the dictionaries must include the following fields:
 103
 104     id:             Video identifier.
 105     title:          Video title, unescaped. Set to an empty string if video has
 106                     no title as opposed to "None" which signifies that the
 107                     extractor failed to obtain a title
 108
 109     Additionally, it must contain either a formats entry or a url one:
 110
 111     formats:        A list of dictionaries for each format available, ordered
 112                     from worst to best quality.
 113
 114                     Potential fields:
 115                     * url        The mandatory URL representing the media:
 116                                    for plain file media - HTTP URL of this file,
 117                                    for RTMP - RTMP URL,
 118                                    for HLS - URL of the M3U8 media playlist,
 119                                    for HDS - URL of the F4M manifest,
 120                                    for DASH
 121                                      - HTTP URL to plain file media (in case of
 122                                        unfragmented media)
 123                                      - URL of the MPD manifest or base URL
 124                                        representing the media if MPD manifest
 125                                        is parsed from a string (in case of
 126                                        fragmented media)
 127                                    for MSS - URL of the ISM manifest.
 128                     * manifest_url
 129                                  The URL of the manifest file in case of
 130                                  fragmented media:
 131                                    for HLS - URL of the M3U8 master playlist,
 132                                    for HDS - URL of the F4M manifest,
 133                                    for DASH - URL of the MPD manifest,
 134                                    for MSS - URL of the ISM manifest.
 135                     * manifest_stream_number  (For internal use only)
 136                                  The index of the stream in the manifest file
 137                     * ext        Will be calculated from URL if missing
 138                     * format     A human-readable description of the format
 139                                  ("mp4 container with h264/opus").
 140                                  Calculated from the format_id, width, height,
 141                                  and format_note fields if missing.
 142                     * format_id  A short description of the format
 143                                  ("mp4_h264_opus" or "19").
 144                                 Technically optional, but strongly recommended.
 145                     * format_note Additional info about the format
 146                                  ("3D" or "DASH video")
 147                     * width      Width of the video, if known
 148                     * height     Height of the video, if known
 149                     * resolution Textual description of width and height
 150                     * dynamic_range The dynamic range of the video. One of:
 151                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 152                     * tbr        Average bitrate of audio and video in KBit/s
 153                     * abr        Average audio bitrate in KBit/s
 154                     * acodec     Name of the audio codec in use
 155                     * asr        Audio sampling rate in Hertz
 156                     * vbr        Average video bitrate in KBit/s
 157                     * fps        Frame rate
 158                     * vcodec     Name of the video codec in use
 159                     * container  Name of the container format
 160                     * filesize   The number of bytes, if known in advance
 161                     * filesize_approx  An estimate for the number of bytes
 162                     * player_url SWF Player URL (used for rtmpdump).
 163                     * protocol   The protocol that will be used for the actual
 164                                  download, lower-case. One of "http", "https" or
 165                                  one of the protocols defined in downloader.PROTOCOL_MAP
 166                     * fragment_base_url
 167                                  Base URL for fragments. Each fragment's path
 168                                  value (if present) will be relative to
 169                                  this URL.
 170                     * fragments  A list of fragments of a fragmented media.
 171                                  Each fragment entry must contain either an url
 172                                  or a path. If an url is present it should be
 173                                  considered by a client. Otherwise both path and
 174                                  fragment_base_url must be present. Here is
 175                                  the list of all potential fields:
 176                                  * "url" - fragment's URL
 177                                  * "path" - fragment's path relative to
 178                                             fragment_base_url
 179                                  * "duration" (optional, int or float)
 180                                  * "filesize" (optional, int)
 181                     * is_from_start  Is a live format that can be downloaded
 182                                 from the start. Boolean
 183                     * preference Order number of this format. If this field is
 184                                  present and not None, the formats get sorted
 185                                  by this field, regardless of all other values.
 186                                  -1 for default (order by other properties),
 187                                  -2 or smaller for less than default.
 188                                  < -1000 to hide the format (if there is
 189                                     another one which is strictly better)
 190                     * language   Language code, e.g. "de" or "en-US".
 191                     * language_preference  Is this in the language mentioned in
 192                                  the URL?
 193                                  10 if it's what the URL is about,
 194                                  -1 for default (don't know),
 195                                  -10 otherwise, other values reserved for now.
 196                     * quality    Order number of the video quality of this
 197                                  format, irrespective of the file format.
 198                                  -1 for default (order by other properties),
 199                                  -2 or smaller for less than default.
 200                     * source_preference  Order number for this video source
 201                                   (quality takes higher priority)
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                     * http_headers  A dictionary of additional HTTP headers
 205                                  to add to the request.
 206                     * stretched_ratio  If given and not 1, indicates that the
 207                                  video's pixels are not square.
 208                                  width : height ratio as float.
 209                     * no_resume  The server does not support resuming the
 210                                  (HTTP or RTMP) download. Boolean.
 211                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 212                     * downloader_options  A dictionary of downloader options
 213                                  (For internal use only)
 214                                  * http_chunk_size Chunk size for HTTP downloads
 215                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 216                     RTMP formats can also have the additional fields: page_url,
 217                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 218                     rtmp_protocol, rtmp_real_time
 219
 220     url:            Final video URL.
 221     ext:            Video filename extension.
 222     format:         The video format, defaults to ext (used for --get-format)
 223     player_url:     SWF Player URL (used for rtmpdump).
 224
 225     The following fields are optional:
 226
 227     direct:         True if a direct video file was given (must only be set by GenericIE)
 228     alt_title:      A secondary title of the video.
 229     display_id      An alternative identifier for the video, not necessarily
 230                     unique, but available before title. Typically, id is
 231                     something like "4234987", title "Dancing naked mole rats",
 232                     and display_id "dancing-naked-mole-rats"
 233     thumbnails:     A list of dictionaries, with the following entries:
 234                         * "id" (optional, string) - Thumbnail format ID
 235                         * "url"
 236                         * "preference" (optional, int) - quality of the image
 237                         * "width" (optional, int)
 238                         * "height" (optional, int)
 239                         * "resolution" (optional, string "{width}x{height}",
 240                                         deprecated)
 241                         * "filesize" (optional, int)
 242                         * "http_headers" (dict) - HTTP headers for the request
 243     thumbnail:      Full URL to a video thumbnail image.
 244     description:    Full video description.
 245     uploader:       Full name of the video uploader.
 246     license:        License name the video is licensed under.
 247     creator:        The creator of the video.
 248     timestamp:      UNIX timestamp of the moment the video was uploaded
 249     upload_date:    Video upload date in UTC (YYYYMMDD).
 250                     If not explicitly set, calculated from timestamp
 251     release_timestamp: UNIX timestamp of the moment the video was released.
 252                     If it is not clear whether to use timestamp or this, use the former
 253     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 254                     If not explicitly set, calculated from release_timestamp
 255     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 256     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 257                     If not explicitly set, calculated from modified_timestamp
 258     uploader_id:    Nickname or id of the video uploader.
 259     uploader_url:   Full URL to a personal webpage of the video uploader.
 260     channel:        Full name of the channel the video is uploaded on.
 261                     Note that channel fields may or may not repeat uploader
 262                     fields. This depends on a particular extractor.
 263     channel_id:     Id of the channel.
 264     channel_url:    Full URL to a channel webpage.
 265     channel_follower_count: Number of followers of the channel.
 266     location:       Physical location where the video was filmed.
 267     subtitles:      The available subtitles as a dictionary in the format
 268                     {tag: subformats}. "tag" is usually a language code, and
 269                     "subformats" is a list sorted from lower to higher
 270                     preference, each element is a dictionary with the "ext"
 271                     entry and one of:
 272                         * "data": The subtitles file contents
 273                         * "url": A URL pointing to the subtitles file
 274                     It can optionally also have:
 275                         * "name": Name or description of the subtitles
 276                         * "http_headers": A dictionary of additional HTTP headers
 277                                   to add to the request.
 278                     "ext" will be calculated from URL if missing
 279     automatic_captions: Like 'subtitles'; contains automatically generated
 280                     captions instead of normal subtitles
 281     duration:       Length of the video in seconds, as an integer or float.
 282     view_count:     How many users have watched the video on the platform.
 283     like_count:     Number of positive ratings of the video
 284     dislike_count:  Number of negative ratings of the video
 285     repost_count:   Number of reposts of the video
 286     average_rating: Average rating given by users, the scale used depends on the webpage
 287     comment_count:  Number of comments on the video
 288     comments:       A list of comments, each with one or more of the following
 289                     properties (all but one of text or html optional):
 290                         * "author" - human-readable name of the comment author
 291                         * "author_id" - user ID of the comment author
 292                         * "author_thumbnail" - The thumbnail of the comment author
 293                         * "id" - Comment ID
 294                         * "html" - Comment as HTML
 295                         * "text" - Plain text of the comment
 296                         * "timestamp" - UNIX timestamp of comment
 297                         * "parent" - ID of the comment this one is replying to.
 298                                      Set to "root" to indicate that this is a
 299                                      comment to the original video.
 300                         * "like_count" - Number of positive ratings of the comment
 301                         * "dislike_count" - Number of negative ratings of the comment
 302                         * "is_favorited" - Whether the comment is marked as
 303                                            favorite by the video uploader
 304                         * "author_is_uploader" - Whether the comment is made by
 305                                                  the video uploader
 306     age_limit:      Age restriction for the video, as an integer (years)
 307     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 308                     should allow to get the same result again. (It will be set
 309                     by YoutubeDL if it's missing)
 310     categories:     A list of categories that the video falls in, for example
 311                     ["Sports", "Berlin"]
 312     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 313     cast:           A list of the video cast
 314     is_live:        True, False, or None (=unknown). Whether this video is a
 315                     live stream that goes on instead of a fixed-length video.
 316     was_live:       True, False, or None (=unknown). Whether this video was
 317                     originally a live stream.
 318     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live'
 319                     or 'post_live' (was live, but VOD is not yet processed)
 320                     If absent, automatically set from is_live, was_live
 321     start_time:     Time in seconds where the reproduction should start, as
 322                     specified in the URL.
 323     end_time:       Time in seconds where the reproduction should end, as
 324                     specified in the URL.
 325     chapters:       A list of dictionaries, with the following entries:
 326                         * "start_time" - The start time of the chapter in seconds
 327                         * "end_time" - The end time of the chapter in seconds
 328                         * "title" (optional, string)
 329     playable_in_embed: Whether this video is allowed to play in embedded
 330                     players on other sites. Can be True (=always allowed),
 331                     False (=never allowed), None (=unknown), or a string
 332                     specifying the criteria for embedability (Eg: 'whitelist')
 333     availability:   Under what condition the video is available. One of
 334                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 335                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 336                     to set it
 337     __post_extractor: A function to be called just before the metadata is
 338                     written to either disk, logger or console. The function
 339                     must return a dict which will be added to the info_dict.
 340                     This is useful for additional information that is
 341                     time-consuming to extract. Note that the fields thus
 342                     extracted will not be available to output template and
 343                     match_filter. So, only "comments" and "comment_count" are
 344                     currently allowed to be extracted via this method.
 345
 346     The following fields should only be used when the video belongs to some logical
 347     chapter or section:
 348
 349     chapter:        Name or title of the chapter the video belongs to.
 350     chapter_number: Number of the chapter the video belongs to, as an integer.
 351     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 352
 353     The following fields should only be used when the video is an episode of some
 354     series, programme or podcast:
 355
 356     series:         Title of the series or programme the video episode belongs to.
 357     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 358     season:         Title of the season the video episode belongs to.
 359     season_number:  Number of the season the video episode belongs to, as an integer.
 360     season_id:      Id of the season the video episode belongs to, as a unicode string.
 361     episode:        Title of the video episode. Unlike mandatory video title field,
 362                     this field should denote the exact title of the video episode
 363                     without any kind of decoration.
 364     episode_number: Number of the video episode within a season, as an integer.
 365     episode_id:     Id of the video episode, as a unicode string.
 366
 367     The following fields should only be used when the media is a track or a part of
 368     a music album:
 369
 370     track:          Title of the track.
 371     track_number:   Number of the track within an album or a disc, as an integer.
 372     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 373                     as a unicode string.
 374     artist:         Artist(s) of the track.
 375     genre:          Genre(s) of the track.
 376     album:          Title of the album the track belongs to.
 377     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 378     album_artist:   List of all artists appeared on the album (e.g.
 379                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 380                     and compilations).
 381     disc_number:    Number of the disc or other physical medium the track belongs to,
 382                     as an integer.
 383     release_year:   Year (YYYY) when the album was released.
 384     composer:       Composer of the piece
 385
 386     The following fields should only be set for clips that should be cut from the original video:
 387
 388     section_start:  Start time of the section in seconds
 389     section_end:    End time of the section in seconds
 390
 391     The following fields should only be set for storyboards:
 392     rows:           Number of rows in each storyboard fragment, as an integer
 393     columns:        Number of columns in each storyboard fragment, as an integer
 394
 395     Unless mentioned otherwise, the fields should be Unicode strings.
 396
 397     Unless mentioned otherwise, None is equivalent to absence of information.
 398
 399
 400     _type "playlist" indicates multiple videos.
 401     There must be a key "entries", which is a list, an iterable, or a PagedList
 402     object, each element of which is a valid dictionary by this specification.
 403
 404     Additionally, playlists can have "id", "title", and any other relevant
 405     attributes with the same semantics as videos (see above).
 406
 407     It can also have the following optional fields:
 408
 409     playlist_count: The total number of videos in a playlist. If not given,
 410                     YoutubeDL tries to calculate it from "entries"
 411
 412
 413     _type "multi_video" indicates that there are multiple videos that
 414     form a single show, for example multiple acts of an opera or TV episode.
 415     It must have an entries key like a playlist and contain all the keys
 416     required for a video at the same time.
 417
 418
 419     _type "url" indicates that the video must be extracted from another
 420     location, possibly by a different extractor. Its only required key is:
 421     "url" - the next URL to extract.
 422     The key "ie_key" can be set to the class name (minus the trailing "IE",
 423     e.g. "Youtube") if the extractor class is known in advance.
 424     Additionally, the dictionary may have any properties of the resolved entity
 425     known in advance, for example "title" if the title of the referred video is
 426     known ahead of time.
 427
 428
 429     _type "url_transparent" entities have the same specification as "url", but
 430     indicate that the given additional information is more precise than the one
 431     associated with the resolved URL.
 432     This is useful when a site employs a video service that hosts the video and
 433     its technical metadata, but that video service does not embed a useful
 434     title, description etc.
 435
 436
 437     Subclasses of this should also be added to the list of extractors and
 438     should define a _VALID_URL regexp and, re-define the _real_extract() and
 439     (optionally) _real_initialize() methods.
 440
 441     Subclasses may also override suitable() if necessary, but ensure the function
 442     signature is preserved and that this function imports everything it needs
 443     (except other extractors), so that lazy_extractors works correctly.
 444
 445     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 446     the HTML of Generic webpages. It may also override _extract_embed_urls
 447     or _extract_from_webpage as necessary. While these are normally classmethods,
 448     _extract_from_webpage is allowed to be an instance method.
 449
 450     _extract_from_webpage may raise self.StopExtraction() to stop further
 451     processing of the webpage and obtain exclusive rights to it. This is useful
 452     when the extractor cannot reliably be matched using just the URL.
 453     Eg: invidious/peertube instances
 454
 455     Embed-only extractors can be defined by setting _VALID_URL = False.
 456
 457     To support username + password (or netrc) login, the extractor must define a
 458     _NETRC_MACHINE and re-define _perform_login(username, password) and
 459     (optionally) _initialize_pre_login() methods. The _perform_login method will
 460     be called between _initialize_pre_login and _real_initialize if credentials
 461     are passed by the user. In cases where it is necessary to have the login
 462     process as part of the extraction rather than initialization, _perform_login
 463     can be left undefined.
 464
 465     _GEO_BYPASS attribute may be set to False in order to disable
 466     geo restriction bypass mechanisms for a particular extractor.
 467     Though it won't disable explicit geo restriction bypass based on
 468     country code provided with geo_bypass_country.
 469
 470     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 471     countries for this extractor. One of these countries will be used by
 472     geo restriction bypass mechanism right away in order to bypass
 473     geo restriction, of course, if the mechanism is not disabled.
 474
 475     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 476     IP blocks in CIDR notation for this extractor. One of these IP blocks
 477     will be used by geo restriction bypass mechanism similarly
 478     to _GEO_COUNTRIES.
 479
 480     The _WORKING attribute should be set to False for broken IEs
 481     in order to warn the users and skip the tests.
 482     """
 483
 484     _ready = False
 485     _downloader = None
 486     _x_forwarded_for_ip = None
 487     _GEO_BYPASS = True
 488     _GEO_COUNTRIES = None
 489     _GEO_IP_BLOCKS = None
 490     _WORKING = True
 491     _NETRC_MACHINE = None
 492     IE_DESC = None
 493     SEARCH_KEY = None
 494     _VALID_URL = None
 495     _EMBED_REGEX = []
 496
 497     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 498         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 499         return {
 500             None: '',
 501             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 502             'password': f'Use {password_hint}',
 503             'cookies': (
 504                 'Use --cookies-from-browser or --cookies for the authentication. '
 505                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 506         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 507
 508     def __init__(self, downloader=None):
 509         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 510         If a downloader is not passed during initialization,
 511         it must be set using "set_downloader()" before "extract()" is called"""
 512         self._ready = False
 513         self._x_forwarded_for_ip = None
 514         self._printed_messages = set()
 515         self.set_downloader(downloader)
 516
 517     @classmethod
 518     def _match_valid_url(cls, url):
 519         if cls._VALID_URL is False:
 520             return None
 521         # This does not use has/getattr intentionally - we want to know whether
 522         # we have cached the regexp for *this* class, whereas getattr would also
 523         # match the superclass
 524         if '_VALID_URL_RE' not in cls.__dict__:
 525             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 526         return cls._VALID_URL_RE.match(url)
 527
 528     @classmethod
 529     def suitable(cls, url):
 530         """Receives a URL and returns True if suitable for this IE."""
 531         # This function must import everything it needs (except other extractors),
 532         # so that lazy_extractors works correctly
 533         return cls._match_valid_url(url) is not None
 534
 535     @classmethod
 536     def _match_id(cls, url):
 537         return cls._match_valid_url(url).group('id')
 538
 539     @classmethod
 540     def get_temp_id(cls, url):
 541         try:
 542             return cls._match_id(url)
 543         except (IndexError, AttributeError):
 544             return None
 545
 546     @classmethod
 547     def working(cls):
 548         """Getter method for _WORKING."""
 549         return cls._WORKING
 550
 551     @classmethod
 552     def supports_login(cls):
 553         return bool(cls._NETRC_MACHINE)
 554
 555     def initialize(self):
 556         """Initializes an instance (authentication, etc)."""
 557         self._printed_messages = set()
 558         self._initialize_geo_bypass({
 559             'countries': self._GEO_COUNTRIES,
 560             'ip_blocks': self._GEO_IP_BLOCKS,
 561         })
 562         if not self._ready:
 563             self._initialize_pre_login()
 564             if self.supports_login():
 565                 username, password = self._get_login_info()
 566                 if username:
 567                     self._perform_login(username, password)
 568             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 569                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 570             self._real_initialize()
 571             self._ready = True
 572
 573     def _initialize_geo_bypass(self, geo_bypass_context):
 574         """
 575         Initialize geo restriction bypass mechanism.
 576
 577         This method is used to initialize geo bypass mechanism based on faking
 578         X-Forwarded-For HTTP header. A random country from provided country list
 579         is selected and a random IP belonging to this country is generated. This
 580         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 581         HTTP requests.
 582
 583         This method will be used for initial geo bypass mechanism initialization
 584         during the instance initialization with _GEO_COUNTRIES and
 585         _GEO_IP_BLOCKS.
 586
 587         You may also manually call it from extractor's code if geo bypass
 588         information is not available beforehand (e.g. obtained during
 589         extraction) or due to some other reason. In this case you should pass
 590         this information in geo bypass context passed as first argument. It may
 591         contain following fields:
 592
 593         countries:  List of geo unrestricted countries (similar
 594                     to _GEO_COUNTRIES)
 595         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 596                     (similar to _GEO_IP_BLOCKS)
 597
 598         """
 599         if not self._x_forwarded_for_ip:
 600
 601             # Geo bypass mechanism is explicitly disabled by user
 602             if not self.get_param('geo_bypass', True):
 603                 return
 604
 605             if not geo_bypass_context:
 606                 geo_bypass_context = {}
 607
 608             # Backward compatibility: previously _initialize_geo_bypass
 609             # expected a list of countries, some 3rd party code may still use
 610             # it this way
 611             if isinstance(geo_bypass_context, (list, tuple)):
 612                 geo_bypass_context = {
 613                     'countries': geo_bypass_context,
 614                 }
 615
 616             # The whole point of geo bypass mechanism is to fake IP
 617             # as X-Forwarded-For HTTP header based on some IP block or
 618             # country code.
 619
 620             # Path 1: bypassing based on IP block in CIDR notation
 621
 622             # Explicit IP block specified by user, use it right away
 623             # regardless of whether extractor is geo bypassable or not
 624             ip_block = self.get_param('geo_bypass_ip_block', None)
 625
 626             # Otherwise use random IP block from geo bypass context but only
 627             # if extractor is known as geo bypassable
 628             if not ip_block:
 629                 ip_blocks = geo_bypass_context.get('ip_blocks')
 630                 if self._GEO_BYPASS and ip_blocks:
 631                     ip_block = random.choice(ip_blocks)
 632
 633             if ip_block:
 634                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 635                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 636                 return
 637
 638             # Path 2: bypassing based on country code
 639
 640             # Explicit country code specified by user, use it right away
 641             # regardless of whether extractor is geo bypassable or not
 642             country = self.get_param('geo_bypass_country', None)
 643
 644             # Otherwise use random country code from geo bypass context but
 645             # only if extractor is known as geo bypassable
 646             if not country:
 647                 countries = geo_bypass_context.get('countries')
 648                 if self._GEO_BYPASS and countries:
 649                     country = random.choice(countries)
 650
 651             if country:
 652                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 653                 self._downloader.write_debug(
 654                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 655
 656     def extract(self, url):
 657         """Extracts URL information and returns it in list of dicts."""
 658         try:
 659             for _ in range(2):
 660                 try:
 661                     self.initialize()
 662                     self.write_debug('Extracting URL: %s' % url)
 663                     ie_result = self._real_extract(url)
 664                     if ie_result is None:
 665                         return None
 666                     if self._x_forwarded_for_ip:
 667                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 668                     subtitles = ie_result.get('subtitles') or {}
 669                     if 'no-live-chat' in self.get_param('compat_opts'):
 670                         for lang in ('live_chat', 'comments', 'danmaku'):
 671                             subtitles.pop(lang, None)
 672                     return ie_result
 673                 except GeoRestrictedError as e:
 674                     if self.__maybe_fake_ip_and_retry(e.countries):
 675                         continue
 676                     raise
 677         except UnsupportedError:
 678             raise
 679         except ExtractorError as e:
 680             kwargs = {
 681                 'video_id': e.video_id or self.get_temp_id(url),
 682                 'ie': self.IE_NAME,
 683                 'tb': e.traceback or sys.exc_info()[2],
 684                 'expected': e.expected,
 685                 'cause': e.cause
 686             }
 687             if hasattr(e, 'countries'):
 688                 kwargs['countries'] = e.countries
 689             raise type(e)(e.orig_msg, **kwargs)
 690         except http.client.IncompleteRead as e:
 691             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 692         except (KeyError, StopIteration) as e:
 693             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 694
 695     def __maybe_fake_ip_and_retry(self, countries):
 696         if (not self.get_param('geo_bypass_country', None)
 697                 and self._GEO_BYPASS
 698                 and self.get_param('geo_bypass', True)
 699                 and not self._x_forwarded_for_ip
 700                 and countries):
 701             country_code = random.choice(countries)
 702             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 703             if self._x_forwarded_for_ip:
 704                 self.report_warning(
 705                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 706                     % (self._x_forwarded_for_ip, country_code.upper()))
 707                 return True
 708         return False
 709
 710     def set_downloader(self, downloader):
 711         """Sets a YoutubeDL instance as the downloader for this IE."""
 712         self._downloader = downloader
 713
 714     @property
 715     def cache(self):
 716         return self._downloader.cache
 717
 718     @property
 719     def cookiejar(self):
 720         return self._downloader.cookiejar
 721
 722     def _initialize_pre_login(self):
 723         """ Initialization before login. Redefine in subclasses."""
 724         pass
 725
 726     def _perform_login(self, username, password):
 727         """ Login with username and password. Redefine in subclasses."""
 728         pass
 729
 730     def _real_initialize(self):
 731         """Real initialization process. Redefine in subclasses."""
 732         pass
 733
 734     def _real_extract(self, url):
 735         """Real extraction process. Redefine in subclasses."""
 736         raise NotImplementedError('This method must be implemented by subclasses')
 737
 738     @classmethod
 739     def ie_key(cls):
 740         """A string for getting the InfoExtractor with get_info_extractor"""
 741         return cls.__name__[:-2]
 742
 743     @classproperty
 744     def IE_NAME(cls):
 745         return cls.__name__[:-2]
 746
 747     @staticmethod
 748     def __can_accept_status_code(err, expected_status):
 749         assert isinstance(err, urllib.error.HTTPError)
 750         if expected_status is None:
 751             return False
 752         elif callable(expected_status):
 753             return expected_status(err.code) is True
 754         else:
 755             return err.code in variadic(expected_status)
 756
 757     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 758         if isinstance(url_or_request, urllib.request.Request):
 759             return update_Request(url_or_request, data=data, headers=headers, query=query)
 760         if query:
 761             url_or_request = update_url_query(url_or_request, query)
 762         return sanitized_Request(url_or_request, data, headers or {})
 763
 764     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 765         """
 766         Return the response handle.
 767
 768         See _download_webpage docstring for arguments specification.
 769         """
 770         if not self._downloader._first_webpage_request:
 771             sleep_interval = self.get_param('sleep_interval_requests') or 0
 772             if sleep_interval > 0:
 773                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 774                 time.sleep(sleep_interval)
 775         else:
 776             self._downloader._first_webpage_request = False
 777
 778         if note is None:
 779             self.report_download_webpage(video_id)
 780         elif note is not False:
 781             if video_id is None:
 782                 self.to_screen(str(note))
 783             else:
 784                 self.to_screen(f'{video_id}: {note}')
 785
 786         # Some sites check X-Forwarded-For HTTP header in order to figure out
 787         # the origin of the client behind proxy. This allows bypassing geo
 788         # restriction by faking this header's value to IP that belongs to some
 789         # geo unrestricted country. We will do so once we encounter any
 790         # geo restriction error.
 791         if self._x_forwarded_for_ip:
 792             headers = (headers or {}).copy()
 793             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 794
 795         try:
 796             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 797         except network_exceptions as err:
 798             if isinstance(err, urllib.error.HTTPError):
 799                 if self.__can_accept_status_code(err, expected_status):
 800                     # Retain reference to error to prevent file object from
 801                     # being closed before it can be read. Works around the
 802                     # effects of <https://bugs.python.org/issue15002>
 803                     # introduced in Python 3.4.1.
 804                     err.fp._error = err
 805                     return err.fp
 806
 807             if errnote is False:
 808                 return False
 809             if errnote is None:
 810                 errnote = 'Unable to download webpage'
 811
 812             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 813             if fatal:
 814                 raise ExtractorError(errmsg, cause=err)
 815             else:
 816                 self.report_warning(errmsg)
 817                 return False
 818
 819     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 820                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 821         """
 822         Return a tuple (page content as string, URL handle).
 823
 824         Arguments:
 825         url_or_request -- plain text URL as a string or
 826             a urllib.request.Request object
 827         video_id -- Video/playlist/item identifier (string)
 828
 829         Keyword arguments:
 830         note -- note printed before downloading (string)
 831         errnote -- note printed in case of an error (string)
 832         fatal -- flag denoting whether error should be considered fatal,
 833             i.e. whether it should cause ExtractionError to be raised,
 834             otherwise a warning will be reported and extraction continued
 835         encoding -- encoding for a page content decoding, guessed automatically
 836             when not explicitly specified
 837         data -- POST data (bytes)
 838         headers -- HTTP headers (dict)
 839         query -- URL query (dict)
 840         expected_status -- allows to accept failed HTTP requests (non 2xx
 841             status code) by explicitly specifying a set of accepted status
 842             codes. Can be any of the following entities:
 843                 - an integer type specifying an exact failed status code to
 844                   accept
 845                 - a list or a tuple of integer types specifying a list of
 846                   failed status codes to accept
 847                 - a callable accepting an actual failed status code and
 848                   returning True if it should be accepted
 849             Note that this argument does not affect success status codes (2xx)
 850             which are always accepted.
 851         """
 852
 853         # Strip hashes from the URL (#1038)
 854         if isinstance(url_or_request, str):
 855             url_or_request = url_or_request.partition('#')[0]
 856
 857         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 858         if urlh is False:
 859             assert not fatal
 860             return False
 861         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 862         return (content, urlh)
 863
 864     @staticmethod
 865     def _guess_encoding_from_content(content_type, webpage_bytes):
 866         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 867         if m:
 868             encoding = m.group(1)
 869         else:
 870             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 871                           webpage_bytes[:1024])
 872             if m:
 873                 encoding = m.group(1).decode('ascii')
 874             elif webpage_bytes.startswith(b'\xff\xfe'):
 875                 encoding = 'utf-16'
 876             else:
 877                 encoding = 'utf-8'
 878
 879         return encoding
 880
 881     def __check_blocked(self, content):
 882         first_block = content[:512]
 883         if ('<title>Access to this site is blocked</title>' in content
 884                 and 'Websense' in first_block):
 885             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 886             blocked_iframe = self._html_search_regex(
 887                 r'<iframe src="([^"]+)"', content,
 888                 'Websense information URL', default=None)
 889             if blocked_iframe:
 890                 msg += ' Visit %s for more details' % blocked_iframe
 891             raise ExtractorError(msg, expected=True)
 892         if '<title>The URL you requested has been blocked</title>' in first_block:
 893             msg = (
 894                 'Access to this webpage has been blocked by Indian censorship. '
 895                 'Use a VPN or proxy server (with --proxy) to route around it.')
 896             block_msg = self._html_search_regex(
 897                 r'</h1><p>(.*?)</p>',
 898                 content, 'block message', default=None)
 899             if block_msg:
 900                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 901             raise ExtractorError(msg, expected=True)
 902         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 903                 and 'blocklist.rkn.gov.ru' in content):
 904             raise ExtractorError(
 905                 'Access to this webpage has been blocked by decision of the Russian government. '
 906                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 907                 expected=True)
 908
 909     def _request_dump_filename(self, url, video_id):
 910         basen = f'{video_id}_{url}'
 911         trim_length = self.get_param('trim_file_name') or 240
 912         if len(basen) > trim_length:
 913             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 914             basen = basen[:trim_length - len(h)] + h
 915         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 916         # Working around MAX_PATH limitation on Windows (see
 917         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 918         if compat_os_name == 'nt':
 919             absfilepath = os.path.abspath(filename)
 920             if len(absfilepath) > 259:
 921                 filename = fR'\\?\{absfilepath}'
 922         return filename
 923
 924     def __decode_webpage(self, webpage_bytes, encoding, headers):
 925         if not encoding:
 926             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 927         try:
 928             return webpage_bytes.decode(encoding, 'replace')
 929         except LookupError:
 930             return webpage_bytes.decode('utf-8', 'replace')
 931
 932     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 933         webpage_bytes = urlh.read()
 934         if prefix is not None:
 935             webpage_bytes = prefix + webpage_bytes
 936         if self.get_param('dump_intermediate_pages', False):
 937             self.to_screen('Dumping request to ' + urlh.geturl())
 938             dump = base64.b64encode(webpage_bytes).decode('ascii')
 939             self._downloader.to_screen(dump)
 940         if self.get_param('write_pages'):
 941             filename = self._request_dump_filename(urlh.geturl(), video_id)
 942             self.to_screen(f'Saving request to {filename}')
 943             with open(filename, 'wb') as outf:
 944                 outf.write(webpage_bytes)
 945
 946         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 947         self.__check_blocked(content)
 948
 949         return content
 950
 951     def __print_error(self, errnote, fatal, video_id, err):
 952         if fatal:
 953             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 954         elif errnote:
 955             self.report_warning(f'{video_id}: {errnote}: {err}')
 956
 957     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 958         if transform_source:
 959             xml_string = transform_source(xml_string)
 960         try:
 961             return compat_etree_fromstring(xml_string.encode('utf-8'))
 962         except xml.etree.ElementTree.ParseError as ve:
 963             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 964
 965     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 966         try:
 967             return json.loads(
 968                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 969         except ValueError as ve:
 970             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 971
 972     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 973         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 974
    def __create_download_methods(name, parser, note, errnote, return_value):
        # Class-body helper (called while the class is being defined, so it takes
        # no self/cls) that builds a pair of methods for one content type:
        #   _download_<name>_handle -> (parsed content, URL handle)
        #   _download_<name>        -> parsed content only
        #
        # name          -- suffix used in the generated method names
        # parser        -- name of the parsing method to call on the downloaded
        #                  text, or None to return the text unparsed
        # note, errnote -- default values of the generated methods' note/errnote
        # return_value  -- description of the result, used only in the
        #                  generated docstrings

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            # No parser configured: the decoded text is the result
            if parser is None:
                return content
            # errnote is forwarded to the parser only when it is False;
            # otherwise the parser's own default errnote applies
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            # False signals a non-fatal download failure; pass it through
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, first try to read a previously dumped
            # response from disk instead of performing the request
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # Dump not readable: warn and fall through to a real download
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method looked up below is
            # _download_webpage_handle, which takes no transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            # Drop the URL handle, keeping False for non-fatal failures
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            # Give the generated function its public name, qualified name and docstring
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        # download_content relies on download_handle.__name__ as set here
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1038
    # Generated download methods: each call yields the (..._handle, content-only) pair
    # for one content type, parsed with the named parser method.
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # Only the content-only variant (index 1) is kept for plain webpages (no parser);
    # it is stored under a private name and wrapped by _download_webpage below.
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1046
1047     def _download_webpage(
1048             self, url_or_request, video_id, note=None, errnote=None,
1049             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1050         """
1051         Return the data of the page as a string.
1052
1053         Keyword arguments:
1054         tries -- number of tries
1055         timeout -- sleep interval between tries
1056
1057         See _download_webpage_handle docstring for other arguments specification.
1058         """
1059
1060         R''' # NB: These are unused; should they be deprecated?
1061         if tries != 1:
1062             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1063         if timeout is NO_DEFAULT:
1064             timeout = 5
1065         else:
1066             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1067         '''
1068
1069         try_count = 0
1070         while True:
1071             try:
1072                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1073             except http.client.IncompleteRead as e:
1074                 try_count += 1
1075                 if try_count >= tries:
1076                     raise e
1077                 self._sleep(timeout, video_id)
1078
1079     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1080         idstr = format_field(video_id, None, '%s: ')
1081         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1082         if only_once:
1083             if f'WARNING: {msg}' in self._printed_messages:
1084                 return
1085             self._printed_messages.add(f'WARNING: {msg}')
1086         self._downloader.report_warning(msg, *args, **kwargs)
1087
1088     def to_screen(self, msg, *args, **kwargs):
1089         """Print msg to screen, prefixing it with '[ie_name]'"""
1090         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1091
1092     def write_debug(self, msg, *args, **kwargs):
1093         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1094
1095     def get_param(self, name, default=None, *args, **kwargs):
1096         if self._downloader:
1097             return self._downloader.params.get(name, default, *args, **kwargs)
1098         return default
1099
1100     def report_drm(self, video_id, partial=False):
1101         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1102
1103     def report_extraction(self, id_or_name):
1104         """Report information extraction."""
1105         self.to_screen('%s: Extracting information' % id_or_name)
1106
1107     def report_download_webpage(self, video_id):
1108         """Report webpage download."""
1109         self.to_screen('%s: Downloading webpage' % video_id)
1110
1111     def report_age_confirmation(self):
1112         """Report attempt to confirm age."""
1113         self.to_screen('Confirming age')
1114
1115     def report_login(self):
1116         """Report attempt to log in."""
1117         self.to_screen('Logging in')
1118
1119     def raise_login_required(
1120             self, msg='This video is only available for registered users',
1121             metadata_available=False, method=NO_DEFAULT):
1122         if metadata_available and (
1123                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1124             self.report_warning(msg)
1125             return
1126         msg += format_field(self._login_hint(method), None, '. %s')
1127         raise ExtractorError(msg, expected=True)
1128
1129     def raise_geo_restricted(
1130             self, msg='This video is not available from your location due to geo restriction',
1131             countries=None, metadata_available=False):
1132         if metadata_available and (
1133                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1134             self.report_warning(msg)
1135         else:
1136             raise GeoRestrictedError(msg, countries=countries)
1137
1138     def raise_no_formats(self, msg, expected=False, video_id=None):
1139         if expected and (
1140                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1141             self.report_warning(msg, video_id)
1142         elif isinstance(msg, ExtractorError):
1143             raise msg
1144         else:
1145             raise ExtractorError(msg, expected=expected, video_id=video_id)
1146
1147     # Methods for following #608
1148     @staticmethod
1149     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1150         """Returns a URL that points to a page that should be processed"""
1151         if ie is not None:
1152             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1153         if video_id is not None:
1154             kwargs['id'] = video_id
1155         if video_title is not None:
1156             kwargs['title'] = video_title
1157         return {
1158             **kwargs,
1159             '_type': 'url_transparent' if url_transparent else 'url',
1160             'url': url,
1161         }
1162
1163     @classmethod
1164     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1165                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1166         return cls.playlist_result(
1167             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1168             playlist_id, playlist_title, **kwargs)
1169
1170     @staticmethod
1171     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1172         """Returns a playlist"""
1173         if playlist_id:
1174             kwargs['id'] = playlist_id
1175         if playlist_title:
1176             kwargs['title'] = playlist_title
1177         if playlist_description is not None:
1178             kwargs['description'] = playlist_description
1179         return {
1180             **kwargs,
1181             '_type': 'multi_video' if multi_video else 'playlist',
1182             'entries': entries,
1183         }
1184
1185     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1186         """
1187         Perform a regex search on the given string, using a single or a list of
1188         patterns returning the first matching group.
1189         In case of failure return a default value or raise a WARNING or a
1190         RegexNotFoundError, depending on fatal, specifying the field name.
1191         """
1192         if string is None:
1193             mobj = None
1194         elif isinstance(pattern, (str, re.Pattern)):
1195             mobj = re.search(pattern, string, flags)
1196         else:
1197             for p in pattern:
1198                 mobj = re.search(p, string, flags)
1199                 if mobj:
1200                     break
1201
1202         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1203
1204         if mobj:
1205             if group is None:
1206                 # return the first matching group
1207                 return next(g for g in mobj.groups() if g is not None)
1208             elif isinstance(group, (list, tuple)):
1209                 return tuple(mobj.group(g) for g in group)
1210             else:
1211                 return mobj.group(group)
1212         elif default is not NO_DEFAULT:
1213             return default
1214         elif fatal:
1215             raise RegexNotFoundError('Unable to extract %s' % _name)
1216         else:
1217             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1218             return None
1219
1220     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1221                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1222         """Searches string for the JSON object specified by start_pattern"""
1223         # NB: end_pattern is only used to reduce the size of the initial match
1224         if default is NO_DEFAULT:
1225             default, has_default = {}, False
1226         else:
1227             fatal, has_default = False, True
1228
1229         json_string = self._search_regex(
1230             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1231             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1232         if not json_string:
1233             return default
1234
1235         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1236         try:
1237             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1238         except ExtractorError as e:
1239             if fatal:
1240                 raise ExtractorError(
1241                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1242             elif not has_default:
1243                 self.report_warning(
1244                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1245         return default
1246
1247     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1248         """
1249         Like _search_regex, but strips HTML tags and unescapes entities.
1250         """
1251         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1252         if res:
1253             return clean_html(res).strip()
1254         else:
1255             return res
1256
1257     def _get_netrc_login_info(self, netrc_machine=None):
1258         username = None
1259         password = None
1260         netrc_machine = netrc_machine or self._NETRC_MACHINE
1261
1262         if self.get_param('usenetrc', False):
1263             try:
1264                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1265                 if os.path.isdir(netrc_file):
1266                     netrc_file = os.path.join(netrc_file, '.netrc')
1267                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1268                 if info is not None:
1269                     username = info[0]
1270                     password = info[2]
1271                 else:
1272                     raise netrc.NetrcParseError(
1273                         'No authenticators for %s' % netrc_machine)
1274             except (OSError, netrc.NetrcParseError) as err:
1275                 self.report_warning(
1276                     'parsing .netrc: %s' % error_to_compat_str(err))
1277
1278         return username, password
1279
1280     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1281         """
1282         Get the login info as (username, password)
1283         First look for the manually specified credentials using username_option
1284         and password_option as keys in params dictionary. If no such credentials
1285         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1286         value.
1287         If there's no info available, return (None, None)
1288         """
1289
1290         # Attempt to use provided username and password or .netrc data
1291         username = self.get_param(username_option)
1292         if username is not None:
1293             password = self.get_param(password_option)
1294         else:
1295             username, password = self._get_netrc_login_info(netrc_machine)
1296
1297         return username, password
1298
1299     def _get_tfa_info(self, note='two-factor verification code'):
1300         """
1301         Get the two-factor authentication info
1302         TODO - asking the user will be required for sms/phone verify
1303         currently just uses the command line option
1304         If there's no info available, return None
1305         """
1306
1307         tfa = self.get_param('twofactor')
1308         if tfa is not None:
1309             return tfa
1310
1311         return getpass.getpass('Type %s and press [Return]: ' % note)
1312
1313     # Helper functions for extracting OpenGraph info
1314     @staticmethod
1315     def _og_regexes(prop):
1316         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1317         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1318                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1319         template = r'<meta[^>]+?%s[^>]+?%s'
1320         return [
1321             template % (property_re, content_re),
1322             template % (content_re, property_re),
1323         ]
1324
1325     @staticmethod
1326     def _meta_regex(prop):
1327         return r'''(?isx)<meta
1328                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1329                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1330
1331     def _og_search_property(self, prop, html, name=None, **kargs):
1332         prop = variadic(prop)
1333         if name is None:
1334             name = 'OpenGraph %s' % prop[0]
1335         og_regexes = []
1336         for p in prop:
1337             og_regexes.extend(self._og_regexes(p))
1338         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1339         if escaped is None:
1340             return None
1341         return unescapeHTML(escaped)
1342
1343     def _og_search_thumbnail(self, html, **kargs):
1344         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1345
1346     def _og_search_description(self, html, **kargs):
1347         return self._og_search_property('description', html, fatal=False, **kargs)
1348
1349     def _og_search_title(self, html, *, fatal=False, **kargs):
1350         return self._og_search_property('title', html, fatal=fatal, **kargs)
1351
1352     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1353         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1354         if secure:
1355             regexes = self._og_regexes('video:secure_url') + regexes
1356         return self._html_search_regex(regexes, html, name, **kargs)
1357
1358     def _og_search_url(self, html, **kargs):
1359         return self._og_search_property('url', html, **kargs)
1360
1361     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1362         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1363
1364     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1365         name = variadic(name)
1366         if display_name is None:
1367             display_name = name[0]
1368         return self._html_search_regex(
1369             [self._meta_regex(n) for n in name],
1370             html, display_name, fatal=fatal, group='content', **kwargs)
1371
1372     def _dc_search_uploader(self, html):
1373         return self._html_search_meta('dc.creator', html, 'uploader')
1374
1375     @staticmethod
1376     def _rta_search(html):
1377         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1378         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1379                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1380                      html):
1381             return 18
1382
1383         # And then there are the jokers who advertise that they use RTA, but actually don't.
1384         AGE_LIMIT_MARKERS = [
1385             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1386         ]
1387         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1388             return 18
1389         return 0
1390
1391     def _media_rating_search(self, html):
1392         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1393         rating = self._html_search_meta('rating', html)
1394
1395         if not rating:
1396             return None
1397
1398         RATING_TABLE = {
1399             'safe for kids': 0,
1400             'general': 8,
1401             '14 years': 14,
1402             'mature': 17,
1403             'restricted': 19,
1404         }
1405         return RATING_TABLE.get(rating.lower())
1406
1407     def _family_friendly_search(self, html):
1408         # See http://schema.org/VideoObject
1409         family_friendly = self._html_search_meta(
1410             'isFamilyFriendly', html, default=None)
1411
1412         if not family_friendly:
1413             return None
1414
1415         RATING_TABLE = {
1416             '1': 0,
1417             'true': 0,
1418             '0': 18,
1419             'false': 18,
1420         }
1421         return RATING_TABLE.get(family_friendly.lower())
1422
1423     def _twitter_search_player(self, html):
1424         return self._html_search_meta('twitter:player', html,
1425                                       'twitter card player')
1426
1427     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1428         """Yield all json ld objects in the html"""
1429         if default is not NO_DEFAULT:
1430             fatal = False
1431         for mobj in re.finditer(JSON_LD_RE, html):
1432             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1433             for json_ld in variadic(json_ld_item):
1434                 if isinstance(json_ld, dict):
1435                     yield json_ld
1436
1437     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1438         """Search for a video in any json ld in the html"""
1439         if default is not NO_DEFAULT:
1440             fatal = False
1441         info = self._json_ld(
1442             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1443             video_id, fatal=fatal, expected_type=expected_type)
1444         if info:
1445             return info
1446         if default is not NO_DEFAULT:
1447             return default
1448         elif fatal:
1449             raise RegexNotFoundError('Unable to extract JSON-LD')
1450         else:
1451             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1452             return {}
1453
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Extract video metadata from JSON-LD data.

        @param json_ld        A JSON string (parsed with _parse_json), a dict,
                              or a list/tuple of dicts
        @param video_id       Passed on to _parse_json when json_ld is a string
        @param fatal          Passed on to _parse_json when json_ld is a string
        @param expected_type  If given, entries whose '@type' does not include
                              it are skipped and traversal stops after the
                              first entry that is processed
        @returns              The collected info passed through filter_dict;
                              {} for empty or unsupported input
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator that the nested helpers below write into
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Last path component of an interactionType -> prefix of the '<kind>_count' info key
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # True if any of expected_types is contained in e's '@type'
            # (variadic() normalises the '@type' value before the membership test)
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # interactionType may be given directly or as a dict carrying '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill info['<kind>_count'] from e's InteractionCounter entries
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up in the map
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # A count that is already present is kept
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build chapters from the 'Clip' entries of e['hasPart']
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Each chapter is paired with its predecessor (a dummy ending at 0
            # for the first one) and its successor. zip() stops at the shortest
            # input (chapters[1:]), so the final chapter is not visited here.
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                # A missing end falls back to the next start, a missing start to the previous end
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The final chapter's missing end falls back to info['duration']
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copy the fields of a VideoObject into info, then add counts and chapters
            assert is_type(e, 'VideoObject')
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            # Walk the entries and merge what each recognised '@type' provides into info
            for e in json_ld:
                # Top-level entries must carry '@context'
                if at_top_level and '@context' not in e:
                    continue
                # An entry consisting only of '@context' and '@graph' is a container:
                # process its '@graph' members instead and stop iterating this level
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title when none has been set yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first element of 'video' / 'subjectOf' is considered here
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject'):
                    extract_video_object(e)
                    # Without an expected type keep scanning; otherwise the first hit is enough
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any entry may additionally embed a VideoObject under 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        # NOTE(review): filter_dict presumably drops the unset (None) entries - confirm against its definition
        return filter_dict(info)
1618
1619     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1620         return self._parse_json(
1621             self._search_regex(
1622                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1623                 webpage, 'next.js data', fatal=fatal, **kw),
1624             video_id, transform_source=transform_source, fatal=fatal)
1625
1626     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1627         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1628         rectx = re.escape(context_name)
1629         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1630         js, arg_keys, arg_vals = self._search_regex(
1631             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1632             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1633
1634         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1635
1636         for key, val in args.items():
1637             if val in ('undefined', 'void 0'):
1638                 args[key] = 'null'
1639
1640         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1641         return traverse_obj(ret, traverse) or {}
1642
1643     @staticmethod
1644     def _hidden_inputs(html):
1645         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1646         hidden_inputs = {}
1647         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1648             attrs = extract_attributes(input)
1649             if not input:
1650                 continue
1651             if attrs.get('type') not in ('hidden', 'submit'):
1652                 continue
1653             name = attrs.get('name') or attrs.get('id')
1654             value = attrs.get('value')
1655             if name and value is not None:
1656                 hidden_inputs[name] = value
1657         return hidden_inputs
1658
1659     def _form_hidden_inputs(self, form_id, html):
1660         form = self._search_regex(
1661             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1662             html, '%s form' % form_id, group='form')
1663         return self._hidden_inputs(form)
1664
1665     class FormatSort:
1666         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1667
1668         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1669                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1670                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1671         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1672                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1673                         'fps', 'fs_approx', 'source', 'id')
1674
1675         settings = {
1676             'vcodec': {'type': 'ordered', 'regex': True,
1677                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1678             'acodec': {'type': 'ordered', 'regex': True,
1679                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1680             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1681                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1682             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1683                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1684             'vext': {'type': 'ordered', 'field': 'video_ext',
1685                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1686                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1687             'aext': {'type': 'ordered', 'field': 'audio_ext',
1688                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1689                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1690             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1691             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1692                            'field': ('vcodec', 'acodec'),
1693                            'function': lambda it: int(any(v != 'none' for v in it))},
1694             'ie_pref': {'priority': True, 'type': 'extractor'},
1695             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1696             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1697             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1698             'quality': {'convert': 'float', 'default': -1},
1699             'filesize': {'convert': 'bytes'},
1700             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1701             'id': {'convert': 'string', 'field': 'format_id'},
1702             'height': {'convert': 'float_none'},
1703             'width': {'convert': 'float_none'},
1704             'fps': {'convert': 'float_none'},
1705             'tbr': {'convert': 'float_none'},
1706             'vbr': {'convert': 'float_none'},
1707             'abr': {'convert': 'float_none'},
1708             'asr': {'convert': 'float_none'},
1709             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1710
1711             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1712             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1713             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1714             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1715             'res': {'type': 'multiple', 'field': ('height', 'width'),
1716                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1717
1718             # For compatibility with youtube-dl
1719             'format_id': {'type': 'alias', 'field': 'id'},
1720             'preference': {'type': 'alias', 'field': 'ie_pref'},
1721             'language_preference': {'type': 'alias', 'field': 'lang'},
1722             'source_preference': {'type': 'alias', 'field': 'source'},
1723             'protocol': {'type': 'alias', 'field': 'proto'},
1724             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1725
1726             # Deprecated
1727             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1728             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1729             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1730             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1731             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1732             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1733             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1734             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1735             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1736             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1737             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1738             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1739             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1740             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1741             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1742             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1743             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1744             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1745             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1746             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1747         }
1748
1749         def __init__(self, ie, field_preference):
1750             self._order = []
1751             self.ydl = ie._downloader
1752             self.evaluate_params(self.ydl.params, field_preference)
1753             if ie.get_param('verbose'):
1754                 self.print_verbose_info(self.ydl.write_debug)
1755
1756         def _get_field_setting(self, field, key):
1757             if field not in self.settings:
1758                 if key in ('forced', 'priority'):
1759                     return False
1760                 self.ydl.deprecation_warning(
1761                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1762                     'and may be removed in a future version')
1763                 self.settings[field] = {}
1764             propObj = self.settings[field]
1765             if key not in propObj:
1766                 type = propObj.get('type')
1767                 if key == 'field':
1768                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1769                 elif key == 'convert':
1770                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1771                 else:
1772                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1773                 propObj[key] = default
1774             return propObj[key]
1775
        def _resolve_field_value(self, field, value, convertNone=False):
            """Convert the textual value of a sort field according to the field's 'convert' setting.

            @param field        Name of the sort field
            @param value        Text to convert (it is lowercased first), or None
            @param convertNone  When False, a None value is returned as None
                                without any conversion
            @returns            The converted value; its type depends on the
                                conversion (see the branches below)
            """
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                # Rank the value by its position in the field's order list: earlier entries
                # rank higher. 'order_free' is used instead when free formats are preferred
                # and the field defines it
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                # Values not found in the list rank like the '' entry; without one they rank below every entry
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    # Entries are regexes matched from the start of the value; empty/None entries are skipped
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or  value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                # Any other conversion name ends up here: numeric text becomes a float; otherwise
                # the field is switched to 'string' conversion and the text is returned unchanged
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return value
1809
1810         def evaluate_params(self, params, sort_extractor):
1811             self._use_free_order = params.get('prefer_free_formats', False)
1812             self._sort_user = params.get('format_sort', [])
1813             self._sort_extractor = sort_extractor
1814
1815             def add_item(field, reverse, closest, limit_text):
1816                 field = field.lower()
1817                 if field in self._order:
1818                     return
1819                 self._order.append(field)
1820                 limit = self._resolve_field_value(field, limit_text)
1821                 data = {
1822                     'reverse': reverse,
1823                     'closest': False if limit is None else closest,
1824                     'limit_text': limit_text,
1825                     'limit': limit}
1826                 if field in self.settings:
1827                     self.settings[field].update(data)
1828                 else:
1829                     self.settings[field] = data
1830
1831             sort_list = (
1832                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1833                 + (tuple() if params.get('format_sort_force', False)
1834                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1835                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1836
1837             for item in sort_list:
1838                 match = re.match(self.regex, item)
1839                 if match is None:
1840                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1841                 field = match.group('field')
1842                 if field is None:
1843                     continue
1844                 if self._get_field_setting(field, 'type') == 'alias':
1845                     alias, field = field, self._get_field_setting(field, 'field')
1846                     if self._get_field_setting(alias, 'deprecated'):
1847                         self.ydl.deprecation_warning(
1848                             f'Format sorting alias {alias} is deprecated '
1849                             f'and may be removed in a future version. Please use {field} instead')
1850                 reverse = match.group('reverse') is not None
1851                 closest = match.group('separator') == '~'
1852                 limit_text = match.group('limit')
1853
1854                 has_limit = limit_text is not None
1855                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1856                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1857
1858                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1859                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1860                 limit_count = len(limits)
1861                 for (i, f) in enumerate(fields):
1862                     add_item(f, reverse, closest,
1863                              limits[i] if i < limit_count
1864                              else limits[0] if has_limit and not has_multiple_limits
1865                              else None)
1866
1867         def print_verbose_info(self, write_debug):
1868             if self._sort_user:
1869                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1870             if self._sort_extractor:
1871                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1872             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1873                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1874                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1875                               self._get_field_setting(field, 'limit_text'),
1876                               self._get_field_setting(field, 'limit'))
1877                 if self._get_field_setting(field, 'limit_text') is not None else '')
1878                 for field in self._order if self._get_field_setting(field, 'visible')]))
1879
1880         def _calculate_field_preference_from_value(self, format, field, type, value):
1881             reverse = self._get_field_setting(field, 'reverse')
1882             closest = self._get_field_setting(field, 'closest')
1883             limit = self._get_field_setting(field, 'limit')
1884
1885             if type == 'extractor':
1886                 maximum = self._get_field_setting(field, 'max')
1887                 if value is None or (maximum is not None and value >= maximum):
1888                     value = -1
1889             elif type == 'boolean':
1890                 in_list = self._get_field_setting(field, 'in_list')
1891                 not_in_list = self._get_field_setting(field, 'not_in_list')
1892                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1893             elif type == 'ordered':
1894                 value = self._resolve_field_value(field, value, True)
1895
1896             # try to convert to number
1897             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1898             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1899             if is_num:
1900                 value = val_num
1901
1902             return ((-10, 0) if value is None
1903                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1904                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1905                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1906                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1907                     else (-1, value, 0))
1908
1909         def _calculate_field_preference(self, format, field):
1910             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1911             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1912             if type == 'multiple':
1913                 type = 'field'  # Only 'field' is allowed in multiple for now
1914                 actual_fields = self._get_field_setting(field, 'field')
1915
1916                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1917             else:
1918                 value = get_value(field)
1919             return self._calculate_field_preference_from_value(format, field, type, value)
1920
1921         def calculate_preference(self, format):
1922             # Determine missing protocol
1923             if not format.get('protocol'):
1924                 format['protocol'] = determine_protocol(format)
1925
1926             # Determine missing ext
1927             if not format.get('ext') and 'url' in format:
1928                 format['ext'] = determine_ext(format['url'])
1929             if format.get('vcodec') == 'none':
1930                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1931                 format['video_ext'] = 'none'
1932             else:
1933                 format['video_ext'] = format['ext']
1934                 format['audio_ext'] = 'none'
1935             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1936             #    format['preference'] = -1000
1937
1938             # Determine missing bitrates
1939             if format.get('tbr') is None:
1940                 if format.get('vbr') is not None and format.get('abr') is not None:
1941                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1942             else:
1943                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1944                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1945                 if format.get('acodec') != 'none' and format.get('abr') is None:
1946                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1947
1948             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1949
1950     def _sort_formats(self, formats, field_preference=[]):
1951         if not formats:
1952             return
1953         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1954
1955     def _check_formats(self, formats, video_id):
1956         if formats:
1957             formats[:] = filter(
1958                 lambda f: self._is_valid_url(
1959                     f['url'], video_id,
1960                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1961                 formats)
1962
1963     @staticmethod
1964     def _remove_duplicate_formats(formats):
1965         format_urls = set()
1966         unique_formats = []
1967         for f in formats:
1968             if f['url'] not in format_urls:
1969                 format_urls.add(f['url'])
1970                 unique_formats.append(f)
1971         formats[:] = unique_formats
1972
1973     def _is_valid_url(self, url, video_id, item='video', headers={}):
1974         url = self._proto_relative_url(url, scheme='http:')
1975         # For now assume non HTTP(S) URLs always valid
1976         if not (url.startswith('http://') or url.startswith('https://')):
1977             return True
1978         try:
1979             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1980             return True
1981         except ExtractorError as e:
1982             self.to_screen(
1983                 '%s: %s URL is invalid, skipping: %s'
1984                 % (video_id, item, error_to_compat_str(e.cause)))
1985             return False
1986
1987     def http_scheme(self):
1988         """ Either "http:" or "https:", depending on the user's preferences """
1989         return (
1990             'http:'
1991             if self.get_param('prefer_insecure', False)
1992             else 'https:')
1993
1994     def _proto_relative_url(self, url, scheme=None):
1995         scheme = scheme or self.http_scheme()
1996         assert scheme.endswith(':')
1997         return sanitize_url(url, scheme=scheme[:-1])
1998
1999     def _sleep(self, timeout, video_id, msg_template=None):
2000         if msg_template is None:
2001             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2002         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2003         self.to_screen(msg)
2004         time.sleep(timeout)
2005
2006     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2007                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2008                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2009         res = self._download_xml_handle(
2010             manifest_url, video_id, 'Downloading f4m manifest',
2011             'Unable to download f4m manifest',
2012             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2013             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2014             transform_source=transform_source,
2015             fatal=fatal, data=data, headers=headers, query=query)
2016         if res is False:
2017             return []
2018
2019         manifest, urlh = res
2020         manifest_url = urlh.geturl()
2021
2022         return self._parse_f4m_formats(
2023             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2024             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2025
2026     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2027                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2028                            fatal=True, m3u8_id=None):
2029         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2030             return []
2031
2032         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2033         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2034         if akamai_pv is not None and ';' in akamai_pv.text:
2035             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2036             if playerVerificationChallenge.strip() != '':
2037                 return []
2038
2039         formats = []
2040         manifest_version = '1.0'
2041         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2042         if not media_nodes:
2043             manifest_version = '2.0'
2044             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2045         # Remove unsupported DRM protected media from final formats
2046         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2047         media_nodes = remove_encrypted_media(media_nodes)
2048         if not media_nodes:
2049             return formats
2050
2051         manifest_base_url = get_base_url(manifest)
2052
2053         bootstrap_info = xpath_element(
2054             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2055             'bootstrap info', default=None)
2056
2057         vcodec = None
2058         mime_type = xpath_text(
2059             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2060             'base URL', default=None)
2061         if mime_type and mime_type.startswith('audio/'):
2062             vcodec = 'none'
2063
2064         for i, media_el in enumerate(media_nodes):
2065             tbr = int_or_none(media_el.attrib.get('bitrate'))
2066             width = int_or_none(media_el.attrib.get('width'))
2067             height = int_or_none(media_el.attrib.get('height'))
2068             format_id = join_nonempty(f4m_id, tbr or i)
2069             # If <bootstrapInfo> is present, the specified f4m is a
2070             # stream-level manifest, and only set-level manifests may refer to
2071             # external resources.  See section 11.4 and section 4 of F4M spec
2072             if bootstrap_info is None:
2073                 media_url = None
2074                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2075                 if manifest_version == '2.0':
2076                     media_url = media_el.attrib.get('href')
2077                 if media_url is None:
2078                     media_url = media_el.attrib.get('url')
2079                 if not media_url:
2080                     continue
2081                 manifest_url = (
2082                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2083                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2084                 # If media_url is itself a f4m manifest do the recursive extraction
2085                 # since bitrates in parent manifest (this one) and media_url manifest
2086                 # may differ leading to inability to resolve the format by requested
2087                 # bitrate in f4m downloader
2088                 ext = determine_ext(manifest_url)
2089                 if ext == 'f4m':
2090                     f4m_formats = self._extract_f4m_formats(
2091                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2092                         transform_source=transform_source, fatal=fatal)
2093                     # Sometimes stream-level manifest contains single media entry that
2094                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2095                     # At the same time parent's media entry in set-level manifest may
2096                     # contain it. We will copy it from parent in such cases.
2097                     if len(f4m_formats) == 1:
2098                         f = f4m_formats[0]
2099                         f.update({
2100                             'tbr': f.get('tbr') or tbr,
2101                             'width': f.get('width') or width,
2102                             'height': f.get('height') or height,
2103                             'format_id': f.get('format_id') if not tbr else format_id,
2104                             'vcodec': vcodec,
2105                         })
2106                     formats.extend(f4m_formats)
2107                     continue
2108                 elif ext == 'm3u8':
2109                     formats.extend(self._extract_m3u8_formats(
2110                         manifest_url, video_id, 'mp4', preference=preference,
2111                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2112                     continue
2113             formats.append({
2114                 'format_id': format_id,
2115                 'url': manifest_url,
2116                 'manifest_url': manifest_url,
2117                 'ext': 'flv' if bootstrap_info is not None else None,
2118                 'protocol': 'f4m',
2119                 'tbr': tbr,
2120                 'width': width,
2121                 'height': height,
2122                 'vcodec': vcodec,
2123                 'preference': preference,
2124                 'quality': quality,
2125             })
2126         return formats
2127
2128     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2129         return {
2130             'format_id': join_nonempty(m3u8_id, 'meta'),
2131             'url': m3u8_url,
2132             'ext': ext,
2133             'protocol': 'm3u8',
2134             'preference': preference - 100 if preference else -100,
2135             'quality': quality,
2136             'resolution': 'multiple',
2137             'format_note': 'Quality selection URL',
2138         }
2139
2140     def _report_ignoring_subs(self, name):
2141         self.report_warning(bug_reports_message(
2142             f'Ignoring subtitle tracks found in the {name} manifest; '
2143             'if any subtitle tracks are missing,'
2144         ), only_once=True)
2145
2146     def _extract_m3u8_formats(self, *args, **kwargs):
2147         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2148         if subs:
2149             self._report_ignoring_subs('HLS')
2150         return fmts
2151
2152     def _extract_m3u8_formats_and_subtitles(
2153             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2154             preference=None, quality=None, m3u8_id=None, note=None,
2155             errnote=None, fatal=True, live=False, data=None, headers={},
2156             query={}):
2157
2158         res = self._download_webpage_handle(
2159             m3u8_url, video_id,
2160             note='Downloading m3u8 information' if note is None else note,
2161             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2162             fatal=fatal, data=data, headers=headers, query=query)
2163
2164         if res is False:
2165             return [], {}
2166
2167         m3u8_doc, urlh = res
2168         m3u8_url = urlh.geturl()
2169
2170         return self._parse_m3u8_formats_and_subtitles(
2171             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2172             preference=preference, quality=quality, m3u8_id=m3u8_id,
2173             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2174             headers=headers, query=query, video_id=video_id)
2175
2176     def _parse_m3u8_formats_and_subtitles(
2177             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2178             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2179             errnote=None, fatal=True, data=None, headers={}, query={},
2180             video_id=None):
2181         formats, subtitles = [], {}
2182
2183         has_drm = re.search('|'.join([
2184             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2185             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2186         ]), m3u8_doc)
2187
2188         def format_url(url):
2189             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2190
2191         if self.get_param('hls_split_discontinuity', False):
2192             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2193                 if not m3u8_doc:
2194                     if not manifest_url:
2195                         return []
2196                     m3u8_doc = self._download_webpage(
2197                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2198                         note=False, errnote='Failed to download m3u8 playlist information')
2199                     if m3u8_doc is False:
2200                         return []
2201                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2202
2203         else:
2204             def _extract_m3u8_playlist_indices(*args, **kwargs):
2205                 return [None]
2206
2207         # References:
2208         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2209         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2210         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2211
2212         # We should try extracting formats only from master playlists [1, 4.3.4],
2213         # i.e. playlists that describe available qualities. On the other hand
2214         # media playlists [1, 4.3.3] should be returned as is since they contain
2215         # just the media without qualities renditions.
2216         # Fortunately, master playlist can be easily distinguished from media
2217         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2218         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2219         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2220         # media playlist and MUST NOT appear in master playlist thus we can
2221         # clearly detect media playlist with this criterion.
2222
2223         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2224             formats = [{
2225                 'format_id': join_nonempty(m3u8_id, idx),
2226                 'format_index': idx,
2227                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2228                 'ext': ext,
2229                 'protocol': entry_protocol,
2230                 'preference': preference,
2231                 'quality': quality,
2232                 'has_drm': has_drm,
2233             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2234
2235             return formats, subtitles
2236
2237         groups = {}
2238         last_stream_inf = {}
2239
2240         def extract_media(x_media_line):
2241             media = parse_m3u8_attributes(x_media_line)
2242             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2243             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2244             if not (media_type and group_id and name):
2245                 return
2246             groups.setdefault(group_id, []).append(media)
2247             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2248             if media_type == 'SUBTITLES':
2249                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2250                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2251                 # However, lack of URI has been spotted in the wild.
2252                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2253                 if not media.get('URI'):
2254                     return
2255                 url = format_url(media['URI'])
2256                 sub_info = {
2257                     'url': url,
2258                     'ext': determine_ext(url),
2259                 }
2260                 if sub_info['ext'] == 'm3u8':
2261                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2262                     # files may contain is WebVTT:
2263                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2264                     sub_info['ext'] = 'vtt'
2265                     sub_info['protocol'] = 'm3u8_native'
2266                 lang = media.get('LANGUAGE') or 'und'
2267                 subtitles.setdefault(lang, []).append(sub_info)
2268             if media_type not in ('VIDEO', 'AUDIO'):
2269                 return
2270             media_url = media.get('URI')
2271             if media_url:
2272                 manifest_url = format_url(media_url)
2273                 formats.extend({
2274                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2275                     'format_note': name,
2276                     'format_index': idx,
2277                     'url': manifest_url,
2278                     'manifest_url': m3u8_url,
2279                     'language': media.get('LANGUAGE'),
2280                     'ext': ext,
2281                     'protocol': entry_protocol,
2282                     'preference': preference,
2283                     'quality': quality,
2284                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2285                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2286
2287         def build_stream_name():
2288             # Despite specification does not mention NAME attribute for
2289             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2290             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2291             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2292             stream_name = last_stream_inf.get('NAME')
2293             if stream_name:
2294                 return stream_name
2295             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2296             # from corresponding rendition group
2297             stream_group_id = last_stream_inf.get('VIDEO')
2298             if not stream_group_id:
2299                 return
2300             stream_group = groups.get(stream_group_id)
2301             if not stream_group:
2302                 return stream_group_id
2303             rendition = stream_group[0]
2304             return rendition.get('NAME') or stream_group_id
2305
2306         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2307         # chance to detect video only formats when EXT-X-STREAM-INF tags
2308         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2309         for line in m3u8_doc.splitlines():
2310             if line.startswith('#EXT-X-MEDIA:'):
2311                 extract_media(line)
2312
2313         for line in m3u8_doc.splitlines():
2314             if line.startswith('#EXT-X-STREAM-INF:'):
2315                 last_stream_inf = parse_m3u8_attributes(line)
2316             elif line.startswith('#') or not line.strip():
2317                 continue
2318             else:
2319                 tbr = float_or_none(
2320                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2321                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2322                 manifest_url = format_url(line.strip())
2323
2324                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2325                     format_id = [m3u8_id, None, idx]
2326                     # Bandwidth of live streams may differ over time thus making
2327                     # format_id unpredictable. So it's better to keep provided
2328                     # format_id intact.
2329                     if not live:
2330                         stream_name = build_stream_name()
2331                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2332                     f = {
2333                         'format_id': join_nonempty(*format_id),
2334                         'format_index': idx,
2335                         'url': manifest_url,
2336                         'manifest_url': m3u8_url,
2337                         'tbr': tbr,
2338                         'ext': ext,
2339                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2340                         'protocol': entry_protocol,
2341                         'preference': preference,
2342                         'quality': quality,
2343                     }
2344                     resolution = last_stream_inf.get('RESOLUTION')
2345                     if resolution:
2346                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2347                         if mobj:
2348                             f['width'] = int(mobj.group('width'))
2349                             f['height'] = int(mobj.group('height'))
2350                     # Unified Streaming Platform
2351                     mobj = re.search(
2352                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2353                     if mobj:
2354                         abr, vbr = mobj.groups()
2355                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2356                         f.update({
2357                             'vbr': vbr,
2358                             'abr': abr,
2359                         })
2360                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2361                     f.update(codecs)
2362                     audio_group_id = last_stream_inf.get('AUDIO')
2363                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2364                     # references a rendition group MUST have a CODECS attribute.
2365                     # However, this is not always respected, for example, [2]
2366                     # contains EXT-X-STREAM-INF tag which references AUDIO
2367                     # rendition group but does not have CODECS and despite
2368                     # referencing an audio group it represents a complete
2369                     # (with audio and video) format. So, for such cases we will
2370                     # ignore references to rendition groups and treat them
2371                     # as complete formats.
2372                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2373                         audio_group = groups.get(audio_group_id)
2374                         if audio_group and audio_group[0].get('URI'):
2375                             # TODO: update acodec for audio only formats with
2376                             # the same GROUP-ID
2377                             f['acodec'] = 'none'
2378                     if not f.get('ext'):
2379                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2380                     formats.append(f)
2381
2382                     # for DailyMotion
2383                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2384                     if progressive_uri:
2385                         http_f = f.copy()
2386                         del http_f['manifest_url']
2387                         http_f.update({
2388                             'format_id': f['format_id'].replace('hls-', 'http-'),
2389                             'protocol': 'http',
2390                             'url': progressive_uri,
2391                         })
2392                         formats.append(http_f)
2393
2394                 last_stream_inf = {}
2395         return formats, subtitles
2396
2397     def _extract_m3u8_vod_duration(
2398             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2399
2400         m3u8_vod = self._download_webpage(
2401             m3u8_vod_url, video_id,
2402             note='Downloading m3u8 VOD manifest' if note is None else note,
2403             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2404             fatal=False, data=data, headers=headers, query=query)
2405
2406         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2407
2408     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2409         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2410             return None
2411
2412         return int(sum(
2413             float(line[len('#EXTINF:'):].split(',')[0])
2414             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2415
2416     @staticmethod
2417     def _xpath_ns(path, namespace=None):
2418         if not namespace:
2419             return path
2420         out = []
2421         for c in path.split('/'):
2422             if not c or c == '.':
2423                 out.append(c)
2424             else:
2425                 out.append('{%s}%s' % (namespace, c))
2426         return '/'.join(out)
2427
2428     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2429         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2430         if res is False:
2431             assert not fatal
2432             return [], {}
2433
2434         smil, urlh = res
2435         smil_url = urlh.geturl()
2436
2437         namespace = self._parse_smil_namespace(smil)
2438
2439         fmts = self._parse_smil_formats(
2440             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2441         subs = self._parse_smil_subtitles(
2442             smil, namespace=namespace)
2443
2444         return fmts, subs
2445
2446     def _extract_smil_formats(self, *args, **kwargs):
2447         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2448         if subs:
2449             self._report_ignoring_subs('SMIL')
2450         return fmts
2451
2452     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2453         res = self._download_smil(smil_url, video_id, fatal=fatal)
2454         if res is False:
2455             return {}
2456
2457         smil, urlh = res
2458         smil_url = urlh.geturl()
2459
2460         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2461
2462     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2463         return self._download_xml_handle(
2464             smil_url, video_id, 'Downloading SMIL file',
2465             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2466
2467     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2468         namespace = self._parse_smil_namespace(smil)
2469
2470         formats = self._parse_smil_formats(
2471             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2472         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2473
2474         video_id = os.path.splitext(url_basename(smil_url))[0]
2475         title = None
2476         description = None
2477         upload_date = None
2478         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2479             name = meta.attrib.get('name')
2480             content = meta.attrib.get('content')
2481             if not name or not content:
2482                 continue
2483             if not title and name == 'title':
2484                 title = content
2485             elif not description and name in ('description', 'abstract'):
2486                 description = content
2487             elif not upload_date and name == 'date':
2488                 upload_date = unified_strdate(content)
2489
2490         thumbnails = [{
2491             'id': image.get('type'),
2492             'url': image.get('src'),
2493             'width': int_or_none(image.get('width')),
2494             'height': int_or_none(image.get('height')),
2495         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2496
2497         return {
2498             'id': video_id,
2499             'title': title or video_id,
2500             'description': description,
2501             'upload_date': upload_date,
2502             'thumbnails': thumbnails,
2503             'formats': formats,
2504             'subtitles': subtitles,
2505         }
2506
2507     def _parse_smil_namespace(self, smil):
2508         return self._search_regex(
2509             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2510
2511     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2512         base = smil_url
2513         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2514             b = meta.get('base') or meta.get('httpBase')
2515             if b:
2516                 base = b
2517                 break
2518
2519         formats = []
2520         rtmp_count = 0
2521         http_count = 0
2522         m3u8_count = 0
2523         imgs_count = 0
2524
2525         srcs = set()
2526         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2527         for medium in media:
2528             src = medium.get('src')
2529             if not src or src in srcs:
2530                 continue
2531             srcs.add(src)
2532
2533             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2534             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2535             width = int_or_none(medium.get('width'))
2536             height = int_or_none(medium.get('height'))
2537             proto = medium.get('proto')
2538             ext = medium.get('ext')
2539             src_ext = determine_ext(src)
2540             streamer = medium.get('streamer') or base
2541
2542             if proto == 'rtmp' or streamer.startswith('rtmp'):
2543                 rtmp_count += 1
2544                 formats.append({
2545                     'url': streamer,
2546                     'play_path': src,
2547                     'ext': 'flv',
2548                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2549                     'tbr': bitrate,
2550                     'filesize': filesize,
2551                     'width': width,
2552                     'height': height,
2553                 })
2554                 if transform_rtmp_url:
2555                     streamer, src = transform_rtmp_url(streamer, src)
2556                     formats[-1].update({
2557                         'url': streamer,
2558                         'play_path': src,
2559                     })
2560                 continue
2561
2562             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2563             src_url = src_url.strip()
2564
2565             if proto == 'm3u8' or src_ext == 'm3u8':
2566                 m3u8_formats = self._extract_m3u8_formats(
2567                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2568                 if len(m3u8_formats) == 1:
2569                     m3u8_count += 1
2570                     m3u8_formats[0].update({
2571                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2572                         'tbr': bitrate,
2573                         'width': width,
2574                         'height': height,
2575                     })
2576                 formats.extend(m3u8_formats)
2577             elif src_ext == 'f4m':
2578                 f4m_url = src_url
2579                 if not f4m_params:
2580                     f4m_params = {
2581                         'hdcore': '3.2.0',
2582                         'plugin': 'flowplayer-3.2.0.1',
2583                     }
2584                 f4m_url += '&' if '?' in f4m_url else '?'
2585                 f4m_url += urllib.parse.urlencode(f4m_params)
2586                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2587             elif src_ext == 'mpd':
2588                 formats.extend(self._extract_mpd_formats(
2589                     src_url, video_id, mpd_id='dash', fatal=False))
2590             elif re.search(r'\.ism/[Mm]anifest', src_url):
2591                 formats.extend(self._extract_ism_formats(
2592                     src_url, video_id, ism_id='mss', fatal=False))
2593             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2594                 http_count += 1
2595                 formats.append({
2596                     'url': src_url,
2597                     'ext': ext or src_ext or 'flv',
2598                     'format_id': 'http-%d' % (bitrate or http_count),
2599                     'tbr': bitrate,
2600                     'filesize': filesize,
2601                     'width': width,
2602                     'height': height,
2603                 })
2604
2605         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2606             src = medium.get('src')
2607             if not src or src in srcs:
2608                 continue
2609             srcs.add(src)
2610
2611             imgs_count += 1
2612             formats.append({
2613                 'format_id': 'imagestream-%d' % (imgs_count),
2614                 'url': src,
2615                 'ext': mimetype2ext(medium.get('type')),
2616                 'acodec': 'none',
2617                 'vcodec': 'none',
2618                 'width': int_or_none(medium.get('width')),
2619                 'height': int_or_none(medium.get('height')),
2620                 'format_note': 'SMIL storyboards',
2621             })
2622
2623         return formats
2624
2625     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2626         urls = []
2627         subtitles = {}
2628         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2629             src = textstream.get('src')
2630             if not src or src in urls:
2631                 continue
2632             urls.append(src)
2633             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2634             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2635             subtitles.setdefault(lang, []).append({
2636                 'url': src,
2637                 'ext': ext,
2638             })
2639         return subtitles
2640
2641     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2642         res = self._download_xml_handle(
2643             xspf_url, playlist_id, 'Downloading xpsf playlist',
2644             'Unable to download xspf manifest', fatal=fatal)
2645         if res is False:
2646             return []
2647
2648         xspf, urlh = res
2649         xspf_url = urlh.geturl()
2650
2651         return self._parse_xspf(
2652             xspf, playlist_id, xspf_url=xspf_url,
2653             xspf_base_url=base_url(xspf_url))
2654
2655     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2656         NS_MAP = {
2657             'xspf': 'http://xspf.org/ns/0/',
2658             's1': 'http://static.streamone.nl/player/ns/0',
2659         }
2660
2661         entries = []
2662         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2663             title = xpath_text(
2664                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2665             description = xpath_text(
2666                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2667             thumbnail = xpath_text(
2668                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2669             duration = float_or_none(
2670                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2671
2672             formats = []
2673             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2674                 format_url = urljoin(xspf_base_url, location.text)
2675                 if not format_url:
2676                     continue
2677                 formats.append({
2678                     'url': format_url,
2679                     'manifest_url': xspf_url,
2680                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2681                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2682                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2683                 })
2684             self._sort_formats(formats)
2685
2686             entries.append({
2687                 'id': playlist_id,
2688                 'title': title,
2689                 'description': description,
2690                 'thumbnail': thumbnail,
2691                 'duration': duration,
2692                 'formats': formats,
2693             })
2694         return entries
2695
2696     def _extract_mpd_formats(self, *args, **kwargs):
2697         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2698         if subs:
2699             self._report_ignoring_subs('DASH')
2700         return fmts
2701
2702     def _extract_mpd_formats_and_subtitles(
2703             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2704             fatal=True, data=None, headers={}, query={}):
2705         res = self._download_xml_handle(
2706             mpd_url, video_id,
2707             note='Downloading MPD manifest' if note is None else note,
2708             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2709             fatal=fatal, data=data, headers=headers, query=query)
2710         if res is False:
2711             return [], {}
2712         mpd_doc, urlh = res
2713         if mpd_doc is None:
2714             return [], {}
2715
2716         # We could have been redirected to a new url when we retrieved our mpd file.
2717         mpd_url = urlh.geturl()
2718         mpd_base_url = base_url(mpd_url)
2719
2720         return self._parse_mpd_formats_and_subtitles(
2721             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2722
2723     def _parse_mpd_formats(self, *args, **kwargs):
2724         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2725         if subs:
2726             self._report_ignoring_subs('DASH')
2727         return fmts
2728
2729     def _parse_mpd_formats_and_subtitles(
2730             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2731         """
2732         Parse formats from MPD manifest.
2733         References:
2734          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2735             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2736          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2737         """
2738         if not self.get_param('dynamic_mpd', True):
2739             if mpd_doc.get('type') == 'dynamic':
2740                 return [], {}
2741
2742         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2743
2744         def _add_ns(path):
2745             return self._xpath_ns(path, namespace)
2746
2747         def is_drm_protected(element):
2748             return element.find(_add_ns('ContentProtection')) is not None
2749
2750         def extract_multisegment_info(element, ms_parent_info):
2751             ms_info = ms_parent_info.copy()
2752
2753             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2754             # common attributes and elements.  We will only extract relevant
2755             # for us.
2756             def extract_common(source):
2757                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2758                 if segment_timeline is not None:
2759                     s_e = segment_timeline.findall(_add_ns('S'))
2760                     if s_e:
2761                         ms_info['total_number'] = 0
2762                         ms_info['s'] = []
2763                         for s in s_e:
2764                             r = int(s.get('r', 0))
2765                             ms_info['total_number'] += 1 + r
2766                             ms_info['s'].append({
2767                                 't': int(s.get('t', 0)),
2768                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2769                                 'd': int(s.attrib['d']),
2770                                 'r': r,
2771                             })
2772                 start_number = source.get('startNumber')
2773                 if start_number:
2774                     ms_info['start_number'] = int(start_number)
2775                 timescale = source.get('timescale')
2776                 if timescale:
2777                     ms_info['timescale'] = int(timescale)
2778                 segment_duration = source.get('duration')
2779                 if segment_duration:
2780                     ms_info['segment_duration'] = float(segment_duration)
2781
2782             def extract_Initialization(source):
2783                 initialization = source.find(_add_ns('Initialization'))
2784                 if initialization is not None:
2785                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2786
2787             segment_list = element.find(_add_ns('SegmentList'))
2788             if segment_list is not None:
2789                 extract_common(segment_list)
2790                 extract_Initialization(segment_list)
2791                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2792                 if segment_urls_e:
2793                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2794             else:
2795                 segment_template = element.find(_add_ns('SegmentTemplate'))
2796                 if segment_template is not None:
2797                     extract_common(segment_template)
2798                     media = segment_template.get('media')
2799                     if media:
2800                         ms_info['media'] = media
2801                     initialization = segment_template.get('initialization')
2802                     if initialization:
2803                         ms_info['initialization'] = initialization
2804                     else:
2805                         extract_Initialization(segment_template)
2806             return ms_info
2807
2808         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2809         formats, subtitles = [], {}
2810         stream_numbers = collections.defaultdict(int)
2811         for period in mpd_doc.findall(_add_ns('Period')):
2812             period_duration = parse_duration(period.get('duration')) or mpd_duration
2813             period_ms_info = extract_multisegment_info(period, {
2814                 'start_number': 1,
2815                 'timescale': 1,
2816             })
2817             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2818                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2819                 for representation in adaptation_set.findall(_add_ns('Representation')):
2820                     representation_attrib = adaptation_set.attrib.copy()
2821                     representation_attrib.update(representation.attrib)
2822                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2823                     mime_type = representation_attrib['mimeType']
2824                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2825
2826                     codec_str = representation_attrib.get('codecs', '')
2827                     # Some kind of binary subtitle found in some youtube livestreams
2828                     if mime_type == 'application/x-rawcc':
2829                         codecs = {'scodec': codec_str}
2830                     else:
2831                         codecs = parse_codecs(codec_str)
2832                     if content_type not in ('video', 'audio', 'text'):
2833                         if mime_type == 'image/jpeg':
2834                             content_type = mime_type
2835                         elif codecs.get('vcodec', 'none') != 'none':
2836                             content_type = 'video'
2837                         elif codecs.get('acodec', 'none') != 'none':
2838                             content_type = 'audio'
2839                         elif codecs.get('scodec', 'none') != 'none':
2840                             content_type = 'text'
2841                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2842                             content_type = 'text'
2843                         else:
2844                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2845                             continue
2846
2847                     base_url = ''
2848                     for element in (representation, adaptation_set, period, mpd_doc):
2849                         base_url_e = element.find(_add_ns('BaseURL'))
2850                         if try_call(lambda: base_url_e.text) is not None:
2851                             base_url = base_url_e.text + base_url
2852                             if re.match(r'^https?://', base_url):
2853                                 break
2854                     if mpd_base_url and base_url.startswith('/'):
2855                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2856                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2857                         if not mpd_base_url.endswith('/'):
2858                             mpd_base_url += '/'
2859                         base_url = mpd_base_url + base_url
2860                     representation_id = representation_attrib.get('id')
2861                     lang = representation_attrib.get('lang')
2862                     url_el = representation.find(_add_ns('BaseURL'))
2863                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2864                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2865                     if representation_id is not None:
2866                         format_id = representation_id
2867                     else:
2868                         format_id = content_type
2869                     if mpd_id:
2870                         format_id = mpd_id + '-' + format_id
2871                     if content_type in ('video', 'audio'):
2872                         f = {
2873                             'format_id': format_id,
2874                             'manifest_url': mpd_url,
2875                             'ext': mimetype2ext(mime_type),
2876                             'width': int_or_none(representation_attrib.get('width')),
2877                             'height': int_or_none(representation_attrib.get('height')),
2878                             'tbr': float_or_none(bandwidth, 1000),
2879                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2880                             'fps': int_or_none(representation_attrib.get('frameRate')),
2881                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2882                             'format_note': 'DASH %s' % content_type,
2883                             'filesize': filesize,
2884                             'container': mimetype2ext(mime_type) + '_dash',
2885                             **codecs
2886                         }
2887                     elif content_type == 'text':
2888                         f = {
2889                             'ext': mimetype2ext(mime_type),
2890                             'manifest_url': mpd_url,
2891                             'filesize': filesize,
2892                         }
2893                     elif content_type == 'image/jpeg':
2894                         # See test case in VikiIE
2895                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2896                         f = {
2897                             'format_id': format_id,
2898                             'ext': 'mhtml',
2899                             'manifest_url': mpd_url,
2900                             'format_note': 'DASH storyboards (jpeg)',
2901                             'acodec': 'none',
2902                             'vcodec': 'none',
2903                         }
2904                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2905                         f['has_drm'] = True
2906                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2907
2908                     def prepare_template(template_name, identifiers):
2909                         tmpl = representation_ms_info[template_name]
2910                         # First of, % characters outside $...$ templates
2911                         # must be escaped by doubling for proper processing
2912                         # by % operator string formatting used further (see
2913                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2914                         t = ''
2915                         in_template = False
2916                         for c in tmpl:
2917                             t += c
2918                             if c == '$':
2919                                 in_template = not in_template
2920                             elif c == '%' and not in_template:
2921                                 t += c
2922                         # Next, $...$ templates are translated to their
2923                         # %(...) counterparts to be used with % operator
2924                         if representation_id is not None:
2925                             t = t.replace('$RepresentationID$', representation_id)
2926                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2927                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2928                         t.replace('$$', '$')
2929                         return t
2930
2931                     # @initialization is a regular template like @media one
2932                     # so it should be handled just the same way (see
2933                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2934                     if 'initialization' in representation_ms_info:
2935                         initialization_template = prepare_template(
2936                             'initialization',
2937                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2938                             # $Time$ shall not be included for @initialization thus
2939                             # only $Bandwidth$ remains
2940                             ('Bandwidth', ))
2941                         representation_ms_info['initialization_url'] = initialization_template % {
2942                             'Bandwidth': bandwidth,
2943                         }
2944
2945                     def location_key(location):
2946                         return 'url' if re.match(r'^https?://', location) else 'path'
2947
2948                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2949
2950                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2951                         media_location_key = location_key(media_template)
2952
2953                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2954                         # can't be used at the same time
2955                         if '%(Number' in media_template and 's' not in representation_ms_info:
2956                             segment_duration = None
2957                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2958                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2959                                 representation_ms_info['total_number'] = int(math.ceil(
2960                                     float_or_none(period_duration, segment_duration, default=0)))
2961                             representation_ms_info['fragments'] = [{
2962                                 media_location_key: media_template % {
2963                                     'Number': segment_number,
2964                                     'Bandwidth': bandwidth,
2965                                 },
2966                                 'duration': segment_duration,
2967                             } for segment_number in range(
2968                                 representation_ms_info['start_number'],
2969                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2970                         else:
2971                             # $Number*$ or $Time$ in media template with S list available
2972                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2973                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2974                             representation_ms_info['fragments'] = []
2975                             segment_time = 0
2976                             segment_d = None
2977                             segment_number = representation_ms_info['start_number']
2978
2979                             def add_segment_url():
2980                                 segment_url = media_template % {
2981                                     'Time': segment_time,
2982                                     'Bandwidth': bandwidth,
2983                                     'Number': segment_number,
2984                                 }
2985                                 representation_ms_info['fragments'].append({
2986                                     media_location_key: segment_url,
2987                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2988                                 })
2989
2990                             for num, s in enumerate(representation_ms_info['s']):
2991                                 segment_time = s.get('t') or segment_time
2992                                 segment_d = s['d']
2993                                 add_segment_url()
2994                                 segment_number += 1
2995                                 for r in range(s.get('r', 0)):
2996                                     segment_time += segment_d
2997                                     add_segment_url()
2998                                     segment_number += 1
2999                                 segment_time += segment_d
3000                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3001                         # No media template
3002                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
3003                         # or any YouTube dashsegments video
3004                         fragments = []
3005                         segment_index = 0
3006                         timescale = representation_ms_info['timescale']
3007                         for s in representation_ms_info['s']:
3008                             duration = float_or_none(s['d'], timescale)
3009                             for r in range(s.get('r', 0) + 1):
3010                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3011                                 fragments.append({
3012                                     location_key(segment_uri): segment_uri,
3013                                     'duration': duration,
3014                                 })
3015                                 segment_index += 1
3016                         representation_ms_info['fragments'] = fragments
3017                     elif 'segment_urls' in representation_ms_info:
3018                         # Segment URLs with no SegmentTimeline
3019                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3020                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3021                         fragments = []
3022                         segment_duration = float_or_none(
3023                             representation_ms_info['segment_duration'],
3024                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3025                         for segment_url in representation_ms_info['segment_urls']:
3026                             fragment = {
3027                                 location_key(segment_url): segment_url,
3028                             }
3029                             if segment_duration:
3030                                 fragment['duration'] = segment_duration
3031                             fragments.append(fragment)
3032                         representation_ms_info['fragments'] = fragments
3033                     # If there is a fragments key available then we correctly recognized fragmented media.
3034                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3035                     # assumption is not necessarily correct since we may simply have no support for
3036                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3037                     if 'fragments' in representation_ms_info:
3038                         f.update({
3039                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3040                             'url': mpd_url or base_url,
3041                             'fragment_base_url': base_url,
3042                             'fragments': [],
3043                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3044                         })
3045                         if 'initialization_url' in representation_ms_info:
3046                             initialization_url = representation_ms_info['initialization_url']
3047                             if not f.get('url'):
3048                                 f['url'] = initialization_url
3049                             f['fragments'].append({location_key(initialization_url): initialization_url})
3050                         f['fragments'].extend(representation_ms_info['fragments'])
3051                         if not period_duration:
3052                             period_duration = try_get(
3053                                 representation_ms_info,
3054                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3055                     else:
3056                         # Assuming direct URL to unfragmented media.
3057                         f['url'] = base_url
3058                     if content_type in ('video', 'audio', 'image/jpeg'):
3059                         f['manifest_stream_number'] = stream_numbers[f['url']]
3060                         stream_numbers[f['url']] += 1
3061                         formats.append(f)
3062                     elif content_type == 'text':
3063                         subtitles.setdefault(lang or 'und', []).append(f)
3064
3065         return formats, subtitles
3066
3067     def _extract_ism_formats(self, *args, **kwargs):
3068         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3069         if subs:
3070             self._report_ignoring_subs('ISM')
3071         return fmts
3072
3073     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3074         res = self._download_xml_handle(
3075             ism_url, video_id,
3076             note='Downloading ISM manifest' if note is None else note,
3077             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3078             fatal=fatal, data=data, headers=headers, query=query)
3079         if res is False:
3080             return [], {}
3081         ism_doc, urlh = res
3082         if ism_doc is None:
3083             return [], {}
3084
3085         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3086
3087     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3088         """
3089         Parse formats from ISM manifest.
3090         References:
3091          1. [MS-SSTR]: Smooth Streaming Protocol,
3092             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3093         """
3094         if ism_doc.get('IsLive') == 'TRUE':
3095             return [], {}
3096
3097         duration = int(ism_doc.attrib['Duration'])
3098         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3099
3100         formats = []
3101         subtitles = {}
3102         for stream in ism_doc.findall('StreamIndex'):
3103             stream_type = stream.get('Type')
3104             if stream_type not in ('video', 'audio', 'text'):
3105                 continue
3106             url_pattern = stream.attrib['Url']
3107             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3108             stream_name = stream.get('Name')
3109             stream_language = stream.get('Language', 'und')
3110             for track in stream.findall('QualityLevel'):
3111                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3112                 # TODO: add support for WVC1 and WMAP
3113                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3114                     self.report_warning('%s is not a supported codec' % fourcc)
3115                     continue
3116                 tbr = int(track.attrib['Bitrate']) // 1000
3117                 # [1] does not mention Width and Height attributes. However,
3118                 # they're often present while MaxWidth and MaxHeight are
3119                 # missing, so should be used as fallbacks
3120                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3121                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3122                 sampling_rate = int_or_none(track.get('SamplingRate'))
3123
3124                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3125                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3126
3127                 fragments = []
3128                 fragment_ctx = {
3129                     'time': 0,
3130                 }
3131                 stream_fragments = stream.findall('c')
3132                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3133                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3134                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3135                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3136                     if not fragment_ctx['duration']:
3137                         try:
3138                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3139                         except IndexError:
3140                             next_fragment_time = duration
3141                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3142                     for _ in range(fragment_repeat):
3143                         fragments.append({
3144                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3145                             'duration': fragment_ctx['duration'] / stream_timescale,
3146                         })
3147                         fragment_ctx['time'] += fragment_ctx['duration']
3148
3149                 if stream_type == 'text':
3150                     subtitles.setdefault(stream_language, []).append({
3151                         'ext': 'ismt',
3152                         'protocol': 'ism',
3153                         'url': ism_url,
3154                         'manifest_url': ism_url,
3155                         'fragments': fragments,
3156                         '_download_params': {
3157                             'stream_type': stream_type,
3158                             'duration': duration,
3159                             'timescale': stream_timescale,
3160                             'fourcc': fourcc,
3161                             'language': stream_language,
3162                             'codec_private_data': track.get('CodecPrivateData'),
3163                         }
3164                     })
3165                 elif stream_type in ('video', 'audio'):
3166                     formats.append({
3167                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3168                         'url': ism_url,
3169                         'manifest_url': ism_url,
3170                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3171                         'width': width,
3172                         'height': height,
3173                         'tbr': tbr,
3174                         'asr': sampling_rate,
3175                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3176                         'acodec': 'none' if stream_type == 'video' else fourcc,
3177                         'protocol': 'ism',
3178                         'fragments': fragments,
3179                         'has_drm': ism_doc.find('Protection') is not None,
3180                         '_download_params': {
3181                             'stream_type': stream_type,
3182                             'duration': duration,
3183                             'timescale': stream_timescale,
3184                             'width': width or 0,
3185                             'height': height or 0,
3186                             'fourcc': fourcc,
3187                             'language': stream_language,
3188                             'codec_private_data': track.get('CodecPrivateData'),
3189                             'sampling_rate': sampling_rate,
3190                             'channels': int_or_none(track.get('Channels', 2)),
3191                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3192                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3193                         },
3194                     })
3195         return formats, subtitles
3196
3197     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3198         def absolute_url(item_url):
3199             return urljoin(base_url, item_url)
3200
3201         def parse_content_type(content_type):
3202             if not content_type:
3203                 return {}
3204             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3205             if ctr:
3206                 mimetype, codecs = ctr.groups()
3207                 f = parse_codecs(codecs)
3208                 f['ext'] = mimetype2ext(mimetype)
3209                 return f
3210             return {}
3211
3212         def _media_formats(src, cur_media_type, type_info=None):
3213             type_info = type_info or {}
3214             full_url = absolute_url(src)
3215             ext = type_info.get('ext') or determine_ext(full_url)
3216             if ext == 'm3u8':
3217                 is_plain_url = False
3218                 formats = self._extract_m3u8_formats(
3219                     full_url, video_id, ext='mp4',
3220                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3221                     preference=preference, quality=quality, fatal=False)
3222             elif ext == 'mpd':
3223                 is_plain_url = False
3224                 formats = self._extract_mpd_formats(
3225                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3226             else:
3227                 is_plain_url = True
3228                 formats = [{
3229                     'url': full_url,
3230                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3231                     'ext': ext,
3232                 }]
3233             return is_plain_url, formats
3234
3235         entries = []
3236         # amp-video and amp-audio are very similar to their HTML5 counterparts
3237         # so we will include them right here (see
3238         # https://www.ampproject.org/docs/reference/components/amp-video)
3239         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3240         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3241         media_tags = [(media_tag, media_tag_name, media_type, '')
3242                       for media_tag, media_tag_name, media_type
3243                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3244         media_tags.extend(re.findall(
3245             # We only allow video|audio followed by a whitespace or '>'.
3246             # Allowing more characters may end up in significant slow down (see
3247             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3248             # http://www.porntrex.com/maps/videositemap.xml).
3249             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3250         for media_tag, _, media_type, media_content in media_tags:
3251             media_info = {
3252                 'formats': [],
3253                 'subtitles': {},
3254             }
3255             media_attributes = extract_attributes(media_tag)
3256             src = strip_or_none(media_attributes.get('src'))
3257             if src:
3258                 f = parse_content_type(media_attributes.get('type'))
3259                 _, formats = _media_formats(src, media_type, f)
3260                 media_info['formats'].extend(formats)
3261             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3262             if media_content:
3263                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3264                     s_attr = extract_attributes(source_tag)
3265                     # data-video-src and data-src are non standard but seen
3266                     # several times in the wild
3267                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3268                     if not src:
3269                         continue
3270                     f = parse_content_type(s_attr.get('type'))
3271                     is_plain_url, formats = _media_formats(src, media_type, f)
3272                     if is_plain_url:
3273                         # width, height, res, label and title attributes are
3274                         # all not standard but seen several times in the wild
3275                         labels = [
3276                             s_attr.get(lbl)
3277                             for lbl in ('label', 'title')
3278                             if str_or_none(s_attr.get(lbl))
3279                         ]
3280                         width = int_or_none(s_attr.get('width'))
3281                         height = (int_or_none(s_attr.get('height'))
3282                                   or int_or_none(s_attr.get('res')))
3283                         if not width or not height:
3284                             for lbl in labels:
3285                                 resolution = parse_resolution(lbl)
3286                                 if not resolution:
3287                                     continue
3288                                 width = width or resolution.get('width')
3289                                 height = height or resolution.get('height')
3290                         for lbl in labels:
3291                             tbr = parse_bitrate(lbl)
3292                             if tbr:
3293                                 break
3294                         else:
3295                             tbr = None
3296                         f.update({
3297                             'width': width,
3298                             'height': height,
3299                             'tbr': tbr,
3300                             'format_id': s_attr.get('label') or s_attr.get('title'),
3301                         })
3302                         f.update(formats[0])
3303                         media_info['formats'].append(f)
3304                     else:
3305                         media_info['formats'].extend(formats)
3306                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3307                     track_attributes = extract_attributes(track_tag)
3308                     kind = track_attributes.get('kind')
3309                     if not kind or kind in ('subtitles', 'captions'):
3310                         src = strip_or_none(track_attributes.get('src'))
3311                         if not src:
3312                             continue
3313                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3314                         media_info['subtitles'].setdefault(lang, []).append({
3315                             'url': absolute_url(src),
3316                         })
3317             for f in media_info['formats']:
3318                 f.setdefault('http_headers', {})['Referer'] = base_url
3319             if media_info['formats'] or media_info['subtitles']:
3320                 entries.append(media_info)
3321         return entries
3322
3323     def _extract_akamai_formats(self, *args, **kwargs):
3324         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3325         if subs:
3326             self._report_ignoring_subs('akamai')
3327         return fmts
3328
3329     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3330         signed = 'hdnea=' in manifest_url
3331         if not signed:
3332             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3333             manifest_url = re.sub(
3334                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3335                 '', manifest_url).strip('?')
3336
3337         formats = []
3338         subtitles = {}
3339
3340         hdcore_sign = 'hdcore=3.7.0'
3341         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3342         hds_host = hosts.get('hds')
3343         if hds_host:
3344             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3345         if 'hdcore=' not in f4m_url:
3346             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3347         f4m_formats = self._extract_f4m_formats(
3348             f4m_url, video_id, f4m_id='hds', fatal=False)
3349         for entry in f4m_formats:
3350             entry.update({'extra_param_to_segment_url': hdcore_sign})
3351         formats.extend(f4m_formats)
3352
3353         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3354         hls_host = hosts.get('hls')
3355         if hls_host:
3356             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3357         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3358             m3u8_url, video_id, 'mp4', 'm3u8_native',
3359             m3u8_id='hls', fatal=False)
3360         formats.extend(m3u8_formats)
3361         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3362
3363         http_host = hosts.get('http')
3364         if http_host and m3u8_formats and not signed:
3365             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3366             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3367             qualities_length = len(qualities)
3368             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3369                 i = 0
3370                 for f in m3u8_formats:
3371                     if f['vcodec'] != 'none':
3372                         for protocol in ('http', 'https'):
3373                             http_f = f.copy()
3374                             del http_f['manifest_url']
3375                             http_url = re.sub(
3376                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3377                             http_f.update({
3378                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3379                                 'url': http_url,
3380                                 'protocol': protocol,
3381                             })
3382                             formats.append(http_f)
3383                         i += 1
3384
3385         return formats, subtitles
3386
3387     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3388         query = urllib.parse.urlparse(url).query
3389         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3390         mobj = re.search(
3391             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3392         url_base = mobj.group('url')
3393         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3394         formats = []
3395
3396         def manifest_url(manifest):
3397             m_url = f'{http_base_url}/{manifest}'
3398             if query:
3399                 m_url += '?%s' % query
3400             return m_url
3401
3402         if 'm3u8' not in skip_protocols:
3403             formats.extend(self._extract_m3u8_formats(
3404                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3405                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3406         if 'f4m' not in skip_protocols:
3407             formats.extend(self._extract_f4m_formats(
3408                 manifest_url('manifest.f4m'),
3409                 video_id, f4m_id='hds', fatal=False))
3410         if 'dash' not in skip_protocols:
3411             formats.extend(self._extract_mpd_formats(
3412                 manifest_url('manifest.mpd'),
3413                 video_id, mpd_id='dash', fatal=False))
3414         if re.search(r'(?:/smil:|\.smil)', url_base):
3415             if 'smil' not in skip_protocols:
3416                 rtmp_formats = self._extract_smil_formats(
3417                     manifest_url('jwplayer.smil'),
3418                     video_id, fatal=False)
3419                 for rtmp_format in rtmp_formats:
3420                     rtsp_format = rtmp_format.copy()
3421                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3422                     del rtsp_format['play_path']
3423                     del rtsp_format['ext']
3424                     rtsp_format.update({
3425                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3426                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3427                         'protocol': 'rtsp',
3428                     })
3429                     formats.extend([rtmp_format, rtsp_format])
3430         else:
3431             for protocol in ('rtmp', 'rtsp'):
3432                 if protocol not in skip_protocols:
3433                     formats.append({
3434                         'url': f'{protocol}:{url_base}',
3435                         'format_id': protocol,
3436                         'protocol': protocol,
3437                     })
3438         return formats
3439
3440     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3441         mobj = re.search(
3442             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3443             webpage)
3444         if mobj:
3445             try:
3446                 jwplayer_data = self._parse_json(mobj.group('options'),
3447                                                  video_id=video_id,
3448                                                  transform_source=transform_source)
3449             except ExtractorError:
3450                 pass
3451             else:
3452                 if isinstance(jwplayer_data, dict):
3453                     return jwplayer_data
3454
3455     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3456         jwplayer_data = self._find_jwplayer_data(
3457             webpage, video_id, transform_source=js_to_json)
3458         return self._parse_jwplayer_data(
3459             jwplayer_data, video_id, *args, **kwargs)
3460
3461     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3462                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3463         # JWPlayer backward compatibility: flattened playlists
3464         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3465         if 'playlist' not in jwplayer_data:
3466             jwplayer_data = {'playlist': [jwplayer_data]}
3467
3468         entries = []
3469
3470         # JWPlayer backward compatibility: single playlist item
3471         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3472         if not isinstance(jwplayer_data['playlist'], list):
3473             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3474
3475         for video_data in jwplayer_data['playlist']:
3476             # JWPlayer backward compatibility: flattened sources
3477             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3478             if 'sources' not in video_data:
3479                 video_data['sources'] = [video_data]
3480
3481             this_video_id = video_id or video_data['mediaid']
3482
3483             formats = self._parse_jwplayer_formats(
3484                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3485                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3486
3487             subtitles = {}
3488             tracks = video_data.get('tracks')
3489             if tracks and isinstance(tracks, list):
3490                 for track in tracks:
3491                     if not isinstance(track, dict):
3492                         continue
3493                     track_kind = track.get('kind')
3494                     if not track_kind or not isinstance(track_kind, str):
3495                         continue
3496                     if track_kind.lower() not in ('captions', 'subtitles'):
3497                         continue
3498                     track_url = urljoin(base_url, track.get('file'))
3499                     if not track_url:
3500                         continue
3501                     subtitles.setdefault(track.get('label') or 'en', []).append({
3502                         'url': self._proto_relative_url(track_url)
3503                     })
3504
3505             entry = {
3506                 'id': this_video_id,
3507                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3508                 'description': clean_html(video_data.get('description')),
3509                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3510                 'timestamp': int_or_none(video_data.get('pubdate')),
3511                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3512                 'subtitles': subtitles,
3513             }
3514             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3515             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3516                 entry.update({
3517                     '_type': 'url_transparent',
3518                     'url': formats[0]['url'],
3519                 })
3520             else:
3521                 self._sort_formats(formats)
3522                 entry['formats'] = formats
3523             entries.append(entry)
3524         if len(entries) == 1:
3525             return entries[0]
3526         else:
3527             return self.playlist_result(entries)
3528
3529     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3530                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3531         urls = []
3532         formats = []
3533         for source in jwplayer_sources_data:
3534             if not isinstance(source, dict):
3535                 continue
3536             source_url = urljoin(
3537                 base_url, self._proto_relative_url(source.get('file')))
3538             if not source_url or source_url in urls:
3539                 continue
3540             urls.append(source_url)
3541             source_type = source.get('type') or ''
3542             ext = mimetype2ext(source_type) or determine_ext(source_url)
3543             if source_type == 'hls' or ext == 'm3u8':
3544                 formats.extend(self._extract_m3u8_formats(
3545                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3546                     m3u8_id=m3u8_id, fatal=False))
3547             elif source_type == 'dash' or ext == 'mpd':
3548                 formats.extend(self._extract_mpd_formats(
3549                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3550             elif ext == 'smil':
3551                 formats.extend(self._extract_smil_formats(
3552                     source_url, video_id, fatal=False))
3553             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3554             elif source_type.startswith('audio') or ext in (
3555                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3556                 formats.append({
3557                     'url': source_url,
3558                     'vcodec': 'none',
3559                     'ext': ext,
3560                 })
3561             else:
3562                 height = int_or_none(source.get('height'))
3563                 if height is None:
3564                     # Often no height is provided but there is a label in
3565                     # format like "1080p", "720p SD", or 1080.
3566                     height = int_or_none(self._search_regex(
3567                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3568                         'height', default=None))
3569                 a_format = {
3570                     'url': source_url,
3571                     'width': int_or_none(source.get('width')),
3572                     'height': height,
3573                     'tbr': int_or_none(source.get('bitrate')),
3574                     'ext': ext,
3575                 }
3576                 if source_url.startswith('rtmp'):
3577                     a_format['ext'] = 'flv'
3578                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3579                     # of jwplayer.flash.swf
3580                     rtmp_url_parts = re.split(
3581                         r'((?:mp4|mp3|flv):)', source_url, 1)
3582                     if len(rtmp_url_parts) == 3:
3583                         rtmp_url, prefix, play_path = rtmp_url_parts
3584                         a_format.update({
3585                             'url': rtmp_url,
3586                             'play_path': prefix + play_path,
3587                         })
3588                     if rtmp_params:
3589                         a_format.update(rtmp_params)
3590                 formats.append(a_format)
3591         return formats
3592
3593     def _live_title(self, name):
3594         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3595         return name
3596
3597     def _int(self, v, name, fatal=False, **kwargs):
3598         res = int_or_none(v, **kwargs)
3599         if res is None:
3600             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3601             if fatal:
3602                 raise ExtractorError(msg)
3603             else:
3604                 self.report_warning(msg)
3605         return res
3606
3607     def _float(self, v, name, fatal=False, **kwargs):
3608         res = float_or_none(v, **kwargs)
3609         if res is None:
3610             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3611             if fatal:
3612                 raise ExtractorError(msg)
3613             else:
3614                 self.report_warning(msg)
3615         return res
3616
3617     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3618                     path='/', secure=False, discard=False, rest={}, **kwargs):
3619         cookie = http.cookiejar.Cookie(
3620             0, name, value, port, port is not None, domain, True,
3621             domain.startswith('.'), path, True, secure, expire_time,
3622             discard, None, None, rest)
3623         self.cookiejar.set_cookie(cookie)
3624
3625     def _get_cookies(self, url):
3626         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3627         return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
3628
3629     def _apply_first_set_cookie_header(self, url_handle, cookie):
3630         """
3631         Apply first Set-Cookie header instead of the last. Experimental.
3632
3633         Some sites (e.g. [1-3]) may serve two cookies under the same name
3634         in Set-Cookie header and expect the first (old) one to be set rather
3635         than second (new). However, as of RFC6265 the newer one cookie
3636         should be set into cookie store what actually happens.
3637         We will workaround this issue by resetting the cookie to
3638         the first one manually.
3639         1. https://new.vk.com/
3640         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3641         3. https://learning.oreilly.com/
3642         """
3643         for header, cookies in url_handle.headers.items():
3644             if header.lower() != 'set-cookie':
3645                 continue
3646             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3647             cookie_value = re.search(
3648                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3649             if cookie_value:
3650                 value, domain = cookie_value.groups()
3651                 self._set_cookie(domain, cookie, value)
3652                 break
3653
3654     @classmethod
3655     def get_testcases(cls, include_onlymatching=False):
3656         t = getattr(cls, '_TEST', None)
3657         if t:
3658             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3659             tests = [t]
3660         else:
3661             tests = getattr(cls, '_TESTS', [])
3662         for t in tests:
3663             if not include_onlymatching and t.get('only_matching', False):
3664                 continue
3665             t['name'] = cls.ie_key()
3666             yield t
3667
3668     @classproperty
3669     def age_limit(cls):
3670         """Get age limit from the testcases"""
3671         return max(traverse_obj(
3672             tuple(cls.get_testcases(include_onlymatching=False)),
3673             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3674
3675     @classmethod
3676     def is_suitable(cls, age_limit):
3677         """Test whether the extractor is generally suitable for the given age limit"""
3678         return not age_restricted(cls.age_limit, age_limit)
3679
3680     @classmethod
3681     def description(cls, *, markdown=True, search_examples=None):
3682         """Description of the extractor"""
3683         desc = ''
3684         if cls._NETRC_MACHINE:
3685             if markdown:
3686                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3687             else:
3688                 desc += f' [{cls._NETRC_MACHINE}]'
3689         if cls.IE_DESC is False:
3690             desc += ' [HIDDEN]'
3691         elif cls.IE_DESC:
3692             desc += f' {cls.IE_DESC}'
3693         if cls.SEARCH_KEY:
3694             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3695             if search_examples:
3696                 _COUNTS = ('', '5', '10', 'all')
3697                 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3698         if not cls.working():
3699             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3700
3701         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3702         return f'{name}:{desc}' if desc else name
3703
3704     def extract_subtitles(self, *args, **kwargs):
3705         if (self.get_param('writesubtitles', False)
3706                 or self.get_param('listsubtitles')):
3707             return self._get_subtitles(*args, **kwargs)
3708         return {}
3709
3710     def _get_subtitles(self, *args, **kwargs):
3711         raise NotImplementedError('This method must be implemented by subclasses')
3712
3713     def extract_comments(self, *args, **kwargs):
3714         if not self.get_param('getcomments'):
3715             return None
3716         generator = self._get_comments(*args, **kwargs)
3717
3718         def extractor():
3719             comments = []
3720             interrupted = True
3721             try:
3722                 while True:
3723                     comments.append(next(generator))
3724             except StopIteration:
3725                 interrupted = False
3726             except KeyboardInterrupt:
3727                 self.to_screen('Interrupted by user')
3728             except Exception as e:
3729                 if self.get_param('ignoreerrors') is not True:
3730                     raise
3731                 self._downloader.report_error(e)
3732             comment_count = len(comments)
3733             self.to_screen(f'Extracted {comment_count} comments')
3734             return {
3735                 'comments': comments,
3736                 'comment_count': None if interrupted else comment_count
3737             }
3738         return extractor
3739
3740     def _get_comments(self, *args, **kwargs):
3741         raise NotImplementedError('This method must be implemented by subclasses')
3742
3743     @staticmethod
3744     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3745         """ Merge subtitle items for one language. Items with duplicated URLs/data
3746         will be dropped. """
3747         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3748         ret = list(subtitle_list1)
3749         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3750         return ret
3751
3752     @classmethod
3753     def _merge_subtitles(cls, *dicts, target=None):
3754         """ Merge subtitle dictionaries, language by language. """
3755         if target is None:
3756             target = {}
3757         for d in dicts:
3758             for lang, subs in d.items():
3759                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3760         return target
3761
3762     def extract_automatic_captions(self, *args, **kwargs):
3763         if (self.get_param('writeautomaticsub', False)
3764                 or self.get_param('listsubtitles')):
3765             return self._get_automatic_captions(*args, **kwargs)
3766         return {}
3767
3768     def _get_automatic_captions(self, *args, **kwargs):
3769         raise NotImplementedError('This method must be implemented by subclasses')
3770
3771     @functools.cached_property
3772     def _cookies_passed(self):
3773         """Whether cookies have been passed to YoutubeDL"""
3774         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3775
3776     def mark_watched(self, *args, **kwargs):
3777         if not self.get_param('mark_watched', False):
3778             return
3779         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3780             self._mark_watched(*args, **kwargs)
3781
3782     def _mark_watched(self, *args, **kwargs):
3783         raise NotImplementedError('This method must be implemented by subclasses')
3784
3785     def geo_verification_headers(self):
3786         headers = {}
3787         geo_verification_proxy = self.get_param('geo_verification_proxy')
3788         if geo_verification_proxy:
3789             headers['Ytdl-request-proxy'] = geo_verification_proxy
3790         return headers
3791
3792     @staticmethod
3793     def _generic_id(url):
3794         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3795
3796     @staticmethod
3797     def _generic_title(url):
3798         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3799
3800     @staticmethod
3801     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3802         all_known = all(map(
3803             lambda x: x is not None,
3804             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3805         return (
3806             'private' if is_private
3807             else 'premium_only' if needs_premium
3808             else 'subscriber_only' if needs_subscription
3809             else 'needs_auth' if needs_auth
3810             else 'unlisted' if is_unlisted
3811             else 'public' if all_known
3812             else None)
3813
3814     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3815         '''
3816         @returns            A list of values for the extractor argument given by "key"
3817                             or "default" if no such key is present
3818         @param default      The default value to return when the key is not present (default: [])
3819         @param casesense    When false, the values are converted to lower case
3820         '''
3821         val = traverse_obj(
3822             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3823         if val is None:
3824             return [] if default is NO_DEFAULT else default
3825         return list(val) if casesense else [x.lower() for x in val]
3826
3827     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3828         if not playlist_id or not video_id:
3829             return not video_id
3830
3831         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3832         if no_playlist is not None:
3833             return not no_playlist
3834
3835         video_id = '' if video_id is True else f' {video_id}'
3836         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3837         if self.get_param('noplaylist'):
3838             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3839             return False
3840         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3841         return True
3842
3843     @classmethod
3844     def extract_from_webpage(cls, ydl, url, webpage):
3845         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3846               else ydl.get_info_extractor(cls.ie_key()))
3847         yield from ie._extract_from_webpage(url, webpage) or []
3848
3849     @classmethod
3850     def _extract_from_webpage(cls, url, webpage):
3851         for embed_url in orderedSet(
3852                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3853             yield cls.url_result(embed_url, cls)
3854
3855     @classmethod
3856     def _extract_embed_urls(cls, url, webpage):
3857         """@returns all the embed urls on the webpage"""
3858         if '_EMBED_URL_RE' not in cls.__dict__:
3859             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3860             for idx, regex in enumerate(cls._EMBED_REGEX):
3861                 assert regex.count('(?P<url>') == 1, \
3862                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3863             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3864
3865         for regex in cls._EMBED_URL_RE:
3866             for mobj in regex.finditer(webpage):
3867                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3868                 if cls._VALID_URL is False or cls.suitable(embed_url):
3869                     yield embed_url
3870
3871     class StopExtraction(Exception):
3872         pass
3873
3874
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|N):{query}, where N is
    a positive integer without leading zeros.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classproperty
    def _VALID_URL(cls):
        search_key = cls._SEARCH_KEY
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % search_key

    def _real_extract(self, query):
        """Dispatch to _get_n_results: 1 result without prefix, _MAX_RESULTS for "all", else the given number"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Cap the request at what the extractor declares it can return
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # An infinite "n" means no upper bound for islice
        limit = None if n == float('inf') else n
        return self.playlist_result(
            itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for _SEARCH_KEY"""
        return cls._SEARCH_KEY