import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
from ..cookies import LenientSimpleCookie
from ..downloader import FileDownloader
from ..downloader.f4m import get_base_url, remove_encrypted_media
    parse_m3u8_attributes,
91 """Information Extractor class.
93 Information extractors are the classes that, given a URL, extract
94 information about the video (or videos) the URL refers to. This
95 information includes the real video URL, the video title, author and
96 others. The information is stored in a dictionary which is then
97 passed to the YoutubeDL. The YoutubeDL processes this
98 information possibly downloading the video to the file system, among
99 other possible outcomes.
101 The type field determines the type of the result.
102 By far the most common value (and the default if _type is missing) is
103 "video", which indicates a single video.
105 For a video, the dictionaries must include the following fields:
107 id: Video identifier.
108 title: Video title, unescaped. Set to an empty string if video has
109 no title as opposed to "None" which signifies that the
110 extractor failed to obtain a title
112 Additionally, it must contain either a formats entry or a url one:
114 formats: A list of dictionaries for each format available, ordered
115 from worst to best quality.
118 * url The mandatory URL representing the media:
119 for plain file media - HTTP URL of this file,
121 for HLS - URL of the M3U8 media playlist,
122 for HDS - URL of the F4M manifest,
124 - HTTP URL to plain file media (in case of
126 - URL of the MPD manifest or base URL
127 representing the media if MPD manifest
128 is parsed from a string (in case of
130 for MSS - URL of the ISM manifest.
132 The URL of the manifest file in case of
134 for HLS - URL of the M3U8 master playlist,
135 for HDS - URL of the F4M manifest,
136 for DASH - URL of the MPD manifest,
137 for MSS - URL of the ISM manifest.
138 * manifest_stream_number (For internal use only)
139 The index of the stream in the manifest file
140 * ext Will be calculated from URL if missing
141 * format A human-readable description of the format
142 ("mp4 container with h264/opus").
143 Calculated from the format_id, width, height.
144 and format_note fields if missing.
145 * format_id A short description of the format
146 ("mp4_h264_opus" or "19").
147 Technically optional, but strongly recommended.
148 * format_note Additional info about the format
149 ("3D" or "DASH video")
150 * width Width of the video, if known
151 * height Height of the video, if known
152 * resolution Textual description of width and height
153 * dynamic_range The dynamic range of the video. One of:
154 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
155 * tbr Average bitrate of audio and video in KBit/s
156 * abr Average audio bitrate in KBit/s
157 * acodec Name of the audio codec in use
158 * asr Audio sampling rate in Hertz
159 * audio_channels Number of audio channels
160 * vbr Average video bitrate in KBit/s
162 * vcodec Name of the video codec in use
163 * container Name of the container format
164 * filesize The number of bytes, if known in advance
165 * filesize_approx An estimate for the number of bytes
166 * player_url SWF Player URL (used for rtmpdump).
167 * protocol The protocol that will be used for the actual
168 download, lower-case. One of "http", "https" or
169 one of the protocols defined in downloader.PROTOCOL_MAP
171 Base URL for fragments. Each fragment's path
172 value (if present) will be relative to
174 * fragments A list of fragments of a fragmented media.
175 Each fragment entry must contain either an url
176 or a path. If an url is present it should be
177 considered by a client. Otherwise both path and
178 fragment_base_url must be present. Here is
179 the list of all potential fields:
180 * "url" - fragment's URL
181 * "path" - fragment's path relative to
183 * "duration" (optional, int or float)
184 * "filesize" (optional, int)
185 * is_from_start Is a live format that can be downloaded
186 from the start. Boolean
187 * preference Order number of this format. If this field is
188 present and not None, the formats get sorted
189 by this field, regardless of all other values.
190 -1 for default (order by other properties),
191 -2 or smaller for less than default.
192 < -1000 to hide the format (if there is
193 another one which is strictly better)
194 * language Language code, e.g. "de" or "en-US".
195 * language_preference Is this in the language mentioned in
197 10 if it's what the URL is about,
198 -1 for default (don't know),
199 -10 otherwise, other values reserved for now.
200 * quality Order number of the video quality of this
201 format, irrespective of the file format.
202 -1 for default (order by other properties),
203 -2 or smaller for less than default.
204 * source_preference Order number for this video source
205 (quality takes higher priority)
206 -1 for default (order by other properties),
207 -2 or smaller for less than default.
208 * http_headers A dictionary of additional HTTP headers
209 to add to the request.
210 * stretched_ratio If given and not 1, indicates that the
211 video's pixels are not square.
212 width : height ratio as float.
213 * no_resume The server does not support resuming the
214 (HTTP or RTMP) download. Boolean.
215 * has_drm The format has DRM and cannot be downloaded. Boolean
216 * downloader_options A dictionary of downloader options
217 (For internal use only)
218 * http_chunk_size Chunk size for HTTP downloads
219 * ffmpeg_args Extra arguments for ffmpeg downloader
220 RTMP formats can also have the additional fields: page_url,
221 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
222 rtmp_protocol, rtmp_real_time
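
                    As an illustration only (all values are hypothetical, not
                    taken from any particular site), a single formats entry
                    might look like:

                        {
                            'url': 'https://cdn.example.com/video_720.mp4',
                            'format_id': '720p',
                            'ext': 'mp4',
                            'width': 1280,
                            'height': 720,
                            'vcodec': 'avc1.64001f',
                            'acodec': 'mp4a.40.2',
                            'tbr': 1500,
                            'filesize': 34567890,
                        }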

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}")
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.
    composer:       Composer of the piece

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
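
    For illustration only, a minimal single-video result might look like this
    (all values are hypothetical):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://cdn.example.com/video.mp4',
            'ext': 'mp4',
            'uploader': 'Some Uploader',
            'duration': 123.0,
        }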

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"

    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.

    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known.

    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
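
    For illustration only (hypothetical URLs), an extractor might return:

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example.com/embed/abc123',
            'title': 'Title taken from the embedding page',
            'description': 'Description already known to the referring site',
        }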

    Subclasses of this should also be added to the list of extractors and
    should define a _VALID_URL regexp and re-define the _real_extract() and
    (optionally) _real_initialize() methods.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.
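
    As an illustration only (the site, URL pattern and field values below are
    hypothetical), a minimal subclass might look like:

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }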

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _x_forwarded_for_ip = None
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _NETRC_MACHINE = None

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None
557 """Getter method for _WORKING."""

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        self._initialize_pre_login()
        if self.supports_login():
            username, password = self._get_login_info()
            if username:
                self._perform_login(username, password)
        elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
            self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
        self._real_initialize()

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
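
        For example, an extractor that only learns the allowed countries during
        extraction might call (values are hypothetical):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })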
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except http.client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')
749 """A string for getting the InfoExtractor with get_info_extractor"""
750 return cls
.__name
__[:-2]
754 return cls
.__name
__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, urllib.error.HTTPError)
        if expected_status is None:
            return err.code < 400
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            return update_Request(url_or_request, data=data, headers=headers, query=query)
        if query:
            url_or_request = update_url_query(url_or_request, query)
        return sanitized_Request(url_or_request, data, headers or {})

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None,
                         fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, urllib.error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
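
        For example (the URL is hypothetical), to also accept a 404 response
        as a valid page:

            webpage, urlh = self._download_webpage_handle(
                'https://example.com/video/123', video_id, expected_status=404)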
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, str):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            filename = self._request_dump_filename(urlh.geturl(), video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def __print_error(self, errnote, fatal, video_id, err):
        if fatal:
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
        else:
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
        try:
            return json.loads(
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

    def __create_download_methods(name, parser, note, errnote, return_value):

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'transform_source': transform_source,
                'encoding': encoding,
                'expected_status': expected_status,
            }
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        """

        R''' # NB: These are unused; should they be deprecated?
        if tries != 1:
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            timeout = 2
        else:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
        '''

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except http.client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        msg += format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
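
        For example (the pattern and the webpage variable are hypothetical):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title', default=None)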
        """
        if string is None:
            mobj = None
        elif isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        if default is NO_DEFAULT:
            default, has_default = {}, False
        else:
            fatal, has_default = False, True

        json_string = self._search_regex(
            rf'(?:{start_pattern})\s*(?P<json>{{\s*(?:{contains_pattern})\s*}})\s*(?:{end_pattern})',
            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
        if not json_string:
            return default

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        try:
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
            if fatal:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
        return default

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        return clean_html(res).strip()

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available, look in the netrc file using the netrc_machine or _NETRC_MACHINE.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa:
            return tfa

        return getpass.getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
                       % {'prop': re.escape(prop), 'sep': '(?::|[:-])'})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, *, fatal=False, **kargs):
        return self._og_search_property('title', html, fatal=fatal, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
        return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)
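
    # Illustrative usage inside _real_extract (the webpage variable and meta
    # names below are hypothetical):
    #   title = self._og_search_title(webpage) or self._html_search_meta(
    #       ['twitter:title', 'title'], webpage, 'title', default=None)
    #   thumbnail = self._og_search_thumbnail(webpage)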

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    @staticmethod
    def _rta_search(html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18

        # And then there are the jokers who advertise that they use RTA, but actually don't.
        AGE_LIMIT_MARKERS = [
            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
        ]
        if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        if not rating:
            return None
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
        """Yield all json ld objects in the html"""
        if default is not NO_DEFAULT:
            fatal = False
        for mobj in re.finditer(JSON_LD_RE, html):
            json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
            for json_ld in variadic(json_ld_item):
                if isinstance(json_ld, dict):
                    yield json_ld

    def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
        """Search for a video in any json ld in the html"""
        if default is not NO_DEFAULT:
            fatal = False
        info = self._json_ld(
            list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
            video_id, fatal=fatal, expected_type=expected_type)
        if info:
            return info
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
1463 def _json_ld(self
, json_ld
, video_id
, fatal
=True, expected_type
=None):
1464 if isinstance(json_ld
, str):
1465 json_ld
= self
._parse
_json
(json_ld
, video_id
, fatal
=fatal
)
1469 if not isinstance(json_ld
, (list, tuple, dict)):
1471 if isinstance(json_ld
, dict):
1474 INTERACTION_TYPE_MAP
= {
1475 'CommentAction': 'comment',
1476 'AgreeAction': 'like',
1477 'DisagreeAction': 'dislike',
1478 'LikeAction': 'like',
1479 'DislikeAction': 'dislike',
1480 'ListenAction': 'view',
1481 'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            assert is_type(e, 'VideoObject')
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property (inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in variadic(json_ld):
                if at_top_level and '@context' not in e:
                    continue
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject'):
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
    def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
        return self._parse_json(
            self._search_regex(
                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
                webpage, 'next.js data', fatal=fatal, **kw),
            video_id, transform_source=transform_source, fatal=fatal)
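    # Illustrative sketch (assumption, not from the original file): a typical
    # call site for _search_nextjs_data() in a site extractor, drilling into the
    # parsed __NEXT_DATA__ payload with traverse_obj(). The key path
    # ('props', 'pageProps', ...) is hypothetical and differs per site:
    #
    #   nextjs = self._search_nextjs_data(webpage, video_id, fatal=False) or {}
    #   video_data = traverse_obj(nextjs, ('props', 'pageProps', 'video')) or {}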
    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
        rectx = re.escape(context_name)
        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
        js, arg_keys, arg_vals = self._search_regex(
            (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)

        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))

        for key, val in args.items():
            if val in ('undefined', 'void 0'):
                args[key] = 'null'

        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
        return traverse_obj(ret, traverse) or {}
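    # Illustrative sketch (assumption): _search_nuxt_data() targets pages that
    # assign `window.__NUXT__ = (function(a, b, ...){return {...}}(...))`.
    # A hypothetical extractor could pull the state and traverse it:
    #
    #   nuxt = self._search_nuxt_data(webpage, video_id, fatal=False)
    #   title = traverse_obj(nuxt, ('video', 'title'))  # site-specific path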
    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not input:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
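    # Illustrative sketch (assumption): hidden <input> fields are typically
    # collected and re-submitted when a site gates the video behind a form,
    # e.g. an age or consent check on a hypothetical confirmation page:
    #
    #   data = self._form_hidden_inputs('confirm-form', webpage)
    #   data['confirm'] = '1'
    #   webpage = self._download_webpage(
    #       url, video_id, 'Confirming age', data=urlencode_postdata(data))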
    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
                   'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'channels': {'convert': 'float_none', 'field': 'audio_channels'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # Actual field names
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'audio_channels': {'type': 'alias', 'field': 'channels'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }

        def __init__(self, ie, field_preference):
            self._order = []
            self.ydl = ie._downloader
            self.evaluate_params(self.ydl.params, field_preference)
            if ie.get_param('verbose'):
                self.print_verbose_info(self.ydl.write_debug)
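        # Illustrative note (assumption, not from the original file): the sort
        # order merged by evaluate_params() is built from strings like the ones
        # users pass via --format-sort. For example 'res:1080,+size' means
        # "prefer resolutions up to 1080p, then prefer the *smallest* size";
        # 'res' and 'size' are keys of the settings table above, ':1080' is a
        # limit and the leading '+' reverses that field, e.g.
        #
        #   InfoExtractor.FormatSort(ie, ['res:1080', '+size'])  # hypothetical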
        def _get_field_setting(self, field, key):
            if field not in self.settings:
                if key in ('forced', 'priority'):
                    return False
                self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                            'deprecated and may be removed in a future version')
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return self._resolve_field_value(field, value, convertNone)
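        # Illustrative sketch (assumption): _resolve_field_value() turns the
        # textual limit of a sort key into something comparable, e.g.
        #
        #   self._resolve_field_value('filesize', '500M')  # -> bytes via parse_bytes
        #   self._resolve_field_value('vcodec', 'vp9')     # -> position in the 'order' list
        #
        # Unknown string values fall through to the numeric/string fallback at
        # the bottom of the method.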
        def evaluate_params(self, params, sort_extractor):
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    alias, field = field, self._get_field_setting(field, 'field')
                    if self._get_field_setting(alias, 'deprecated'):
                        self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                    f'be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))
        def _calculate_field_preference_from_value(self, format, field, type, value):
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
            else:
                value = get_value(field)

            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #    format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != 'none' and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != 'none' and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)
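        # Illustrative sketch (assumption): calculate_preference() returns a
        # tuple of per-field tuples, so Python's lexicographic tuple ordering
        # does the actual sorting and the "best" format ends up last. Given an
        # already initialised extractor `ie` (hypothetical):
        #
        #   sorter = InfoExtractor.FormatSort(ie, [])
        #   key_a = sorter.calculate_preference({'format_id': 'a', 'ext': 'mp4', 'url': '...'})
        #   key_b = sorter.calculate_preference({'format_id': 'b', 'ext': 'webm', 'url': '...'})
        #   # formats.sort(key=sorter.calculate_preference) then orders worst to best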
    def _sort_formats(self, formats, field_preference=[]):
        if not formats:
            return
        formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video', headers={}):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
            return True
        except ExtractorError as e:
            self.to_screen(
                '%s: %s URL is invalid, skipping: %s'
                % (video_id, item, error_to_compat_str(e.cause)))
            return False

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self.get_param('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        scheme = scheme or self.http_scheme()
        assert scheme.endswith(':')
        return sanitize_url(url, scheme=scheme[:-1])

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
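    # Illustrative sketch (assumption): _proto_relative_url() pairs with
    # http_scheme() to fix scheme-relative URLs scraped from pages, e.g.
    #
    #   self._proto_relative_url('//cdn.example.com/video.mp4')
    #   # -> 'https://cdn.example.com/video.mp4' (or http: with --prefer-insecure)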
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        res = self._download_xml_handle(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return []

        manifest, urlh = res
        manifest_url = urlh.geturl()

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = join_nonempty(f4m_id, tbr or i)
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
        return {
            'format_id': join_nonempty(m3u8_id, 'meta'),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 100 if preference else -100,
            'quality': quality,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }
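    # Illustrative sketch (assumption): a site extractor usually hands an HDS
    # (f4m) manifest URL straight to _extract_f4m_formats(); recursion into
    # nested stream-level manifests and the m3u8 fallback are handled above.
    # The URL below is hypothetical:
    #
    #   formats = self._extract_f4m_formats(
    #       'https://example.com/manifest.f4m', video_id, f4m_id='hds', fatal=False)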
    def _report_ignoring_subs(self, name):
        self.report_warning(bug_reports_message(
            f'Ignoring subtitle tracks found in the {name} manifest; '
            'if any subtitle tracks are missing,'
        ), only_once=True)

    def _extract_m3u8_formats(self, *args, **kwargs):
        fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('HLS')
        return fmts

    def _extract_m3u8_formats_and_subtitles(
            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, note=None,
            errnote=None, fatal=True, live=False, data=None, headers={},
            query={}):

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note='Downloading m3u8 information' if note is None else note,
            errnote='Failed to download m3u8 information' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)

        if res is False:
            return [], {}

        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        return self._parse_m3u8_formats_and_subtitles(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)
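    # Illustrative sketch (assumption): the usual HLS entry point for site
    # extractors; the URL below is hypothetical:
    #
    #   fmts, subs = self._extract_m3u8_formats_and_subtitles(
    #       'https://example.com/master.m3u8', video_id, 'mp4',
    #       entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
    #   formats.extend(fmts)
    #   self._merge_subtitles(subs, target=subtitles)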
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        formats, subtitles = [], {}

        has_drm = re.search('|'.join([
            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
        ]), m3u8_doc)

        def format_url(url):
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
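    # Illustrative note (assumption): the parser above distinguishes a media
    # playlist (has #EXT-X-TARGETDURATION, returned as a single format) from a
    # master playlist (EXT-X-STREAM-INF / EXT-X-MEDIA tags, expanded into one
    # format per variant and rendition). A minimal hypothetical master playlist
    # it would expand into two variants:
    #
    #   #EXTM3U
    #   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
    #   low/index.m3u8
    #   #EXT-X-STREAM-INF:BANDWIDTH=2560000,RESOLUTION=1280x720
    #   high/index.m3u8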
    def _extract_m3u8_vod_duration(
            self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):

        m3u8_vod = self._download_webpage(
            m3u8_vod_url, video_id,
            note='Downloading m3u8 VOD manifest' if note is None else note,
            errnote='Failed to download VOD manifest' if errnote is None else errnote,
            fatal=False, data=data, headers=headers, query=query)

        return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)

    def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
        if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
            return None

        return int(sum(
            float(line[len('#EXTINF:'):].split(',')[0])
            for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None

    @staticmethod
    def _xpath_ns(path, namespace=None):
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
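    # Illustrative sketch (assumption): _xpath_ns() simply prefixes each path
    # component with the document namespace so ElementTree lookups work, e.g.
    #
    #   self._xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
    #   # -> './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'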
2437 def _extract_smil_formats_and_subtitles(self
, smil_url
, video_id
, fatal
=True, f4m_params
=None, transform_source
=None):
2438 res
= self
._download
_smil
(smil_url
, video_id
, fatal
=fatal
, transform_source
=transform_source
)
2444 smil_url
= urlh
.geturl()
2446 namespace
= self
._parse
_smil
_namespace
(smil
)
2448 fmts
= self
._parse
_smil
_formats
(
2449 smil
, smil_url
, video_id
, namespace
=namespace
, f4m_params
=f4m_params
)
2450 subs
= self
._parse
_smil
_subtitles
(
2451 smil
, namespace
=namespace
)
2455 def _extract_smil_formats(self
, *args
, **kwargs
):
2456 fmts
, subs
= self
._extract
_smil
_formats
_and
_subtitles
(*args
, **kwargs
)
2458 self
._report
_ignoring
_subs
('SMIL')
2461 def _extract_smil_info(self
, smil_url
, video_id
, fatal
=True, f4m_params
=None):
2462 res
= self
._download
_smil
(smil_url
, video_id
, fatal
=fatal
)
2467 smil_url
= urlh
.geturl()
2469 return self
._parse
_smil
(smil
, smil_url
, video_id
, f4m_params
=f4m_params
)
2471 def _download_smil(self
, smil_url
, video_id
, fatal
=True, transform_source
=None):
2472 return self
._download
_xml
_handle
(
2473 smil_url
, video_id
, 'Downloading SMIL file',
2474 'Unable to download SMIL file', fatal
=fatal
, transform_source
=transform_source
)
2476 def _parse_smil(self
, smil
, smil_url
, video_id
, f4m_params
=None):
2477 namespace
= self
._parse
_smil
_namespace
(smil
)
2479 formats
= self
._parse
_smil
_formats
(
2480 smil
, smil_url
, video_id
, namespace
=namespace
, f4m_params
=f4m_params
)
2481 subtitles
= self
._parse
_smil
_subtitles
(smil
, namespace
=namespace
)
2483 video_id
= os
.path
.splitext(url_basename(smil_url
))[0]
2487 for meta
in smil
.findall(self
._xpath
_ns
('./head/meta', namespace
)):
2488 name
= meta
.attrib
.get('name')
2489 content
= meta
.attrib
.get('content')
2490 if not name
or not content
:
2492 if not title
and name
== 'title':
2494 elif not description
and name
in ('description', 'abstract'):
2495 description
= content
2496 elif not upload_date
and name
== 'date':
2497 upload_date
= unified_strdate(content
)
2500 'id': image
.get('type'),
2501 'url': image
.get('src'),
2502 'width': int_or_none(image
.get('width')),
2503 'height': int_or_none(image
.get('height')),
2504 } for image
in smil
.findall(self
._xpath
_ns
('.//image', namespace
)) if image
.get('src')]
2508 'title': title
or video_id
,
2509 'description': description
,
2510 'upload_date': upload_date
,
2511 'thumbnails': thumbnails
,
2513 'subtitles': subtitles
,
2516 def _parse_smil_namespace(self
, smil
):
2517 return self
._search
_regex
(
2518 r
'(?i)^{([^}]+)?}smil$', smil
.tag
, 'namespace', default
=None)
2520 def _parse_smil_formats(self
, smil
, smil_url
, video_id
, namespace
=None, f4m_params
=None, transform_rtmp_url
=None):
2522 for meta
in smil
.findall(self
._xpath
_ns
('./head/meta', namespace
)):
2523 b
= meta
.get('base') or meta
.get('httpBase')
2535 media
= smil
.findall(self
._xpath
_ns
('.//video', namespace
)) + smil
.findall(self
._xpath
_ns
('.//audio', namespace
))
2536 for medium
in media
:
2537 src
= medium
.get('src')
2538 if not src
or src
in srcs
:
2542 bitrate
= float_or_none(medium
.get('system-bitrate') or medium
.get('systemBitrate'), 1000)
2543 filesize
= int_or_none(medium
.get('size') or medium
.get('fileSize'))
2544 width
= int_or_none(medium
.get('width'))
2545 height
= int_or_none(medium
.get('height'))
2546 proto
= medium
.get('proto')
2547 ext
= medium
.get('ext')
2548 src_ext
= determine_ext(src
)
2549 streamer
= medium
.get('streamer') or base
2551 if proto
== 'rtmp' or streamer
.startswith('rtmp'):
2557 'format_id': 'rtmp-%d' % (rtmp_count
if bitrate
is None else bitrate
),
2559 'filesize': filesize
,
2563 if transform_rtmp_url
:
2564 streamer
, src
= transform_rtmp_url(streamer
, src
)
2565 formats
[-1].update({
2571 src_url
= src
if src
.startswith('http') else urllib
.parse
.urljoin(base
, src
)
2572 src_url
= src_url
.strip()
2574 if proto
== 'm3u8' or src_ext
== 'm3u8':
2575 m3u8_formats
= self
._extract
_m
3u8_formats
(
2576 src_url
, video_id
, ext
or 'mp4', m3u8_id
='hls', fatal
=False)
2577 if len(m3u8_formats
) == 1:
2579 m3u8_formats
[0].update({
2580 'format_id': 'hls-%d' % (m3u8_count
if bitrate
is None else bitrate
),
2585 formats
.extend(m3u8_formats
)
2586 elif src_ext
== 'f4m':
2591 'plugin': 'flowplayer-3.2.0.1',
2593 f4m_url
+= '&' if '?' in f4m_url
else '?'
2594 f4m_url
+= urllib
.parse
.urlencode(f4m_params
)
2595 formats
.extend(self
._extract
_f
4m
_formats
(f4m_url
, video_id
, f4m_id
='hds', fatal
=False))
2596 elif src_ext
== 'mpd':
2597 formats
.extend(self
._extract
_mpd
_formats
(
2598 src_url
, video_id
, mpd_id
='dash', fatal
=False))
2599 elif re
.search(r
'\.ism/[Mm]anifest', src_url
):
2600 formats
.extend(self
._extract
_ism
_formats
(
2601 src_url
, video_id
, ism_id
='mss', fatal
=False))
2602 elif src_url
.startswith('http') and self
._is
_valid
_url
(src
, video_id
):
2606 'ext': ext
or src_ext
or 'flv',
2607 'format_id': 'http-%d' % (bitrate
or http_count
),
2609 'filesize': filesize
,
2614 for medium
in smil
.findall(self
._xpath
_ns
('.//imagestream', namespace
)):
2615 src
= medium
.get('src')
2616 if not src
or src
in srcs
:
2622 'format_id': 'imagestream-%d' % (imgs_count
),
2624 'ext': mimetype2ext(medium
.get('type')),
2627 'width': int_or_none(medium
.get('width')),
2628 'height': int_or_none(medium
.get('height')),
2629 'format_note': 'SMIL storyboards',
2634 def _parse_smil_subtitles(self
, smil
, namespace
=None, subtitles_lang
='en'):
2637 for num
, textstream
in enumerate(smil
.findall(self
._xpath
_ns
('.//textstream', namespace
))):
2638 src
= textstream
.get('src')
2639 if not src
or src
in urls
:
2642 ext
= textstream
.get('ext') or mimetype2ext(textstream
.get('type')) or determine_ext(src
)
2643 lang
= textstream
.get('systemLanguage') or textstream
.get('systemLanguageName') or textstream
.get('lang') or subtitles_lang
2644 subtitles
.setdefault(lang
, []).append({
2650 def _extract_xspf_playlist(self
, xspf_url
, playlist_id
, fatal
=True):
2651 res
= self
._download
_xml
_handle
(
2652 xspf_url
, playlist_id
, 'Downloading xpsf playlist',
2653 'Unable to download xspf manifest', fatal
=fatal
)
2658 xspf_url
= urlh
.geturl()
2660 return self
._parse
_xspf
(
2661 xspf
, playlist_id
, xspf_url
=xspf_url
,
2662 xspf_base_url
=base_url(xspf_url
))
2664 def _parse_xspf(self
, xspf_doc
, playlist_id
, xspf_url
=None, xspf_base_url
=None):
2666 'xspf': 'http://xspf.org/ns/0/',
2667 's1': 'http://static.streamone.nl/player/ns/0',
2671 for track
in xspf_doc
.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP
)):
2673 track
, xpath_with_ns('./xspf:title', NS_MAP
), 'title', default
=playlist_id
)
2674 description
= xpath_text(
2675 track
, xpath_with_ns('./xspf:annotation', NS_MAP
), 'description')
2676 thumbnail
= xpath_text(
2677 track
, xpath_with_ns('./xspf:image', NS_MAP
), 'thumbnail')
2678 duration
= float_or_none(
2679 xpath_text(track
, xpath_with_ns('./xspf:duration', NS_MAP
), 'duration'), 1000)
2682 for location
in track
.findall(xpath_with_ns('./xspf:location', NS_MAP
)):
2683 format_url
= urljoin(xspf_base_url
, location
.text
)
2688 'manifest_url': xspf_url
,
2689 'format_id': location
.get(xpath_with_ns('s1:label', NS_MAP
)),
2690 'width': int_or_none(location
.get(xpath_with_ns('s1:width', NS_MAP
))),
2691 'height': int_or_none(location
.get(xpath_with_ns('s1:height', NS_MAP
))),
2693 self
._sort
_formats
(formats
)
2698 'description': description
,
2699 'thumbnail': thumbnail
,
2700 'duration': duration
,
2705 def _extract_mpd_formats(self
, *args
, **kwargs
):
2706 fmts
, subs
= self
._extract
_mpd
_formats
_and
_subtitles
(*args
, **kwargs
)
2708 self
._report
_ignoring
_subs
('DASH')
2711 def _extract_mpd_formats_and_subtitles(
2712 self
, mpd_url
, video_id
, mpd_id
=None, note
=None, errnote
=None,
2713 fatal
=True, data
=None, headers
={}, query={}
):
2714 res
= self
._download
_xml
_handle
(
2716 note
='Downloading MPD manifest' if note
is None else note
,
2717 errnote
='Failed to download MPD manifest' if errnote
is None else errnote
,
2718 fatal
=fatal
, data
=data
, headers
=headers
, query
=query
)
2725 # We could have been redirected to a new url when we retrieved our mpd file.
2726 mpd_url
= urlh
.geturl()
2727 mpd_base_url
= base_url(mpd_url
)
2729 return self
._parse
_mpd
_formats
_and
_subtitles
(
2730 mpd_doc
, mpd_id
, mpd_base_url
, mpd_url
)
2732 def _parse_mpd_formats(self
, *args
, **kwargs
):
2733 fmts
, subs
= self
._parse
_mpd
_formats
_and
_subtitles
(*args
, **kwargs
)
2735 self
._report
_ignoring
_subs
('DASH')
2738 def _parse_mpd_formats_and_subtitles(
2739 self
, mpd_doc
, mpd_id
=None, mpd_base_url
='', mpd_url
=None):
2741 Parse formats from MPD manifest.
2743 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2744 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2745 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2747 if not self
.get_param('dynamic_mpd', True):
2748 if mpd_doc
.get('type') == 'dynamic':
2751 namespace
= self
._search
_regex
(r
'(?i)^{([^}]+)?}MPD$', mpd_doc
.tag
, 'namespace', default
=None)
2754 return self
._xpath
_ns
(path
, namespace
)
2756 def is_drm_protected(element
):
2757 return element
.find(_add_ns('ContentProtection')) is not None
2759 def extract_multisegment_info(element
, ms_parent_info
):
2760 ms_info
= ms_parent_info
.copy()
2762 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2763 # common attributes and elements. We will only extract relevant
2765 def extract_common(source
):
2766 segment_timeline
= source
.find(_add_ns('SegmentTimeline'))
2767 if segment_timeline
is not None:
2768 s_e
= segment_timeline
.findall(_add_ns('S'))
2770 ms_info
['total_number'] = 0
2773 r
= int(s
.get('r', 0))
2774 ms_info
['total_number'] += 1 + r
2775 ms_info
['s'].append({
2776 't': int(s
.get('t', 0)),
2777 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2778 'd': int(s
.attrib
['d']),
2781 start_number
= source
.get('startNumber')
2783 ms_info
['start_number'] = int(start_number
)
2784 timescale
= source
.get('timescale')
2786 ms_info
['timescale'] = int(timescale
)
2787 segment_duration
= source
.get('duration')
2788 if segment_duration
:
2789 ms_info
['segment_duration'] = float(segment_duration
)
2791 def extract_Initialization(source
):
2792 initialization
= source
.find(_add_ns('Initialization'))
2793 if initialization
is not None:
2794 ms_info
['initialization_url'] = initialization
.attrib
['sourceURL']
2796 segment_list
= element
.find(_add_ns('SegmentList'))
2797 if segment_list
is not None:
2798 extract_common(segment_list
)
2799 extract_Initialization(segment_list
)
2800 segment_urls_e
= segment_list
.findall(_add_ns('SegmentURL'))
2802 ms_info
['segment_urls'] = [segment
.attrib
['media'] for segment
in segment_urls_e
]
2804 segment_template
= element
.find(_add_ns('SegmentTemplate'))
2805 if segment_template
is not None:
2806 extract_common(segment_template
)
2807 media
= segment_template
.get('media')
2809 ms_info
['media'] = media
2810 initialization
= segment_template
.get('initialization')
2812 ms_info
['initialization'] = initialization
2814 extract_Initialization(segment_template
)
2817 mpd_duration
= parse_duration(mpd_doc
.get('mediaPresentationDuration'))
2818 formats
, subtitles
= [], {}
2819 stream_numbers
= collections
.defaultdict(int)
2820 for period
in mpd_doc
.findall(_add_ns('Period')):
2821 period_duration
= parse_duration(period
.get('duration')) or mpd_duration
2822 period_ms_info
= extract_multisegment_info(period
, {
2826 for adaptation_set
in period
.findall(_add_ns('AdaptationSet')):
2827 adaption_set_ms_info
= extract_multisegment_info(adaptation_set
, period_ms_info
)
2828 for representation
in adaptation_set
.findall(_add_ns('Representation')):
2829 representation_attrib
= adaptation_set
.attrib
.copy()
2830 representation_attrib
.update(representation
.attrib
)
2831 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2832 mime_type
= representation_attrib
['mimeType']
2833 content_type
= representation_attrib
.get('contentType', mime_type
.split('/')[0])
2835 codec_str
= representation_attrib
.get('codecs', '')
2836 # Some kind of binary subtitle found in some youtube livestreams
2837 if mime_type
== 'application/x-rawcc':
2838 codecs
= {'scodec': codec_str}
2840 codecs
= parse_codecs(codec_str
)
2841 if content_type
not in ('video', 'audio', 'text'):
2842 if mime_type
== 'image/jpeg':
2843 content_type
= mime_type
2844 elif codecs
.get('vcodec', 'none') != 'none':
2845 content_type
= 'video'
2846 elif codecs
.get('acodec', 'none') != 'none':
2847 content_type
= 'audio'
2848 elif codecs
.get('scodec', 'none') != 'none':
2849 content_type
= 'text'
2850 elif mimetype2ext(mime_type
) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2851 content_type
= 'text'
2853 self
.report_warning('Unknown MIME type %s in DASH manifest' % mime_type
)
2857 for element
in (representation
, adaptation_set
, period
, mpd_doc
):
2858 base_url_e
= element
.find(_add_ns('BaseURL'))
2859 if try_call(lambda: base_url_e
.text
) is not None:
2860 base_url
= base_url_e
.text
+ base_url
2861 if re
.match(r
'^https?://', base_url
):
2863 if mpd_base_url
and base_url
.startswith('/'):
2864 base_url
= urllib
.parse
.urljoin(mpd_base_url
, base_url
)
2865 elif mpd_base_url
and not re
.match(r
'^https?://', base_url
):
2866 if not mpd_base_url
.endswith('/'):
2868 base_url
= mpd_base_url
+ base_url
2869 representation_id
= representation_attrib
.get('id')
2870 lang
= representation_attrib
.get('lang')
2871 url_el
= representation
.find(_add_ns('BaseURL'))
2872 filesize
= int_or_none(url_el
.attrib
.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el
is not None else None)
2873 bandwidth
= int_or_none(representation_attrib
.get('bandwidth'))
2874 if representation_id
is not None:
2875 format_id
= representation_id
2877 format_id
= content_type
2879 format_id
= mpd_id
+ '-' + format_id
2880 if content_type
in ('video', 'audio'):
2882 'format_id': format_id
,
2883 'manifest_url': mpd_url
,
2884 'ext': mimetype2ext(mime_type
),
2885 'width': int_or_none(representation_attrib
.get('width')),
2886 'height': int_or_none(representation_attrib
.get('height')),
2887 'tbr': float_or_none(bandwidth
, 1000),
2888 'asr': int_or_none(representation_attrib
.get('audioSamplingRate')),
2889 'fps': int_or_none(representation_attrib
.get('frameRate')),
2890 'language': lang
if lang
not in ('mul', 'und', 'zxx', 'mis') else None,
2891 'format_note': 'DASH %s' % content_type
,
2892 'filesize': filesize
,
2893 'container': mimetype2ext(mime_type
) + '_dash',
2896 elif content_type
== 'text':
2898 'ext': mimetype2ext(mime_type
),
2899 'manifest_url': mpd_url
,
2900 'filesize': filesize
,
2902 elif content_type
== 'image/jpeg':
2903 # See test case in VikiIE
2904 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2906 'format_id': format_id
,
2908 'manifest_url': mpd_url
,
2909 'format_note': 'DASH storyboards (jpeg)',
2913 if is_drm_protected(adaptation_set
) or is_drm_protected(representation
):
2915 representation_ms_info
= extract_multisegment_info(representation
, adaption_set_ms_info
)
2917 def prepare_template(template_name
, identifiers
):
2918 tmpl
= representation_ms_info
[template_name
]
2919 if representation_id
is not None:
2920 tmpl
= tmpl
.replace('$RepresentationID$', representation_id
)
2921 # First of, % characters outside $...$ templates
2922 # must be escaped by doubling for proper processing
2923 # by % operator string formatting used further (see
2924 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2930 in_template
= not in_template
2931 elif c
== '%' and not in_template
:
2933 # Next, $...$ templates are translated to their
2934 # %(...) counterparts to be used with % operator
2935 t
= re
.sub(r
'\$(%s)\$' % '|'.join(identifiers
), r
'%(\1)d', t
)
2936 t
= re
.sub(r
'\$(%s)%%([^$]+)\$' % '|'.join(identifiers
), r
'%(\1)\2', t
)
2937 t
.replace('$$', '$')
2940 # @initialization is a regular template like @media one
2941 # so it should be handled just the same way (see
2942 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2943 if 'initialization' in representation_ms_info
:
2944 initialization_template
= prepare_template(
2946 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2947 # $Time$ shall not be included for @initialization thus
2948 # only $Bandwidth$ remains
2950 representation_ms_info
['initialization_url'] = initialization_template
% {
2951 'Bandwidth': bandwidth
,
2954 def location_key(location
):
2955 return 'url' if re
.match(r
'^https?://', location
) else 'path'
2957 if 'segment_urls' not in representation_ms_info
and 'media' in representation_ms_info
:
2959 media_template
= prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2960 media_location_key
= location_key(media_template
)
2962 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2963 # can't be used at the same time
2964 if '%(Number' in media_template
and 's' not in representation_ms_info
:
2965 segment_duration
= None
2966 if 'total_number' not in representation_ms_info
and 'segment_duration' in representation_ms_info
:
2967 segment_duration
= float_or_none(representation_ms_info
['segment_duration'], representation_ms_info
['timescale'])
2968 representation_ms_info
['total_number'] = int(math
.ceil(
2969 float_or_none(period_duration
, segment_duration
, default
=0)))
2970 representation_ms_info
['fragments'] = [{
2971 media_location_key
: media_template
% {
2972 'Number': segment_number
,
2973 'Bandwidth': bandwidth
,
2975 'duration': segment_duration
,
2976 } for segment_number
in range(
2977 representation_ms_info
['start_number'],
2978 representation_ms_info
['total_number'] + representation_ms_info
['start_number'])]
2980 # $Number*$ or $Time$ in media template with S list available
2981 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2982 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2983 representation_ms_info
['fragments'] = []
2986 segment_number
= representation_ms_info
['start_number']
2988 def add_segment_url():
2989 segment_url
= media_template
% {
2990 'Time': segment_time
,
2991 'Bandwidth': bandwidth
,
2992 'Number': segment_number
,
2994 representation_ms_info
['fragments'].append({
2995 media_location_key
: segment_url
,
2996 'duration': float_or_none(segment_d
, representation_ms_info
['timescale']),
2999 for num
, s
in enumerate(representation_ms_info
['s']):
3000 segment_time
= s
.get('t') or segment_time
3004 for r
in range(s
.get('r', 0)):
3005 segment_time
+= segment_d
3008 segment_time
+= segment_d
3009 elif 'segment_urls' in representation_ms_info
and 's' in representation_ms_info
:
3010 # No media template,
3011 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3012 # or any YouTube dashsegments video
3015 timescale
= representation_ms_info
['timescale']
3016 for s
in representation_ms_info
['s']:
3017 duration
= float_or_none(s
['d'], timescale
)
3018 for r
in range(s
.get('r', 0) + 1):
3019 segment_uri
= representation_ms_info
['segment_urls'][segment_index
]
3021 location_key(segment_uri
): segment_uri
,
3022 'duration': duration
,
3025 representation_ms_info
['fragments'] = fragments
3026 elif 'segment_urls' in representation_ms_info
:
3027 # Segment URLs with no SegmentTimeline
3028 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3029 # https://github.com/ytdl-org/youtube-dl/pull/14844
3031 segment_duration
= float_or_none(
3032 representation_ms_info
['segment_duration'],
3033 representation_ms_info
['timescale']) if 'segment_duration' in representation_ms_info
else None
3034 for segment_url
in representation_ms_info
['segment_urls']:
3036 location_key(segment_url
): segment_url
,
3038 if segment_duration
:
3039 fragment
['duration'] = segment_duration
3040 fragments
.append(fragment
)
3041 representation_ms_info
['fragments'] = fragments
3042 # If there is a fragments key available then we correctly recognized fragmented media.
3043 # Otherwise we will assume unfragmented media with direct access. Technically, such
3044 # assumption is not necessarily correct since we may simply have no support for
3045 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3046 if 'fragments' in representation_ms_info
:
3048 # NB: mpd_url may be empty when MPD manifest is parsed from a string
3049 'url': mpd_url
or base_url
,
3050 'fragment_base_url': base_url
,
3052 'protocol': 'http_dash_segments' if mime_type
!= 'image/jpeg' else 'mhtml',
3054 if 'initialization_url' in representation_ms_info
:
3055 initialization_url
= representation_ms_info
['initialization_url']
3056 if not f
.get('url'):
3057 f
['url'] = initialization_url
3058 f
['fragments'].append({location_key(initialization_url): initialization_url}
)
3059 f
['fragments'].extend(representation_ms_info
['fragments'])
3060 if not period_duration
:
3061 period_duration
= try_get(
3062 representation_ms_info
,
3063 lambda r
: sum(frag
['duration'] for frag
in r
['fragments']), float)
3065 # Assuming direct URL to unfragmented media.
3067 if content_type
in ('video', 'audio', 'image/jpeg'):
3068 f
['manifest_stream_number'] = stream_numbers
[f
['url']]
3069 stream_numbers
[f
['url']] += 1
3071 elif content_type
== 'text':
3072 subtitles
.setdefault(lang
or 'und', []).append(f
)
3074 return formats
, subtitles
3076 def _extract_ism_formats(self
, *args
, **kwargs
):
3077 fmts
, subs
= self
._extract
_ism
_formats
_and
_subtitles
(*args
, **kwargs
)
3079 self
._report
_ignoring
_subs
('ISM')
3082 def _extract_ism_formats_and_subtitles(self
, ism_url
, video_id
, ism_id
=None, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query={}
):
3083 res
= self
._download
_xml
_handle
(
3085 note
='Downloading ISM manifest' if note
is None else note
,
3086 errnote
='Failed to download ISM manifest' if errnote
is None else errnote
,
3087 fatal
=fatal
, data
=data
, headers
=headers
, query
=query
)
3094 return self
._parse
_ism
_formats
_and
_subtitles
(ism_doc
, urlh
.geturl(), ism_id
)
    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        if ism_doc.get('IsLive') == 'TRUE':
            return [], {}

        duration = int(ism_doc.attrib['Duration'])
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        subtitles = {}
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio', 'text'):
                continue
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            stream_language = stream.get('Language', 'und')
            for track in stream.findall('QualityLevel'):
                fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

                fragments = []
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        try:
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                        except IndexError:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']

                if stream_type == 'text':
                    subtitles.setdefault(stream_language, []).append({
                        'ext': 'ismt',
                        'protocol': 'ism',
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'fragments': fragments,
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                        },
                    })
                elif stream_type in ('video', 'audio'):
                    formats.append({
                        'format_id': join_nonempty(ism_id, stream_name, tbr),
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'ext': 'ismv' if stream_type == 'video' else 'isma',
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'asr': sampling_rate,
                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
                        'acodec': 'none' if stream_type == 'video' else fourcc,
                        'protocol': 'ism',
                        'fragments': fragments,
                        'has_drm': ism_doc.find('Protection') is not None,
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'width': width or 0,
                            'height': height or 0,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                            'sampling_rate': sampling_rate,
                            'channels': int_or_none(track.get('Channels', 2)),
                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                        },
                    })
        return formats, subtitles
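    # Illustrative note: each <c> element in a Smooth Streaming manifest carries
    # t (start time), d (duration) and r (repeat count) in stream timescale units, so
    # e.g. <c t="0" d="20000000" r="3"/> with TimeScale="10000000" expands to three
    # 2-second fragments starting at 0s, 2s and 4s - which is what the fragment loop
    # above computes.
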
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
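    # Illustrative note: the parser above handles markup along the lines of
    #   <video poster="/thumb.jpg">
    #     <source src="video-720.mp4" type="video/mp4" label="720p">
    #     <track kind="subtitles" srclang="en" src="/subs/en.vtt">
    #   </video>
    # (a hypothetical snippet), as well as the amp-video/dl8-video variants matched
    # by _MEDIA_TAG_NAME_RE.
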
    def _extract_akamai_formats(self, *args, **kwargs):
        fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('akamai')
        return fmts
    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            url = re.sub(
                                REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        i += 1

        return formats, subtitles
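    # Illustrative note: the HDS and HLS manifest URLs are derived from each other by
    # swapping the /i/ and /z/ path components and the manifest name, e.g. (hypothetical URL)
    #   https://example-vh.akamaihd.net/i/path/video_,500,1000,.mp4.csmil/master.m3u8
    #   -> https://example-vh.akamaihd.net/z/path/video_,500,1000,.mp4.csmil/manifest.f4m
    # Progressive HTTP URLs are rebuilt from the comma-separated quality list in the
    # .csmil segment when an 'http' host is supplied and the URL is not signed (hdnea=).
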
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        query = urllib.parse.urlparse(url).query
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            m_url = f'{http_base_url}/{manifest}'
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': f'{protocol}:{url_base}',
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
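    # Illustrative note: for a Wowza base URL such as (hypothetical)
    #   https://example.com/vod/mp4:sample.mp4
    # the helper above probes playlist.m3u8, manifest.f4m and manifest.mpd under the
    # http(s) base, and otherwise falls back to rtmp://... / rtsp://... variants of the
    # same path unless those protocols are listed in skip_protocols.
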
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        mobj = re.search(
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
            webpage)
        if mobj:
            try:
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 video_id=video_id,
                                                 transform_source=transform_source)
            except ExtractorError:
                pass
            else:
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data
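    # Illustrative note: the regex above targets embeds of the form (hypothetical snippet)
    #   jwplayer("player").setup({"playlist": [{"sources": [{"file": "video.m3u8"}]}]});
    # and hands the setup(...) argument to _parse_json with js_to_json, since the options
    # object is usually JavaScript rather than strict JSON.
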
    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
        jwplayer_data = self._find_jwplayer_data(
            webpage, video_id, transform_source=js_to_json)
        return self._parse_jwplayer_data(
            jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(
                    r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
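    # Illustrative note: a JWPlayer 'sources' list such as (hypothetical)
    #   [{'file': 'video.m3u8', 'type': 'hls'},
    #    {'file': 'video-720.mp4', 'label': '720p', 'bitrate': 2000}]
    # yields the expanded HLS formats for the first entry and a single progressive
    # format, with height/tbr derived from 'label' and 'bitrate', for the second.
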
    def _live_title(self, name):
        self._downloader.deprecation_warning(
            'yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
        return name
    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if res is None:
            msg = f'Failed to extract {name}: Could not parse value {v!r}'
            if fatal:
                raise ExtractorError(msg)
            else:
                self.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
        if res is None:
            msg = f'Failed to extract {name}: Could not parse value {v!r}'
            if fatal:
                raise ExtractorError(msg)
            else:
                self.report_warning(msg)
        return res
    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        cookie = http.cookiejar.Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return a http.cookies.SimpleCookie with the cookies for the url """
        return LenientSimpleCookie(self._downloader._calc_cookies(url))
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than the second (new). However, per RFC 6265 the newer cookie should
        be the one stored in the cookie jar, and that is what actually happens.
        We work around this issue by manually resetting the cookie to the
        first one.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            cookies = cookies.encode('iso-8859-1').decode('utf-8')
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                self._set_cookie(domain, cookie, value)
                break
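    # Illustrative note: this handles responses that send two headers for the same name,
    # e.g. (hypothetical)
    #   Set-Cookie: remixlang=0; Domain=.vk.com; ...
    #   Set-Cookie: remixlang=3; Domain=.vk.com; ...
    # where the site expects the first value to win, so the cookie is re-set manually
    # from the first matching header.
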
    @classmethod
    def get_testcases(cls, include_onlymatching=False):
        t = getattr(cls, '_TEST', None)
        if t:
            assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
            tests = [t]
        else:
            tests = getattr(cls, '_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
            t['name'] = cls.ie_key()
            yield t

    @classmethod
    def get_webpage_testcases(cls):
        tests = getattr(cls, '_WEBPAGE_TESTS', [])
        for t in tests:
            t['name'] = cls.ie_key()
        return tests

    @classproperty
    def age_limit(cls):
        """Get age limit from the testcases"""
        return max(traverse_obj(
            (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
            (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
    @classmethod
    def is_suitable(cls, age_limit):
        """Test whether the extractor is generally suitable for the given age limit"""
        return not age_restricted(cls.age_limit, age_limit)

    @classmethod
    def description(cls, *, markdown=True, search_examples=None):
        """Description of the extractor"""
        desc = ''
        if cls._NETRC_MACHINE:
            if markdown:
                desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
            else:
                desc += f' [{cls._NETRC_MACHINE}]'
        if cls.IE_DESC is False:
            desc += ' [HIDDEN]'
        elif cls.IE_DESC:
            desc += f' {cls.IE_DESC}'
        if cls.SEARCH_KEY:
            desc += f'; "{cls.SEARCH_KEY}:" prefix'
            if search_examples:
                _COUNTS = ('', '5', '10', 'all')
                desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
        if not cls.working():
            desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'

        name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
        return f'{name}:{desc}' if desc else name
    def extract_subtitles(self, *args, **kwargs):
        if (self.get_param('writesubtitles', False)
                or self.get_param('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        return {}

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def extract_comments(self, *args, **kwargs):
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                self.to_screen('Interrupted by user')
            except Exception as e:
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                'comment_count': None if interrupted else comment_count,
            }
        return extractor

    def _get_comments(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')
    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs/data
        will be dropped. """
        list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
        ret = list(subtitle_list1)
        ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
        return ret

    @classmethod
    def _merge_subtitles(cls, *dicts, target=None):
        """ Merge subtitle dictionaries, language by language. """
        if target is None:
            target = {}
        for d in dicts:
            for lang, subs in d.items():
                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
        return target

    def extract_automatic_captions(self, *args, **kwargs):
        if (self.get_param('writeautomaticsub', False)
                or self.get_param('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        return {}

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')
    @functools.cached_property
    def _cookies_passed(self):
        """Whether cookies have been passed to YoutubeDL"""
        return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None

    def mark_watched(self, *args, **kwargs):
        if not self.get_param('mark_watched', False):
            return
        if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
            self._mark_watched(*args, **kwargs)

    def _mark_watched(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        headers = {}
        geo_verification_proxy = self.get_param('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
        return headers

    @staticmethod
    def _generic_id(url):
        return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])

    @staticmethod
    def _generic_title(url):
        return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
    @staticmethod
    def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
        all_known = all(map(
            lambda x: x is not None,
            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
        return (
            'private' if is_private
            else 'premium_only' if needs_premium
            else 'subscriber_only' if needs_subscription
            else 'needs_auth' if needs_auth
            else 'unlisted' if is_unlisted
            else 'public' if all_known
            else None)
    def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
        """
        @returns            A list of values for the extractor argument given by "key"
                            or "default" if no such key is present
        @param default      The default value to return when the key is not present (default: [])
        @param casesense    When false, the values are converted to lower case
        """
        val = traverse_obj(
            self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
        if val is None:
            return [] if default is NO_DEFAULT else default
        return list(val) if casesense else [x.lower() for x in val]
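    # Illustrative note: extractor arguments are supplied on the command line roughly as
    #   --extractor-args "youtube:player_client=android,web"
    # so a hypothetical self._configuration_arg('player_client') inside the youtube
    # extractor would return ['android', 'web'], and the documented default ([]) when
    # the key is absent.
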
    def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
        if not playlist_id or not video_id:
            return not video_id

        no_playlist = (smuggled_data or {}).get('force_noplaylist')
        if no_playlist is not None:
            return not no_playlist

        video_id = '' if video_id is True else f' {video_id}'
        playlist_id = '' if playlist_id is True else f' {playlist_id}'
        if self.get_param('noplaylist'):
            self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
            return False
        self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
        return True
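    # Illustrative note: a typical call site looks roughly like (hypothetical)
    #   if self._yes_playlist(playlist_id, video_id, smuggled_data):
    #       return self._extract_playlist(playlist_id)
    #   return self._extract_video(video_id)
    # i.e. --no-playlist (or smuggled force_noplaylist) selects the single-video branch.
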
    def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
        RetryManager.report_retry(
            err, _count or int(fatal), _retries,
            info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
            sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))

    def RetryManager(self, **kwargs):
        return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
    @classmethod
    def extract_from_webpage(cls, ydl, url, webpage):
        ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
              else ydl.get_info_extractor(cls.ie_key()))
        for info in ie._extract_from_webpage(url, webpage) or []:
            # url = None since we do not want to set (webpage/original)_url
            ydl.add_default_extra_info(info, ie, None)
            yield info

    @classmethod
    def _extract_from_webpage(cls, url, webpage):
        for embed_url in orderedSet(
                cls._extract_embed_urls(url, webpage) or [], lazy=True):
            yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """@returns all the embed urls on the webpage"""
        if '_EMBED_URL_RE' not in cls.__dict__:
            assert isinstance(cls._EMBED_REGEX, (list, tuple))
            for idx, regex in enumerate(cls._EMBED_REGEX):
                assert regex.count('(?P<url>') == 1, \
                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
            cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))

        for regex in cls._EMBED_URL_RE:
            for mobj in regex.finditer(webpage):
                embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
                if cls._VALID_URL is False or cls.suitable(embed_url):
                    yield embed_url

    class StopExtraction(Exception):
        pass

    @classmethod
    def _extract_url(cls, webpage):  # TODO: Remove
        """Only for compatibility with some older extractors"""
        return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
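    # Illustrative note: a conforming _EMBED_REGEX entry must expose exactly one named
    # 'url' group, e.g. (hypothetical)
    #   _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/embed/[^"\']+)']
    # which _extract_embed_urls compiles once per class and matches against the webpage.
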
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        if plugin_name:
            mro = inspect.getmro(cls)
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)

        return super().__init_subclass__(**kwargs)

class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(f'invalid download number {n} for query "{query}"')
            elif n > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        return self.playlist_result(
            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
            query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
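    # Illustrative note: a minimal subclass (hypothetical) only needs the search key and
    # a result generator:
    #   class ExampleSearchIE(SearchInfoExtractor):
    #       _SEARCH_KEY = 'examplesearch'
    #
    #       def _search_results(self, query):
    #           yield from (self.url_result(url) for url in self._fetch_result_urls(query))
    # making queries like "examplesearch5:kittens" or "examplesearchall:kittens" downloadable.

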
class UnsupportedURLIE(InfoExtractor):
    _VALID_URL = '.*'
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        raise UnsupportedError(url)