yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader import FileDownloader
  27 from ..downloader.f4m import get_base_url, remove_encrypted_media
  28 from ..utils import (
  29     IDENTITY,
  30     JSON_LD_RE,
  31     NO_DEFAULT,
  32     ExtractorError,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     LenientJSONDecoder,
  36     RegexNotFoundError,
  37     RetryManager,
  38     UnsupportedError,
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     classproperty,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitize_url,
  68     sanitized_Request,
  69     str_or_none,
  70     str_to_int,
  71     strip_or_none,
  72     traverse_obj,
  73     try_call,
  74     try_get,
  75     unescapeHTML,
  76     unified_strdate,
  77     unified_timestamp,
  78     update_Request,
  79     update_url_query,
  80     url_basename,
  81     url_or_none,
  82     urljoin,
  83     variadic,
  84     xpath_element,
  85     xpath_text,
  86     xpath_with_ns,
  87 )
  88
  89
  90 class InfoExtractor:
  91     """Information Extractor class.
  92
  93     Information extractors are the classes that, given a URL, extract
  94     information about the video (or videos) the URL refers to. This
  95     information includes the real video URL, the video title, author and
  96     others. The information is stored in a dictionary which is then
  97     passed to the YoutubeDL. The YoutubeDL processes this
  98     information possibly downloading the video to the file system, among
  99     other possible outcomes.
 100
 101     The type field determines the type of the result.
 102     By far the most common value (and the default if _type is missing) is
 103     "video", which indicates a single video.
 104
 105     For a video, the dictionaries must include the following fields:
 106
 107     id:             Video identifier.
 108     title:          Video title, unescaped. Set to an empty string if video has
 109                     no title as opposed to "None" which signifies that the
 110                     extractor failed to obtain a title
 111
 112     Additionally, it must contain either a formats entry or a url one:
 113
 114     formats:        A list of dictionaries for each format available, ordered
 115                     from worst to best quality.
 116
 117                     Potential fields:
 118                     * url        The mandatory URL representing the media:
 119                                    for plain file media - HTTP URL of this file,
 120                                    for RTMP - RTMP URL,
 121                                    for HLS - URL of the M3U8 media playlist,
 122                                    for HDS - URL of the F4M manifest,
 123                                    for DASH
 124                                      - HTTP URL to plain file media (in case of
 125                                        unfragmented media)
 126                                      - URL of the MPD manifest or base URL
 127                                        representing the media if MPD manifest
 128                                        is parsed from a string (in case of
 129                                        fragmented media)
 130                                    for MSS - URL of the ISM manifest.
 131                     * manifest_url
 132                                  The URL of the manifest file in case of
 133                                  fragmented media:
 134                                    for HLS - URL of the M3U8 master playlist,
 135                                    for HDS - URL of the F4M manifest,
 136                                    for DASH - URL of the MPD manifest,
 137                                    for MSS - URL of the ISM manifest.
 138                     * manifest_stream_number  (For internal use only)
 139                                  The index of the stream in the manifest file
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
 143                                  Calculated from the format_id, width, height.
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
 153                     * dynamic_range The dynamic range of the video. One of:
 154                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * audio_channels  Number of audio channels
 160                     * vbr        Average video bitrate in KBit/s
 161                     * fps        Frame rate
 162                     * vcodec     Name of the video codec in use
 163                     * container  Name of the container format
 164                     * filesize   The number of bytes, if known in advance
 165                     * filesize_approx  An estimate for the number of bytes
 166                     * player_url SWF Player URL (used for rtmpdump).
 167                     * protocol   The protocol that will be used for the actual
 168                                  download, lower-case. One of "http", "https" or
 169                                  one of the protocols defined in downloader.PROTOCOL_MAP
 170                     * fragment_base_url
 171                                  Base URL for fragments. Each fragment's path
 172                                  value (if present) will be relative to
 173                                  this URL.
 174                     * fragments  A list of fragments of a fragmented media.
 175                                  Each fragment entry must contain either an url
 176                                  or a path. If an url is present it should be
 177                                  considered by a client. Otherwise both path and
 178                                  fragment_base_url must be present. Here is
 179                                  the list of all potential fields:
 180                                  * "url" - fragment's URL
 181                                  * "path" - fragment's path relative to
 182                                             fragment_base_url
 183                                  * "duration" (optional, int or float)
 184                                  * "filesize" (optional, int)
 185                     * is_from_start  Is a live format that can be downloaded
 186                                 from the start. Boolean
 187                     * preference Order number of this format. If this field is
 188                                  present and not None, the formats get sorted
 189                                  by this field, regardless of all other values.
 190                                  -1 for default (order by other properties),
 191                                  -2 or smaller for less than default.
 192                                  < -1000 to hide the format (if there is
 193                                     another one which is strictly better)
 194                     * language   Language code, e.g. "de" or "en-US".
 195                     * language_preference  Is this in the language mentioned in
 196                                  the URL?
 197                                  10 if it's what the URL is about,
 198                                  -1 for default (don't know),
 199                                  -10 otherwise, other values reserved for now.
 200                     * quality    Order number of the video quality of this
 201                                  format, irrespective of the file format.
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                     * source_preference  Order number for this video source
 205                                   (quality takes higher priority)
 206                                  -1 for default (order by other properties),
 207                                  -2 or smaller for less than default.
 208                     * http_headers  A dictionary of additional HTTP headers
 209                                  to add to the request.
 210                     * stretched_ratio  If given and not 1, indicates that the
 211                                  video's pixels are not square.
 212                                  width : height ratio as float.
 213                     * no_resume  The server does not support resuming the
 214                                  (HTTP or RTMP) download. Boolean.
 215                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 216                     * downloader_options  A dictionary of downloader options
 217                                  (For internal use only)
 218                                  * http_chunk_size Chunk size for HTTP downloads
 219                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 220                     RTMP formats can also have the additional fields: page_url,
 221                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 222                     rtmp_protocol, rtmp_real_time
 223
 224     url:            Final video URL.
 225     ext:            Video filename extension.
 226     format:         The video format, defaults to ext (used for --get-format)
 227     player_url:     SWF Player URL (used for rtmpdump).
 228
 229     The following fields are optional:
 230
 231     direct:         True if a direct video file was given (must only be set by GenericIE)
 232     alt_title:      A secondary title of the video.
 233     display_id      An alternative identifier for the video, not necessarily
 234                     unique, but available before title. Typically, id is
 235                     something like "4234987", title "Dancing naked mole rats",
 236                     and display_id "dancing-naked-mole-rats"
 237     thumbnails:     A list of dictionaries, with the following entries:
 238                         * "id" (optional, string) - Thumbnail format ID
 239                         * "url"
 240                         * "preference" (optional, int) - quality of the image
 241                         * "width" (optional, int)
 242                         * "height" (optional, int)
 243                         * "resolution" (optional, string "{width}x{height}",
 244                                         deprecated)
 245                         * "filesize" (optional, int)
 246                         * "http_headers" (dict) - HTTP headers for the request
 247     thumbnail:      Full URL to a video thumbnail image.
 248     description:    Full video description.
 249     uploader:       Full name of the video uploader.
 250     license:        License name the video is licensed under.
 251     creator:        The creator of the video.
 252     timestamp:      UNIX timestamp of the moment the video was uploaded
 253     upload_date:    Video upload date in UTC (YYYYMMDD).
 254                     If not explicitly set, calculated from timestamp
 255     release_timestamp: UNIX timestamp of the moment the video was released.
 256                     If it is not clear whether to use timestamp or this, use the former
 257     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 258                     If not explicitly set, calculated from release_timestamp
 259     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 260     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 261                     If not explicitly set, calculated from modified_timestamp
 262     uploader_id:    Nickname or id of the video uploader.
 263     uploader_url:   Full URL to a personal webpage of the video uploader.
 264     channel:        Full name of the channel the video is uploaded on.
 265                     Note that channel fields may or may not repeat uploader
 266                     fields. This depends on a particular extractor.
 267     channel_id:     Id of the channel.
 268     channel_url:    Full URL to a channel webpage.
 269     channel_follower_count: Number of followers of the channel.
 270     location:       Physical location where the video was filmed.
 271     subtitles:      The available subtitles as a dictionary in the format
 272                     {tag: subformats}. "tag" is usually a language code, and
 273                     "subformats" is a list sorted from lower to higher
 274                     preference, each element is a dictionary with the "ext"
 275                     entry and one of:
 276                         * "data": The subtitles file contents
 277                         * "url": A URL pointing to the subtitles file
 278                     It can optionally also have:
 279                         * "name": Name or description of the subtitles
 280                         * "http_headers": A dictionary of additional HTTP headers
 281                                   to add to the request.
 282                     "ext" will be calculated from URL if missing
 283     automatic_captions: Like 'subtitles'; contains automatically generated
 284                     captions instead of normal subtitles
 285     duration:       Length of the video in seconds, as an integer or float.
 286     view_count:     How many users have watched the video on the platform.
 287     like_count:     Number of positive ratings of the video
 288     dislike_count:  Number of negative ratings of the video
 289     repost_count:   Number of reposts of the video
 290     average_rating: Average rating give by users, the scale used depends on the webpage
 291     comment_count:  Number of comments on the video
 292     comments:       A list of comments, each with one or more of the following
 293                     properties (all but one of text or html optional):
 294                         * "author" - human-readable name of the comment author
 295                         * "author_id" - user ID of the comment author
 296                         * "author_thumbnail" - The thumbnail of the comment author
 297                         * "id" - Comment ID
 298                         * "html" - Comment as HTML
 299                         * "text" - Plain text of the comment
 300                         * "timestamp" - UNIX timestamp of comment
 301                         * "parent" - ID of the comment this one is replying to.
 302                                      Set to "root" to indicate that this is a
 303                                      comment to the original video.
 304                         * "like_count" - Number of positive ratings of the comment
 305                         * "dislike_count" - Number of negative ratings of the comment
 306                         * "is_favorited" - Whether the comment is marked as
 307                                            favorite by the video uploader
 308                         * "author_is_uploader" - Whether the comment is made by
 309                                                  the video uploader
 310     age_limit:      Age restriction for the video, as an integer (years)
 311     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 312                     should allow to get the same result again. (It will be set
 313                     by YoutubeDL if it's missing)
 314     categories:     A list of categories that the video falls in, for example
 315                     ["Sports", "Berlin"]
 316     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 317     cast:           A list of the video cast
 318     is_live:        True, False, or None (=unknown). Whether this video is a
 319                     live stream that goes on instead of a fixed-length video.
 320     was_live:       True, False, or None (=unknown). Whether this video was
 321                     originally a live stream.
 322     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 323                     or 'post_live' (was live, but VOD is not yet processed)
 324                     If absent, automatically set from is_live, was_live
 325     start_time:     Time in seconds where the reproduction should start, as
 326                     specified in the URL.
 327     end_time:       Time in seconds where the reproduction should end, as
 328                     specified in the URL.
 329     chapters:       A list of dictionaries, with the following entries:
 330                         * "start_time" - The start time of the chapter in seconds
 331                         * "end_time" - The end time of the chapter in seconds
 332                         * "title" (optional, string)
 333     playable_in_embed: Whether this video is allowed to play in embedded
 334                     players on other sites. Can be True (=always allowed),
 335                     False (=never allowed), None (=unknown), or a string
 336                     specifying the criteria for embedability; e.g. 'whitelist'
 337     availability:   Under what condition the video is available. One of
 338                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 339                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 340                     to set it
 341     _old_archive_ids: A list of old archive ids needed for backward compatibility
 342     __post_extractor: A function to be called just before the metadata is
 343                     written to either disk, logger or console. The function
 344                     must return a dict which will be added to the info_dict.
 345                     This is usefull for additional information that is
 346                     time-consuming to extract. Note that the fields thus
 347                     extracted will not be available to output template and
 348                     match_filter. So, only "comments" and "comment_count" are
 349                     currently allowed to be extracted via this method.
 350
 351     The following fields should only be used when the video belongs to some logical
 352     chapter or section:
 353
 354     chapter:        Name or title of the chapter the video belongs to.
 355     chapter_number: Number of the chapter the video belongs to, as an integer.
 356     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 357
 358     The following fields should only be used when the video is an episode of some
 359     series, programme or podcast:
 360
 361     series:         Title of the series or programme the video episode belongs to.
 362     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 363     season:         Title of the season the video episode belongs to.
 364     season_number:  Number of the season the video episode belongs to, as an integer.
 365     season_id:      Id of the season the video episode belongs to, as a unicode string.
 366     episode:        Title of the video episode. Unlike mandatory video title field,
 367                     this field should denote the exact title of the video episode
 368                     without any kind of decoration.
 369     episode_number: Number of the video episode within a season, as an integer.
 370     episode_id:     Id of the video episode, as a unicode string.
 371
 372     The following fields should only be used when the media is a track or a part of
 373     a music album:
 374
 375     track:          Title of the track.
 376     track_number:   Number of the track within an album or a disc, as an integer.
 377     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 378                     as a unicode string.
 379     artist:         Artist(s) of the track.
 380     genre:          Genre(s) of the track.
 381     album:          Title of the album the track belongs to.
 382     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 383     album_artist:   List of all artists appeared on the album (e.g.
 384                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 385                     and compilations).
 386     disc_number:    Number of the disc or other physical medium the track belongs to,
 387                     as an integer.
 388     release_year:   Year (YYYY) when the album was released.
 389     composer:       Composer of the piece
 390
 391     The following fields should only be set for clips that should be cut from the original video:
 392
 393     section_start:  Start time of the section in seconds
 394     section_end:    End time of the section in seconds
 395
 396     The following fields should only be set for storyboards:
 397     rows:           Number of rows in each storyboard fragment, as an integer
 398     columns:        Number of columns in each storyboard fragment, as an integer
 399
 400     Unless mentioned otherwise, the fields should be Unicode strings.
 401
 402     Unless mentioned otherwise, None is equivalent to absence of information.
 403
 404
 405     _type "playlist" indicates multiple videos.
 406     There must be a key "entries", which is a list, an iterable, or a PagedList
 407     object, each element of which is a valid dictionary by this specification.
 408
 409     Additionally, playlists can have "id", "title", and any other relevant
 410     attributes with the same semantics as videos (see above).
 411
 412     It can also have the following optional fields:
 413
 414     playlist_count: The total number of videos in a playlist. If not given,
 415                     YoutubeDL tries to calculate it from "entries"
 416
 417
 418     _type "multi_video" indicates that there are multiple videos that
 419     form a single show, for examples multiple acts of an opera or TV episode.
 420     It must have an entries key like a playlist and contain all the keys
 421     required for a video at the same time.
 422
 423
 424     _type "url" indicates that the video must be extracted from another
 425     location, possibly by a different extractor. Its only required key is:
 426     "url" - the next URL to extract.
 427     The key "ie_key" can be set to the class name (minus the trailing "IE",
 428     e.g. "Youtube") if the extractor class is known in advance.
 429     Additionally, the dictionary may have any properties of the resolved entity
 430     known in advance, for example "title" if the title of the referred video is
 431     known ahead of time.
 432
 433
 434     _type "url_transparent" entities have the same specification as "url", but
 435     indicate that the given additional information is more precise than the one
 436     associated with the resolved URL.
 437     This is useful when a site employs a video service that hosts the video and
 438     its technical metadata, but that video service does not embed a useful
 439     title, description etc.
 440
 441
 442     Subclasses of this should also be added to the list of extractors and
 443     should define a _VALID_URL regexp and, re-define the _real_extract() and
 444     (optionally) _real_initialize() methods.
 445
 446     Subclasses may also override suitable() if necessary, but ensure the function
 447     signature is preserved and that this function imports everything it needs
 448     (except other extractors), so that lazy_extractors works correctly.
 449
 450     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 451     the HTML of Generic webpages. It may also override _extract_embed_urls
 452     or _extract_from_webpage as necessary. While these are normally classmethods,
 453     _extract_from_webpage is allowed to be an instance method.
 454
 455     _extract_from_webpage may raise self.StopExtraction() to stop further
 456     processing of the webpage and obtain exclusive rights to it. This is useful
 457     when the extractor cannot reliably be matched using just the URL,
 458     e.g. invidious/peertube instances
 459
 460     Embed-only extractors can be defined by setting _VALID_URL = False.
 461
 462     To support username + password (or netrc) login, the extractor must define a
 463     _NETRC_MACHINE and re-define _perform_login(username, password) and
 464     (optionally) _initialize_pre_login() methods. The _perform_login method will
 465     be called between _initialize_pre_login and _real_initialize if credentials
 466     are passed by the user. In cases where it is necessary to have the login
 467     process as part of the extraction rather than initialization, _perform_login
 468     can be left undefined.
 469
 470     _GEO_BYPASS attribute may be set to False in order to disable
 471     geo restriction bypass mechanisms for a particular extractor.
 472     Though it won't disable explicit geo restriction bypass based on
 473     country code provided with geo_bypass_country.
 474
 475     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 476     countries for this extractor. One of these countries will be used by
 477     geo restriction bypass mechanism right away in order to bypass
 478     geo restriction, of course, if the mechanism is not disabled.
 479
 480     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 481     IP blocks in CIDR notation for this extractor. One of these IP blocks
 482     will be used by geo restriction bypass mechanism similarly
 483     to _GEO_COUNTRIES.
 484
 485     The _ENABLED attribute should be set to False for IEs that
 486     are disabled by default and must be explicitly enabled.
 487
 488     The _WORKING attribute should be set to False for broken IEs
 489     in order to warn the users and skip the tests.
 490     """
 491
 492     _ready = False
 493     _downloader = None
 494     _x_forwarded_for_ip = None
 495     _GEO_BYPASS = True
 496     _GEO_COUNTRIES = None
 497     _GEO_IP_BLOCKS = None
 498     _WORKING = True
 499     _ENABLED = True
 500     _NETRC_MACHINE = None
 501     IE_DESC = None
 502     SEARCH_KEY = None
 503     _VALID_URL = None
 504     _EMBED_REGEX = []
 505
 506     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 507         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 508         return {
 509             None: '',
 510             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 511             'password': f'Use {password_hint}',
 512             'cookies': (
 513                 'Use --cookies-from-browser or --cookies for the authentication. '
 514                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 515         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 516
 517     def __init__(self, downloader=None):
 518         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 519         If a downloader is not passed during initialization,
 520         it must be set using "set_downloader()" before "extract()" is called"""
 521         self._ready = False
 522         self._x_forwarded_for_ip = None
 523         self._printed_messages = set()
 524         self.set_downloader(downloader)
 525
 526     @classmethod
 527     def _match_valid_url(cls, url):
 528         if cls._VALID_URL is False:
 529             return None
 530         # This does not use has/getattr intentionally - we want to know whether
 531         # we have cached the regexp for *this* class, whereas getattr would also
 532         # match the superclass
 533         if '_VALID_URL_RE' not in cls.__dict__:
 534             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 535         return cls._VALID_URL_RE.match(url)
 536
 537     @classmethod
 538     def suitable(cls, url):
 539         """Receives a URL and returns True if suitable for this IE."""
 540         # This function must import everything it needs (except other extractors),
 541         # so that lazy_extractors works correctly
 542         return cls._match_valid_url(url) is not None
 543
 544     @classmethod
 545     def _match_id(cls, url):
 546         return cls._match_valid_url(url).group('id')
 547
 548     @classmethod
 549     def get_temp_id(cls, url):
 550         try:
 551             return cls._match_id(url)
 552         except (IndexError, AttributeError):
 553             return None
 554
 555     @classmethod
 556     def working(cls):
 557         """Getter method for _WORKING."""
 558         return cls._WORKING
 559
 560     @classmethod
 561     def supports_login(cls):
 562         return bool(cls._NETRC_MACHINE)
 563
 564     def initialize(self):
 565         """Initializes an instance (authentication, etc)."""
 566         self._printed_messages = set()
 567         self._initialize_geo_bypass({
 568             'countries': self._GEO_COUNTRIES,
 569             'ip_blocks': self._GEO_IP_BLOCKS,
 570         })
 571         if not self._ready:
 572             self._initialize_pre_login()
 573             if self.supports_login():
 574                 username, password = self._get_login_info()
 575                 if username:
 576                     self._perform_login(username, password)
 577             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 578                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 579             self._real_initialize()
 580             self._ready = True
 581
 582     def _initialize_geo_bypass(self, geo_bypass_context):
 583         """
 584         Initialize geo restriction bypass mechanism.
 585
 586         This method is used to initialize geo bypass mechanism based on faking
 587         X-Forwarded-For HTTP header. A random country from provided country list
 588         is selected and a random IP belonging to this country is generated. This
 589         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 590         HTTP requests.
 591
 592         This method will be used for initial geo bypass mechanism initialization
 593         during the instance initialization with _GEO_COUNTRIES and
 594         _GEO_IP_BLOCKS.
 595
 596         You may also manually call it from extractor's code if geo bypass
 597         information is not available beforehand (e.g. obtained during
 598         extraction) or due to some other reason. In this case you should pass
 599         this information in geo bypass context passed as first argument. It may
 600         contain following fields:
 601
 602         countries:  List of geo unrestricted countries (similar
 603                     to _GEO_COUNTRIES)
 604         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 605                     (similar to _GEO_IP_BLOCKS)
 606
 607         """
 608         if not self._x_forwarded_for_ip:
 609
 610             # Geo bypass mechanism is explicitly disabled by user
 611             if not self.get_param('geo_bypass', True):
 612                 return
 613
 614             if not geo_bypass_context:
 615                 geo_bypass_context = {}
 616
 617             # Backward compatibility: previously _initialize_geo_bypass
 618             # expected a list of countries, some 3rd party code may still use
 619             # it this way
 620             if isinstance(geo_bypass_context, (list, tuple)):
 621                 geo_bypass_context = {
 622                     'countries': geo_bypass_context,
 623                 }
 624
 625             # The whole point of geo bypass mechanism is to fake IP
 626             # as X-Forwarded-For HTTP header based on some IP block or
 627             # country code.
 628
 629             # Path 1: bypassing based on IP block in CIDR notation
 630
 631             # Explicit IP block specified by user, use it right away
 632             # regardless of whether extractor is geo bypassable or not
 633             ip_block = self.get_param('geo_bypass_ip_block', None)
 634
 635             # Otherwise use random IP block from geo bypass context but only
 636             # if extractor is known as geo bypassable
 637             if not ip_block:
 638                 ip_blocks = geo_bypass_context.get('ip_blocks')
 639                 if self._GEO_BYPASS and ip_blocks:
 640                     ip_block = random.choice(ip_blocks)
 641
 642             if ip_block:
 643                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 644                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 645                 return
 646
 647             # Path 2: bypassing based on country code
 648
 649             # Explicit country code specified by user, use it right away
 650             # regardless of whether extractor is geo bypassable or not
 651             country = self.get_param('geo_bypass_country', None)
 652
 653             # Otherwise use random country code from geo bypass context but
 654             # only if extractor is known as geo bypassable
 655             if not country:
 656                 countries = geo_bypass_context.get('countries')
 657                 if self._GEO_BYPASS and countries:
 658                     country = random.choice(countries)
 659
 660             if country:
 661                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 662                 self._downloader.write_debug(
 663                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 664
 665     def extract(self, url):
 666         """Extracts URL information and returns it in list of dicts."""
 667         try:
 668             for _ in range(2):
 669                 try:
 670                     self.initialize()
 671                     self.write_debug('Extracting URL: %s' % url)
 672                     ie_result = self._real_extract(url)
 673                     if ie_result is None:
 674                         return None
 675                     if self._x_forwarded_for_ip:
 676                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 677                     subtitles = ie_result.get('subtitles') or {}
 678                     if 'no-live-chat' in self.get_param('compat_opts'):
 679                         for lang in ('live_chat', 'comments', 'danmaku'):
 680                             subtitles.pop(lang, None)
 681                     return ie_result
 682                 except GeoRestrictedError as e:
 683                     if self.__maybe_fake_ip_and_retry(e.countries):
 684                         continue
 685                     raise
 686         except UnsupportedError:
 687             raise
 688         except ExtractorError as e:
 689             kwargs = {
 690                 'video_id': e.video_id or self.get_temp_id(url),
 691                 'ie': self.IE_NAME,
 692                 'tb': e.traceback or sys.exc_info()[2],
 693                 'expected': e.expected,
 694                 'cause': e.cause
 695             }
 696             if hasattr(e, 'countries'):
 697                 kwargs['countries'] = e.countries
 698             raise type(e)(e.orig_msg, **kwargs)
 699         except http.client.IncompleteRead as e:
 700             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 701         except (KeyError, StopIteration) as e:
 702             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 703
 704     def __maybe_fake_ip_and_retry(self, countries):
 705         if (not self.get_param('geo_bypass_country', None)
 706                 and self._GEO_BYPASS
 707                 and self.get_param('geo_bypass', True)
 708                 and not self._x_forwarded_for_ip
 709                 and countries):
 710             country_code = random.choice(countries)
 711             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 712             if self._x_forwarded_for_ip:
 713                 self.report_warning(
 714                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 715                     % (self._x_forwarded_for_ip, country_code.upper()))
 716                 return True
 717         return False
 718
 719     def set_downloader(self, downloader):
 720         """Sets a YoutubeDL instance as the downloader for this IE."""
 721         self._downloader = downloader
 722
 723     @property
 724     def cache(self):
 725         return self._downloader.cache
 726
 727     @property
 728     def cookiejar(self):
 729         return self._downloader.cookiejar
 730
 731     def _initialize_pre_login(self):
 732         """ Initialization before login. Redefine in subclasses."""
 733         pass
 734
 735     def _perform_login(self, username, password):
 736         """ Login with username and password. Redefine in subclasses."""
 737         pass
 738
 739     def _real_initialize(self):
 740         """Real initialization process. Redefine in subclasses."""
 741         pass
 742
 743     def _real_extract(self, url):
 744         """Real extraction process. Redefine in subclasses."""
 745         raise NotImplementedError('This method must be implemented by subclasses')
 746
 747     @classmethod
 748     def ie_key(cls):
 749         """A string for getting the InfoExtractor with get_info_extractor"""
 750         return cls.__name__[:-2]
 751
 752     @classproperty
 753     def IE_NAME(cls):
 754         return cls.__name__[:-2]
 755
 756     @staticmethod
 757     def __can_accept_status_code(err, expected_status):
 758         assert isinstance(err, urllib.error.HTTPError)
 759         if expected_status is None:
 760             return False
 761         elif callable(expected_status):
 762             return expected_status(err.code) is True
 763         else:
 764             return err.code in variadic(expected_status)
 765
 766     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 767         if isinstance(url_or_request, urllib.request.Request):
 768             return update_Request(url_or_request, data=data, headers=headers, query=query)
 769         if query:
 770             url_or_request = update_url_query(url_or_request, query)
 771         return sanitized_Request(url_or_request, data, headers or {})
 772
 773     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 774         """
 775         Return the response handle.
 776
 777         See _download_webpage docstring for arguments specification.
 778         """
 779         if not self._downloader._first_webpage_request:
 780             sleep_interval = self.get_param('sleep_interval_requests') or 0
 781             if sleep_interval > 0:
 782                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 783                 time.sleep(sleep_interval)
 784         else:
 785             self._downloader._first_webpage_request = False
 786
 787         if note is None:
 788             self.report_download_webpage(video_id)
 789         elif note is not False:
 790             if video_id is None:
 791                 self.to_screen(str(note))
 792             else:
 793                 self.to_screen(f'{video_id}: {note}')
 794
 795         # Some sites check X-Forwarded-For HTTP header in order to figure out
 796         # the origin of the client behind proxy. This allows bypassing geo
 797         # restriction by faking this header's value to IP that belongs to some
 798         # geo unrestricted country. We will do so once we encounter any
 799         # geo restriction error.
 800         if self._x_forwarded_for_ip:
 801             headers = (headers or {}).copy()
 802             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 803
 804         try:
 805             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 806         except network_exceptions as err:
 807             if isinstance(err, urllib.error.HTTPError):
 808                 if self.__can_accept_status_code(err, expected_status):
 809                     # Retain reference to error to prevent file object from
 810                     # being closed before it can be read. Works around the
 811                     # effects of <https://bugs.python.org/issue15002>
 812                     # introduced in Python 3.4.1.
 813                     err.fp._error = err
 814                     return err.fp
 815
 816             if errnote is False:
 817                 return False
 818             if errnote is None:
 819                 errnote = 'Unable to download webpage'
 820
 821             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 822             if fatal:
 823                 raise ExtractorError(errmsg, cause=err)
 824             else:
 825                 self.report_warning(errmsg)
 826                 return False
 827
 828     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 829                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 830         """
 831         Return a tuple (page content as string, URL handle).
 832
 833         Arguments:
 834         url_or_request -- plain text URL as a string or
 835             a urllib.request.Request object
 836         video_id -- Video/playlist/item identifier (string)
 837
 838         Keyword arguments:
 839         note -- note printed before downloading (string)
 840         errnote -- note printed in case of an error (string)
 841         fatal -- flag denoting whether error should be considered fatal,
 842             i.e. whether it should cause ExtractionError to be raised,
 843             otherwise a warning will be reported and extraction continued
 844         encoding -- encoding for a page content decoding, guessed automatically
 845             when not explicitly specified
 846         data -- POST data (bytes)
 847         headers -- HTTP headers (dict)
 848         query -- URL query (dict)
 849         expected_status -- allows to accept failed HTTP requests (non 2xx
 850             status code) by explicitly specifying a set of accepted status
 851             codes. Can be any of the following entities:
 852                 - an integer type specifying an exact failed status code to
 853                   accept
 854                 - a list or a tuple of integer types specifying a list of
 855                   failed status codes to accept
 856                 - a callable accepting an actual failed status code and
 857                   returning True if it should be accepted
 858             Note that this argument does not affect success status codes (2xx)
 859             which are always accepted.
 860         """
 861
 862         # Strip hashes from the URL (#1038)
 863         if isinstance(url_or_request, str):
 864             url_or_request = url_or_request.partition('#')[0]
 865
 866         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 867         if urlh is False:
 868             assert not fatal
 869             return False
 870         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 871         return (content, urlh)
 872
 873     @staticmethod
 874     def _guess_encoding_from_content(content_type, webpage_bytes):
 875         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 876         if m:
 877             encoding = m.group(1)
 878         else:
 879             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 880                           webpage_bytes[:1024])
 881             if m:
 882                 encoding = m.group(1).decode('ascii')
 883             elif webpage_bytes.startswith(b'\xff\xfe'):
 884                 encoding = 'utf-16'
 885             else:
 886                 encoding = 'utf-8'
 887
 888         return encoding
 889
 890     def __check_blocked(self, content):
 891         first_block = content[:512]
 892         if ('<title>Access to this site is blocked</title>' in content
 893                 and 'Websense' in first_block):
 894             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 895             blocked_iframe = self._html_search_regex(
 896                 r'<iframe src="([^"]+)"', content,
 897                 'Websense information URL', default=None)
 898             if blocked_iframe:
 899                 msg += ' Visit %s for more details' % blocked_iframe
 900             raise ExtractorError(msg, expected=True)
 901         if '<title>The URL you requested has been blocked</title>' in first_block:
 902             msg = (
 903                 'Access to this webpage has been blocked by Indian censorship. '
 904                 'Use a VPN or proxy server (with --proxy) to route around it.')
 905             block_msg = self._html_search_regex(
 906                 r'</h1><p>(.*?)</p>',
 907                 content, 'block message', default=None)
 908             if block_msg:
 909                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 910             raise ExtractorError(msg, expected=True)
 911         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 912                 and 'blocklist.rkn.gov.ru' in content):
 913             raise ExtractorError(
 914                 'Access to this webpage has been blocked by decision of the Russian government. '
 915                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 916                 expected=True)
 917
 918     def _request_dump_filename(self, url, video_id):
 919         basen = f'{video_id}_{url}'
 920         trim_length = self.get_param('trim_file_name') or 240
 921         if len(basen) > trim_length:
 922             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 923             basen = basen[:trim_length - len(h)] + h
 924         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 925         # Working around MAX_PATH limitation on Windows (see
 926         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 927         if compat_os_name == 'nt':
 928             absfilepath = os.path.abspath(filename)
 929             if len(absfilepath) > 259:
 930                 filename = fR'\\?\{absfilepath}'
 931         return filename
 932
 933     def __decode_webpage(self, webpage_bytes, encoding, headers):
 934         if not encoding:
 935             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 936         try:
 937             return webpage_bytes.decode(encoding, 'replace')
 938         except LookupError:
 939             return webpage_bytes.decode('utf-8', 'replace')
 940
 941     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 942         webpage_bytes = urlh.read()
 943         if prefix is not None:
 944             webpage_bytes = prefix + webpage_bytes
 945         if self.get_param('dump_intermediate_pages', False):
 946             self.to_screen('Dumping request to ' + urlh.geturl())
 947             dump = base64.b64encode(webpage_bytes).decode('ascii')
 948             self._downloader.to_screen(dump)
 949         if self.get_param('write_pages'):
 950             filename = self._request_dump_filename(urlh.geturl(), video_id)
 951             self.to_screen(f'Saving request to {filename}')
 952             with open(filename, 'wb') as outf:
 953                 outf.write(webpage_bytes)
 954
 955         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 956         self.__check_blocked(content)
 957
 958         return content
 959
 960     def __print_error(self, errnote, fatal, video_id, err):
 961         if fatal:
 962             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 963         elif errnote:
 964             self.report_warning(f'{video_id}: {errnote}: {err}')
 965
 966     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 967         if transform_source:
 968             xml_string = transform_source(xml_string)
 969         try:
 970             return compat_etree_fromstring(xml_string.encode('utf-8'))
 971         except xml.etree.ElementTree.ParseError as ve:
 972             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 973
 974     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 975         try:
 976             return json.loads(
 977                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 978         except ValueError as ve:
 979             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 980
 981     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 982         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 983
 984     def __create_download_methods(name, parser, note, errnote, return_value):
 985
 986         def parse(ie, content, *args, errnote=errnote, **kwargs):
 987             if parser is None:
 988                 return content
 989             if errnote is False:
 990                 kwargs['errnote'] = errnote
 991             # parser is fetched by name so subclasses can override it
 992             return getattr(ie, parser)(content, *args, **kwargs)
 993
 994         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
 995                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 996             res = self._download_webpage_handle(
 997                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
 998                 data=data, headers=headers, query=query, expected_status=expected_status)
 999             if res is False:
1000                 return res
1001             content, urlh = res
1002             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1003
1004         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1005                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1006             if self.get_param('load_pages'):
1007                 url_or_request = self._create_request(url_or_request, data, headers, query)
1008                 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1009                 self.to_screen(f'Loading request from {filename}')
1010                 try:
1011                     with open(filename, 'rb') as dumpf:
1012                         webpage_bytes = dumpf.read()
1013                 except OSError as e:
1014                     self.report_warning(f'Unable to load request from disk: {e}')
1015                 else:
1016                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1017                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1018             kwargs = {
1019                 'note': note,
1020                 'errnote': errnote,
1021                 'transform_source': transform_source,
1022                 'fatal': fatal,
1023                 'encoding': encoding,
1024                 'data': data,
1025                 'headers': headers,
1026                 'query': query,
1027                 'expected_status': expected_status,
1028             }
1029             if parser is None:
1030                 kwargs.pop('transform_source')
1031             # The method is fetched by name so subclasses can override _download_..._handle
1032             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1033             return res if res is False else res[0]
1034
1035         def impersonate(func, name, return_value):
1036             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1037             func.__doc__ = f'''
1038                 @param transform_source     Apply this transformation before parsing
1039                 @returns                    {return_value}
1040
1041                 See _download_webpage_handle docstring for other arguments specification
1042             '''
1043
1044         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1045         impersonate(download_content, f'_download_{name}', f'{return_value}')
1046         return download_handle, download_content
1047
1048     _download_xml_handle, _download_xml = __create_download_methods(
1049         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1050     _download_json_handle, _download_json = __create_download_methods(
1051         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1052     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1053         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1054     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1055
1056     def _download_webpage(
1057             self, url_or_request, video_id, note=None, errnote=None,
1058             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1059         """
1060         Return the data of the page as a string.
1061
1062         Keyword arguments:
1063         tries -- number of tries
1064         timeout -- sleep interval between tries
1065
1066         See _download_webpage_handle docstring for other arguments specification.
1067         """
1068
1069         R''' # NB: These are unused; should they be deprecated?
1070         if tries != 1:
1071             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1072         if timeout is NO_DEFAULT:
1073             timeout = 5
1074         else:
1075             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1076         '''
1077
1078         try_count = 0
1079         while True:
1080             try:
1081                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1082             except http.client.IncompleteRead as e:
1083                 try_count += 1
1084                 if try_count >= tries:
1085                     raise e
1086                 self._sleep(timeout, video_id)
1087
1088     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1089         idstr = format_field(video_id, None, '%s: ')
1090         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1091         if only_once:
1092             if f'WARNING: {msg}' in self._printed_messages:
1093                 return
1094             self._printed_messages.add(f'WARNING: {msg}')
1095         self._downloader.report_warning(msg, *args, **kwargs)
1096
1097     def to_screen(self, msg, *args, **kwargs):
1098         """Print msg to screen, prefixing it with '[ie_name]'"""
1099         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1100
1101     def write_debug(self, msg, *args, **kwargs):
1102         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1103
1104     def get_param(self, name, default=None, *args, **kwargs):
1105         if self._downloader:
1106             return self._downloader.params.get(name, default, *args, **kwargs)
1107         return default
1108
1109     def report_drm(self, video_id, partial=False):
1110         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1111
1112     def report_extraction(self, id_or_name):
1113         """Report information extraction."""
1114         self.to_screen('%s: Extracting information' % id_or_name)
1115
1116     def report_download_webpage(self, video_id):
1117         """Report webpage download."""
1118         self.to_screen('%s: Downloading webpage' % video_id)
1119
1120     def report_age_confirmation(self):
1121         """Report attempt to confirm age."""
1122         self.to_screen('Confirming age')
1123
1124     def report_login(self):
1125         """Report attempt to log in."""
1126         self.to_screen('Logging in')
1127
1128     def raise_login_required(
1129             self, msg='This video is only available for registered users',
1130             metadata_available=False, method=NO_DEFAULT):
1131         if metadata_available and (
1132                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1133             self.report_warning(msg)
1134             return
1135         msg += format_field(self._login_hint(method), None, '. %s')
1136         raise ExtractorError(msg, expected=True)
1137
1138     def raise_geo_restricted(
1139             self, msg='This video is not available from your location due to geo restriction',
1140             countries=None, metadata_available=False):
1141         if metadata_available and (
1142                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1143             self.report_warning(msg)
1144         else:
1145             raise GeoRestrictedError(msg, countries=countries)
1146
1147     def raise_no_formats(self, msg, expected=False, video_id=None):
1148         if expected and (
1149                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1150             self.report_warning(msg, video_id)
1151         elif isinstance(msg, ExtractorError):
1152             raise msg
1153         else:
1154             raise ExtractorError(msg, expected=expected, video_id=video_id)
1155
1156     # Methods for following #608
1157     @staticmethod
1158     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1159         """Returns a URL that points to a page that should be processed"""
1160         if ie is not None:
1161             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1162         if video_id is not None:
1163             kwargs['id'] = video_id
1164         if video_title is not None:
1165             kwargs['title'] = video_title
1166         return {
1167             **kwargs,
1168             '_type': 'url_transparent' if url_transparent else 'url',
1169             'url': url,
1170         }
1171
1172     @classmethod
1173     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1174                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1175         return cls.playlist_result(
1176             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1177             playlist_id, playlist_title, **kwargs)
1178
1179     @staticmethod
1180     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1181         """Returns a playlist"""
1182         if playlist_id:
1183             kwargs['id'] = playlist_id
1184         if playlist_title:
1185             kwargs['title'] = playlist_title
1186         if playlist_description is not None:
1187             kwargs['description'] = playlist_description
1188         return {
1189             **kwargs,
1190             '_type': 'multi_video' if multi_video else 'playlist',
1191             'entries': entries,
1192         }
1193
1194     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1195         """
1196         Perform a regex search on the given string, using a single or a list of
1197         patterns returning the first matching group.
1198         In case of failure return a default value or raise a WARNING or a
1199         RegexNotFoundError, depending on fatal, specifying the field name.
1200         """
1201         if string is None:
1202             mobj = None
1203         elif isinstance(pattern, (str, re.Pattern)):
1204             mobj = re.search(pattern, string, flags)
1205         else:
1206             for p in pattern:
1207                 mobj = re.search(p, string, flags)
1208                 if mobj:
1209                     break
1210
1211         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1212
1213         if mobj:
1214             if group is None:
1215                 # return the first matching group
1216                 return next(g for g in mobj.groups() if g is not None)
1217             elif isinstance(group, (list, tuple)):
1218                 return tuple(mobj.group(g) for g in group)
1219             else:
1220                 return mobj.group(group)
1221         elif default is not NO_DEFAULT:
1222             return default
1223         elif fatal:
1224             raise RegexNotFoundError('Unable to extract %s' % _name)
1225         else:
1226             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1227             return None
1228
1229     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1230                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1231         """Searches string for the JSON object specified by start_pattern"""
1232         # NB: end_pattern is only used to reduce the size of the initial match
1233         if default is NO_DEFAULT:
1234             default, has_default = {}, False
1235         else:
1236             fatal, has_default = False, True
1237
1238         json_string = self._search_regex(
1239             rf'(?:{start_pattern})\s*(?P<json>{{\s*(?:{contains_pattern})\s*}})\s*(?:{end_pattern})',
1240             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1241         if not json_string:
1242             return default
1243
1244         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1245         try:
1246             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1247         except ExtractorError as e:
1248             if fatal:
1249                 raise ExtractorError(
1250                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1251             elif not has_default:
1252                 self.report_warning(
1253                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1254         return default
1255
1256     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1257         """
1258         Like _search_regex, but strips HTML tags and unescapes entities.
1259         """
1260         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1261         if res:
1262             return clean_html(res).strip()
1263         else:
1264             return res
1265
1266     def _get_netrc_login_info(self, netrc_machine=None):
1267         username = None
1268         password = None
1269         netrc_machine = netrc_machine or self._NETRC_MACHINE
1270
1271         if self.get_param('usenetrc', False):
1272             try:
1273                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1274                 if os.path.isdir(netrc_file):
1275                     netrc_file = os.path.join(netrc_file, '.netrc')
1276                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1277                 if info is not None:
1278                     username = info[0]
1279                     password = info[2]
1280                 else:
1281                     raise netrc.NetrcParseError(
1282                         'No authenticators for %s' % netrc_machine)
1283             except (OSError, netrc.NetrcParseError) as err:
1284                 self.report_warning(
1285                     'parsing .netrc: %s' % error_to_compat_str(err))
1286
1287         return username, password
1288
1289     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1290         """
1291         Get the login info as (username, password)
1292         First look for the manually specified credentials using username_option
1293         and password_option as keys in params dictionary. If no such credentials
1294         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1295         value.
1296         If there's no info available, return (None, None)
1297         """
1298
1299         # Attempt to use provided username and password or .netrc data
1300         username = self.get_param(username_option)
1301         if username is not None:
1302             password = self.get_param(password_option)
1303         else:
1304             username, password = self._get_netrc_login_info(netrc_machine)
1305
1306         return username, password
1307
1308     def _get_tfa_info(self, note='two-factor verification code'):
1309         """
1310         Get the two-factor authentication info
1311         TODO - asking the user will be required for sms/phone verify
1312         currently just uses the command line option
1313         If there's no info available, return None
1314         """
1315
1316         tfa = self.get_param('twofactor')
1317         if tfa is not None:
1318             return tfa
1319
1320         return getpass.getpass('Type %s and press [Return]: ' % note)
1321
1322     # Helper functions for extracting OpenGraph info
1323     @staticmethod
1324     def _og_regexes(prop):
1325         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1326         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1327                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1328         template = r'<meta[^>]+?%s[^>]+?%s'
1329         return [
1330             template % (property_re, content_re),
1331             template % (content_re, property_re),
1332         ]
1333
1334     @staticmethod
1335     def _meta_regex(prop):
1336         return r'''(?isx)<meta
1337                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1338                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1339
1340     def _og_search_property(self, prop, html, name=None, **kargs):
1341         prop = variadic(prop)
1342         if name is None:
1343             name = 'OpenGraph %s' % prop[0]
1344         og_regexes = []
1345         for p in prop:
1346             og_regexes.extend(self._og_regexes(p))
1347         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1348         if escaped is None:
1349             return None
1350         return unescapeHTML(escaped)
1351
1352     def _og_search_thumbnail(self, html, **kargs):
1353         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1354
1355     def _og_search_description(self, html, **kargs):
1356         return self._og_search_property('description', html, fatal=False, **kargs)
1357
1358     def _og_search_title(self, html, *, fatal=False, **kargs):
1359         return self._og_search_property('title', html, fatal=fatal, **kargs)
1360
1361     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1362         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1363         if secure:
1364             regexes = self._og_regexes('video:secure_url') + regexes
1365         return self._html_search_regex(regexes, html, name, **kargs)
1366
1367     def _og_search_url(self, html, **kargs):
1368         return self._og_search_property('url', html, **kargs)
1369
1370     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1371         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1372
1373     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1374         name = variadic(name)
1375         if display_name is None:
1376             display_name = name[0]
1377         return self._html_search_regex(
1378             [self._meta_regex(n) for n in name],
1379             html, display_name, fatal=fatal, group='content', **kwargs)
1380
1381     def _dc_search_uploader(self, html):
1382         return self._html_search_meta('dc.creator', html, 'uploader')
1383
1384     @staticmethod
1385     def _rta_search(html):
1386         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1387         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1388                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1389                      html):
1390             return 18
1391
1392         # And then there are the jokers who advertise that they use RTA, but actually don't.
1393         AGE_LIMIT_MARKERS = [
1394             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1395         ]
1396         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1397             return 18
1398         return 0
1399
1400     def _media_rating_search(self, html):
1401         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1402         rating = self._html_search_meta('rating', html)
1403
1404         if not rating:
1405             return None
1406
1407         RATING_TABLE = {
1408             'safe for kids': 0,
1409             'general': 8,
1410             '14 years': 14,
1411             'mature': 17,
1412             'restricted': 19,
1413         }
1414         return RATING_TABLE.get(rating.lower())
1415
1416     def _family_friendly_search(self, html):
1417         # See http://schema.org/VideoObject
1418         family_friendly = self._html_search_meta(
1419             'isFamilyFriendly', html, default=None)
1420
1421         if not family_friendly:
1422             return None
1423
1424         RATING_TABLE = {
1425             '1': 0,
1426             'true': 0,
1427             '0': 18,
1428             'false': 18,
1429         }
1430         return RATING_TABLE.get(family_friendly.lower())
1431
1432     def _twitter_search_player(self, html):
1433         return self._html_search_meta('twitter:player', html,
1434                                       'twitter card player')
1435
1436     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1437         """Yield all json ld objects in the html"""
1438         if default is not NO_DEFAULT:
1439             fatal = False
1440         for mobj in re.finditer(JSON_LD_RE, html):
1441             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1442             for json_ld in variadic(json_ld_item):
1443                 if isinstance(json_ld, dict):
1444                     yield json_ld
1445
1446     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1447         """Search for a video in any json ld in the html"""
1448         if default is not NO_DEFAULT:
1449             fatal = False
1450         info = self._json_ld(
1451             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1452             video_id, fatal=fatal, expected_type=expected_type)
1453         if info:
1454             return info
1455         if default is not NO_DEFAULT:
1456             return default
1457         elif fatal:
1458             raise RegexNotFoundError('Unable to extract JSON-LD')
1459         else:
1460             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1461             return {}
1462
1463     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1464         if isinstance(json_ld, str):
1465             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1466         if not json_ld:
1467             return {}
1468         info = {}
1469         if not isinstance(json_ld, (list, tuple, dict)):
1470             return info
1471         if isinstance(json_ld, dict):
1472             json_ld = [json_ld]
1473
1474         INTERACTION_TYPE_MAP = {
1475             'CommentAction': 'comment',
1476             'AgreeAction': 'like',
1477             'DisagreeAction': 'dislike',
1478             'LikeAction': 'like',
1479             'DislikeAction': 'dislike',
1480             'ListenAction': 'view',
1481             'WatchAction': 'view',
1482             'ViewAction': 'view',
1483         }
1484
1485         def is_type(e, *expected_types):
1486             type = variadic(traverse_obj(e, '@type'))
1487             return any(x in type for x in expected_types)
1488
1489         def extract_interaction_type(e):
1490             interaction_type = e.get('interactionType')
1491             if isinstance(interaction_type, dict):
1492                 interaction_type = interaction_type.get('@type')
1493             return str_or_none(interaction_type)
1494
1495         def extract_interaction_statistic(e):
1496             interaction_statistic = e.get('interactionStatistic')
1497             if isinstance(interaction_statistic, dict):
1498                 interaction_statistic = [interaction_statistic]
1499             if not isinstance(interaction_statistic, list):
1500                 return
1501             for is_e in interaction_statistic:
1502                 if not is_type(is_e, 'InteractionCounter'):
1503                     continue
1504                 interaction_type = extract_interaction_type(is_e)
1505                 if not interaction_type:
1506                     continue
1507                 # For interaction count some sites provide string instead of
1508                 # an integer (as per spec) with non digit characters (e.g. ",")
1509                 # so extracting count with more relaxed str_to_int
1510                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1511                 if interaction_count is None:
1512                     continue
1513                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1514                 if not count_kind:
1515                     continue
1516                 count_key = '%s_count' % count_kind
1517                 if info.get(count_key) is not None:
1518                     continue
1519                 info[count_key] = interaction_count
1520
1521         def extract_chapter_information(e):
1522             chapters = [{
1523                 'title': part.get('name'),
1524                 'start_time': part.get('startOffset'),
1525                 'end_time': part.get('endOffset'),
1526             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1527             for idx, (last_c, current_c, next_c) in enumerate(zip(
1528                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1529                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1530                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1531                 if None in current_c.values():
1532                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1533                     return
1534             if chapters:
1535                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1536                 info['chapters'] = chapters
1537
1538         def extract_video_object(e):
1539             author = e.get('author')
1540             info.update({
1541                 'url': url_or_none(e.get('contentUrl')),
1542                 'ext': mimetype2ext(e.get('encodingFormat')),
1543                 'title': unescapeHTML(e.get('name')),
1544                 'description': unescapeHTML(e.get('description')),
1545                 'thumbnails': [{'url': unescapeHTML(url)}
1546                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1547                                if url_or_none(url)],
1548                 'duration': parse_duration(e.get('duration')),
1549                 'timestamp': unified_timestamp(e.get('uploadDate')),
1550                 # author can be an instance of 'Organization' or 'Person' types.
1551                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1552                 # however some websites are using 'Text' type instead.
1553                 # 1. https://schema.org/VideoObject
1554                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1555                 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1556                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1557                 'tbr': int_or_none(e.get('bitrate')),
1558                 'width': int_or_none(e.get('width')),
1559                 'height': int_or_none(e.get('height')),
1560                 'view_count': int_or_none(e.get('interactionCount')),
1561                 'tags': try_call(lambda: e.get('keywords').split(',')),
1562             })
1563             if is_type(e, 'AudioObject'):
1564                 info.update({
1565                     'vcodec': 'none',
1566                     'abr': int_or_none(e.get('bitrate')),
1567                 })
1568             extract_interaction_statistic(e)
1569             extract_chapter_information(e)
1570
1571         def traverse_json_ld(json_ld, at_top_level=True):
1572             for e in json_ld:
1573                 if at_top_level and '@context' not in e:
1574                     continue
1575                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1576                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1577                     break
1578                 if expected_type is not None and not is_type(e, expected_type):
1579                     continue
1580                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1581                 if rating is not None:
1582                     info['average_rating'] = rating
1583                 if is_type(e, 'TVEpisode', 'Episode'):
1584                     episode_name = unescapeHTML(e.get('name'))
1585                     info.update({
1586                         'episode': episode_name,
1587                         'episode_number': int_or_none(e.get('episodeNumber')),
1588                         'description': unescapeHTML(e.get('description')),
1589                     })
1590                     if not info.get('title') and episode_name:
1591                         info['title'] = episode_name
1592                     part_of_season = e.get('partOfSeason')
1593                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1594                         info.update({
1595                             'season': unescapeHTML(part_of_season.get('name')),
1596                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1597                         })
1598                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1599                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1600                         info['series'] = unescapeHTML(part_of_series.get('name'))
1601                 elif is_type(e, 'Movie'):
1602                     info.update({
1603                         'title': unescapeHTML(e.get('name')),
1604                         'description': unescapeHTML(e.get('description')),
1605                         'duration': parse_duration(e.get('duration')),
1606                         'timestamp': unified_timestamp(e.get('dateCreated')),
1607                     })
1608                 elif is_type(e, 'Article', 'NewsArticle'):
1609                     info.update({
1610                         'timestamp': parse_iso8601(e.get('datePublished')),
1611                         'title': unescapeHTML(e.get('headline')),
1612                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1613                     })
1614                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1615                         extract_video_object(e['video'][0])
1616                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1617                         extract_video_object(e['subjectOf'][0])
1618                 elif is_type(e, 'VideoObject', 'AudioObject'):
1619                     extract_video_object(e)
1620                     if expected_type is None:
1621                         continue
1622                     else:
1623                         break
1624                 video = e.get('video')
1625                 if is_type(video, 'VideoObject'):
1626                     extract_video_object(video)
1627                 if expected_type is None:
1628                     continue
1629                 else:
1630                     break
1631         traverse_json_ld(json_ld)
1632
1633         return filter_dict(info)
1634
1635     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1636         return self._parse_json(
1637             self._search_regex(
1638                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1639                 webpage, 'next.js data', fatal=fatal, **kw),
1640             video_id, transform_source=transform_source, fatal=fatal)
1641
1642     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1643         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1644         rectx = re.escape(context_name)
1645         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1646         js, arg_keys, arg_vals = self._search_regex(
1647             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1648             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1649
1650         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1651
1652         for key, val in args.items():
1653             if val in ('undefined', 'void 0'):
1654                 args[key] = 'null'
1655
1656         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1657         return traverse_obj(ret, traverse) or {}
1658
1659     @staticmethod
1660     def _hidden_inputs(html):
1661         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1662         hidden_inputs = {}
1663         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1664             attrs = extract_attributes(input)
1665             if not input:
1666                 continue
1667             if attrs.get('type') not in ('hidden', 'submit'):
1668                 continue
1669             name = attrs.get('name') or attrs.get('id')
1670             value = attrs.get('value')
1671             if name and value is not None:
1672                 hidden_inputs[name] = value
1673         return hidden_inputs
1674
1675     def _form_hidden_inputs(self, form_id, html):
1676         form = self._search_regex(
1677             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1678             html, '%s form' % form_id, group='form')
1679         return self._hidden_inputs(form)
1680
1681     class FormatSort:
1682         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1683
1684         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1685                    'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
1686                    'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1687         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1688                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1689                         'fps', 'fs_approx', 'source', 'id')
1690
1691         settings = {
1692             'vcodec': {'type': 'ordered', 'regex': True,
1693                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1694             'acodec': {'type': 'ordered', 'regex': True,
1695                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1696             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1697                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1698             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1699                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1700             'vext': {'type': 'ordered', 'field': 'video_ext',
1701                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1702                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1703             'aext': {'type': 'ordered', 'field': 'audio_ext',
1704                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1705                      'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
1706             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1707             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1708                            'field': ('vcodec', 'acodec'),
1709                            'function': lambda it: int(any(v != 'none' for v in it))},
1710             'ie_pref': {'priority': True, 'type': 'extractor'},
1711             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1712             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1713             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1714             'quality': {'convert': 'float', 'default': -1},
1715             'filesize': {'convert': 'bytes'},
1716             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1717             'id': {'convert': 'string', 'field': 'format_id'},
1718             'height': {'convert': 'float_none'},
1719             'width': {'convert': 'float_none'},
1720             'fps': {'convert': 'float_none'},
1721             'channels': {'convert': 'float_none', 'field': 'audio_channels'},
1722             'tbr': {'convert': 'float_none'},
1723             'vbr': {'convert': 'float_none'},
1724             'abr': {'convert': 'float_none'},
1725             'asr': {'convert': 'float_none'},
1726             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1727
1728             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1729             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1730             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1731             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1732             'res': {'type': 'multiple', 'field': ('height', 'width'),
1733                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1734
1735             # Actual field names
1736             'format_id': {'type': 'alias', 'field': 'id'},
1737             'preference': {'type': 'alias', 'field': 'ie_pref'},
1738             'language_preference': {'type': 'alias', 'field': 'lang'},
1739             'source_preference': {'type': 'alias', 'field': 'source'},
1740             'protocol': {'type': 'alias', 'field': 'proto'},
1741             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1742             'audio_channels': {'type': 'alias', 'field': 'channels'},
1743
1744             # Deprecated
1745             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1746             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1747             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1748             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1749             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1750             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1751             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1752             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1753             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1754             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1755             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1756             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1757             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1758             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1759             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1760             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1761             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1762             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1763             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1764             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1765         }
1766
1767         def __init__(self, ie, field_preference):
1768             self._order = []
1769             self.ydl = ie._downloader
1770             self.evaluate_params(self.ydl.params, field_preference)
1771             if ie.get_param('verbose'):
1772                 self.print_verbose_info(self.ydl.write_debug)
1773
1774         def _get_field_setting(self, field, key):
1775             if field not in self.settings:
1776                 if key in ('forced', 'priority'):
1777                     return False
1778                 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
1779                                             'deprecated and may be removed in a future version')
1780                 self.settings[field] = {}
1781             propObj = self.settings[field]
1782             if key not in propObj:
1783                 type = propObj.get('type')
1784                 if key == 'field':
1785                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1786                 elif key == 'convert':
1787                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1788                 else:
1789                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1790                 propObj[key] = default
1791             return propObj[key]
1792
1793         def _resolve_field_value(self, field, value, convertNone=False):
1794             if value is None:
1795                 if not convertNone:
1796                     return None
1797             else:
1798                 value = value.lower()
1799             conversion = self._get_field_setting(field, 'convert')
1800             if conversion == 'ignore':
1801                 return None
1802             if conversion == 'string':
1803                 return value
1804             elif conversion == 'float_none':
1805                 return float_or_none(value)
1806             elif conversion == 'bytes':
1807                 return FileDownloader.parse_bytes(value)
1808             elif conversion == 'order':
1809                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1810                 use_regex = self._get_field_setting(field, 'regex')
1811                 list_length = len(order_list)
1812                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1813                 if use_regex and value is not None:
1814                     for i, regex in enumerate(order_list):
1815                         if regex and re.match(regex, value):
1816                             return list_length - i
1817                     return list_length - empty_pos  # not in list
1818                 else:  # not regex or  value = None
1819                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1820             else:
1821                 if value.isnumeric():
1822                     return float(value)
1823                 else:
1824                     self.settings[field]['convert'] = 'string'
1825                     return value
1826
1827         def evaluate_params(self, params, sort_extractor):
1828             self._use_free_order = params.get('prefer_free_formats', False)
1829             self._sort_user = params.get('format_sort', [])
1830             self._sort_extractor = sort_extractor
1831
1832             def add_item(field, reverse, closest, limit_text):
1833                 field = field.lower()
1834                 if field in self._order:
1835                     return
1836                 self._order.append(field)
1837                 limit = self._resolve_field_value(field, limit_text)
1838                 data = {
1839                     'reverse': reverse,
1840                     'closest': False if limit is None else closest,
1841                     'limit_text': limit_text,
1842                     'limit': limit}
1843                 if field in self.settings:
1844                     self.settings[field].update(data)
1845                 else:
1846                     self.settings[field] = data
1847
1848             sort_list = (
1849                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1850                 + (tuple() if params.get('format_sort_force', False)
1851                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1852                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1853
1854             for item in sort_list:
1855                 match = re.match(self.regex, item)
1856                 if match is None:
1857                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1858                 field = match.group('field')
1859                 if field is None:
1860                     continue
1861                 if self._get_field_setting(field, 'type') == 'alias':
1862                     alias, field = field, self._get_field_setting(field, 'field')
1863                     if self._get_field_setting(alias, 'deprecated'):
1864                         self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
1865                                                     'be removed in a future version. Please use {field} instead')
1866                 reverse = match.group('reverse') is not None
1867                 closest = match.group('separator') == '~'
1868                 limit_text = match.group('limit')
1869
1870                 has_limit = limit_text is not None
1871                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1872                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1873
1874                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1875                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1876                 limit_count = len(limits)
1877                 for (i, f) in enumerate(fields):
1878                     add_item(f, reverse, closest,
1879                              limits[i] if i < limit_count
1880                              else limits[0] if has_limit and not has_multiple_limits
1881                              else None)
1882
1883         def print_verbose_info(self, write_debug):
1884             if self._sort_user:
1885                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1886             if self._sort_extractor:
1887                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1888             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1889                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1890                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1891                               self._get_field_setting(field, 'limit_text'),
1892                               self._get_field_setting(field, 'limit'))
1893                 if self._get_field_setting(field, 'limit_text') is not None else '')
1894                 for field in self._order if self._get_field_setting(field, 'visible')]))
1895
1896         def _calculate_field_preference_from_value(self, format, field, type, value):
1897             reverse = self._get_field_setting(field, 'reverse')
1898             closest = self._get_field_setting(field, 'closest')
1899             limit = self._get_field_setting(field, 'limit')
1900
1901             if type == 'extractor':
1902                 maximum = self._get_field_setting(field, 'max')
1903                 if value is None or (maximum is not None and value >= maximum):
1904                     value = -1
1905             elif type == 'boolean':
1906                 in_list = self._get_field_setting(field, 'in_list')
1907                 not_in_list = self._get_field_setting(field, 'not_in_list')
1908                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1909             elif type == 'ordered':
1910                 value = self._resolve_field_value(field, value, True)
1911
1912             # try to convert to number
1913             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1914             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1915             if is_num:
1916                 value = val_num
1917
1918             return ((-10, 0) if value is None
1919                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1920                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1921                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1922                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1923                     else (-1, value, 0))
1924
1925         def _calculate_field_preference(self, format, field):
1926             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1927             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1928             if type == 'multiple':
1929                 type = 'field'  # Only 'field' is allowed in multiple for now
1930                 actual_fields = self._get_field_setting(field, 'field')
1931
1932                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1933             else:
1934                 value = get_value(field)
1935             return self._calculate_field_preference_from_value(format, field, type, value)
1936
1937         def calculate_preference(self, format):
1938             # Determine missing protocol
1939             if not format.get('protocol'):
1940                 format['protocol'] = determine_protocol(format)
1941
1942             # Determine missing ext
1943             if not format.get('ext') and 'url' in format:
1944                 format['ext'] = determine_ext(format['url'])
1945             if format.get('vcodec') == 'none':
1946                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1947                 format['video_ext'] = 'none'
1948             else:
1949                 format['video_ext'] = format['ext']
1950                 format['audio_ext'] = 'none'
1951             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1952             #    format['preference'] = -1000
1953
1954             # Determine missing bitrates
1955             if format.get('tbr') is None:
1956                 if format.get('vbr') is not None and format.get('abr') is not None:
1957                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1958             else:
1959                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1960                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1961                 if format.get('acodec') != 'none' and format.get('abr') is None:
1962                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1963
1964             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1965
1966     def _sort_formats(self, formats, field_preference=[]):
1967         if not formats:
1968             return
1969         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1970
1971     def _check_formats(self, formats, video_id):
1972         if formats:
1973             formats[:] = filter(
1974                 lambda f: self._is_valid_url(
1975                     f['url'], video_id,
1976                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1977                 formats)
1978
1979     @staticmethod
1980     def _remove_duplicate_formats(formats):
1981         format_urls = set()
1982         unique_formats = []
1983         for f in formats:
1984             if f['url'] not in format_urls:
1985                 format_urls.add(f['url'])
1986                 unique_formats.append(f)
1987         formats[:] = unique_formats
1988
1989     def _is_valid_url(self, url, video_id, item='video', headers={}):
1990         url = self._proto_relative_url(url, scheme='http:')
1991         # For now assume non HTTP(S) URLs always valid
1992         if not (url.startswith('http://') or url.startswith('https://')):
1993             return True
1994         try:
1995             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1996             return True
1997         except ExtractorError as e:
1998             self.to_screen(
1999                 '%s: %s URL is invalid, skipping: %s'
2000                 % (video_id, item, error_to_compat_str(e.cause)))
2001             return False
2002
2003     def http_scheme(self):
2004         """ Either "http:" or "https:", depending on the user's preferences """
2005         return (
2006             'http:'
2007             if self.get_param('prefer_insecure', False)
2008             else 'https:')
2009
2010     def _proto_relative_url(self, url, scheme=None):
2011         scheme = scheme or self.http_scheme()
2012         assert scheme.endswith(':')
2013         return sanitize_url(url, scheme=scheme[:-1])
2014
2015     def _sleep(self, timeout, video_id, msg_template=None):
2016         if msg_template is None:
2017             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2018         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2019         self.to_screen(msg)
2020         time.sleep(timeout)
2021
2022     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2023                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2024                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2025         res = self._download_xml_handle(
2026             manifest_url, video_id, 'Downloading f4m manifest',
2027             'Unable to download f4m manifest',
2028             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2029             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2030             transform_source=transform_source,
2031             fatal=fatal, data=data, headers=headers, query=query)
2032         if res is False:
2033             return []
2034
2035         manifest, urlh = res
2036         manifest_url = urlh.geturl()
2037
2038         return self._parse_f4m_formats(
2039             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2040             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2041
2042     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2043                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2044                            fatal=True, m3u8_id=None):
2045         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2046             return []
2047
2048         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2049         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2050         if akamai_pv is not None and ';' in akamai_pv.text:
2051             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2052             if playerVerificationChallenge.strip() != '':
2053                 return []
2054
2055         formats = []
2056         manifest_version = '1.0'
2057         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2058         if not media_nodes:
2059             manifest_version = '2.0'
2060             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2061         # Remove unsupported DRM protected media from final formats
2062         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2063         media_nodes = remove_encrypted_media(media_nodes)
2064         if not media_nodes:
2065             return formats
2066
2067         manifest_base_url = get_base_url(manifest)
2068
2069         bootstrap_info = xpath_element(
2070             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2071             'bootstrap info', default=None)
2072
2073         vcodec = None
2074         mime_type = xpath_text(
2075             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2076             'base URL', default=None)
2077         if mime_type and mime_type.startswith('audio/'):
2078             vcodec = 'none'
2079
2080         for i, media_el in enumerate(media_nodes):
2081             tbr = int_or_none(media_el.attrib.get('bitrate'))
2082             width = int_or_none(media_el.attrib.get('width'))
2083             height = int_or_none(media_el.attrib.get('height'))
2084             format_id = join_nonempty(f4m_id, tbr or i)
2085             # If <bootstrapInfo> is present, the specified f4m is a
2086             # stream-level manifest, and only set-level manifests may refer to
2087             # external resources.  See section 11.4 and section 4 of F4M spec
2088             if bootstrap_info is None:
2089                 media_url = None
2090                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2091                 if manifest_version == '2.0':
2092                     media_url = media_el.attrib.get('href')
2093                 if media_url is None:
2094                     media_url = media_el.attrib.get('url')
2095                 if not media_url:
2096                     continue
2097                 manifest_url = (
2098                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2099                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2100                 # If media_url is itself a f4m manifest do the recursive extraction
2101                 # since bitrates in parent manifest (this one) and media_url manifest
2102                 # may differ leading to inability to resolve the format by requested
2103                 # bitrate in f4m downloader
2104                 ext = determine_ext(manifest_url)
2105                 if ext == 'f4m':
2106                     f4m_formats = self._extract_f4m_formats(
2107                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2108                         transform_source=transform_source, fatal=fatal)
2109                     # Sometimes stream-level manifest contains single media entry that
2110                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2111                     # At the same time parent's media entry in set-level manifest may
2112                     # contain it. We will copy it from parent in such cases.
2113                     if len(f4m_formats) == 1:
2114                         f = f4m_formats[0]
2115                         f.update({
2116                             'tbr': f.get('tbr') or tbr,
2117                             'width': f.get('width') or width,
2118                             'height': f.get('height') or height,
2119                             'format_id': f.get('format_id') if not tbr else format_id,
2120                             'vcodec': vcodec,
2121                         })
2122                     formats.extend(f4m_formats)
2123                     continue
2124                 elif ext == 'm3u8':
2125                     formats.extend(self._extract_m3u8_formats(
2126                         manifest_url, video_id, 'mp4', preference=preference,
2127                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2128                     continue
2129             formats.append({
2130                 'format_id': format_id,
2131                 'url': manifest_url,
2132                 'manifest_url': manifest_url,
2133                 'ext': 'flv' if bootstrap_info is not None else None,
2134                 'protocol': 'f4m',
2135                 'tbr': tbr,
2136                 'width': width,
2137                 'height': height,
2138                 'vcodec': vcodec,
2139                 'preference': preference,
2140                 'quality': quality,
2141             })
2142         return formats
2143
2144     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2145         return {
2146             'format_id': join_nonempty(m3u8_id, 'meta'),
2147             'url': m3u8_url,
2148             'ext': ext,
2149             'protocol': 'm3u8',
2150             'preference': preference - 100 if preference else -100,
2151             'quality': quality,
2152             'resolution': 'multiple',
2153             'format_note': 'Quality selection URL',
2154         }
2155
2156     def _report_ignoring_subs(self, name):
2157         self.report_warning(bug_reports_message(
2158             f'Ignoring subtitle tracks found in the {name} manifest; '
2159             'if any subtitle tracks are missing,'
2160         ), only_once=True)
2161
2162     def _extract_m3u8_formats(self, *args, **kwargs):
2163         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2164         if subs:
2165             self._report_ignoring_subs('HLS')
2166         return fmts
2167
2168     def _extract_m3u8_formats_and_subtitles(
2169             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2170             preference=None, quality=None, m3u8_id=None, note=None,
2171             errnote=None, fatal=True, live=False, data=None, headers={},
2172             query={}):
2173
2174         res = self._download_webpage_handle(
2175             m3u8_url, video_id,
2176             note='Downloading m3u8 information' if note is None else note,
2177             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2178             fatal=fatal, data=data, headers=headers, query=query)
2179
2180         if res is False:
2181             return [], {}
2182
2183         m3u8_doc, urlh = res
2184         m3u8_url = urlh.geturl()
2185
2186         return self._parse_m3u8_formats_and_subtitles(
2187             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2188             preference=preference, quality=quality, m3u8_id=m3u8_id,
2189             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2190             headers=headers, query=query, video_id=video_id)
2191
2192     def _parse_m3u8_formats_and_subtitles(
2193             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2194             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2195             errnote=None, fatal=True, data=None, headers={}, query={},
2196             video_id=None):
2197         formats, subtitles = [], {}
2198
2199         has_drm = re.search('|'.join([
2200             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2201             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2202         ]), m3u8_doc)
2203
2204         def format_url(url):
2205             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2206
2207         if self.get_param('hls_split_discontinuity', False):
2208             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2209                 if not m3u8_doc:
2210                     if not manifest_url:
2211                         return []
2212                     m3u8_doc = self._download_webpage(
2213                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2214                         note=False, errnote='Failed to download m3u8 playlist information')
2215                     if m3u8_doc is False:
2216                         return []
2217                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2218
2219         else:
2220             def _extract_m3u8_playlist_indices(*args, **kwargs):
2221                 return [None]
2222
2223         # References:
2224         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2225         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2226         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2227
2228         # We should try extracting formats only from master playlists [1, 4.3.4],
2229         # i.e. playlists that describe available qualities. On the other hand
2230         # media playlists [1, 4.3.3] should be returned as is since they contain
2231         # just the media without qualities renditions.
2232         # Fortunately, master playlist can be easily distinguished from media
2233         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2234         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2235         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2236         # media playlist and MUST NOT appear in master playlist thus we can
2237         # clearly detect media playlist with this criterion.
2238
2239         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2240             formats = [{
2241                 'format_id': join_nonempty(m3u8_id, idx),
2242                 'format_index': idx,
2243                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2244                 'ext': ext,
2245                 'protocol': entry_protocol,
2246                 'preference': preference,
2247                 'quality': quality,
2248                 'has_drm': has_drm,
2249             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2250
2251             return formats, subtitles
2252
2253         groups = {}
2254         last_stream_inf = {}
2255
2256         def extract_media(x_media_line):
2257             media = parse_m3u8_attributes(x_media_line)
2258             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2259             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2260             if not (media_type and group_id and name):
2261                 return
2262             groups.setdefault(group_id, []).append(media)
2263             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2264             if media_type == 'SUBTITLES':
2265                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2266                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2267                 # However, lack of URI has been spotted in the wild.
2268                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2269                 if not media.get('URI'):
2270                     return
2271                 url = format_url(media['URI'])
2272                 sub_info = {
2273                     'url': url,
2274                     'ext': determine_ext(url),
2275                 }
2276                 if sub_info['ext'] == 'm3u8':
2277                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2278                     # files may contain is WebVTT:
2279                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2280                     sub_info['ext'] = 'vtt'
2281                     sub_info['protocol'] = 'm3u8_native'
2282                 lang = media.get('LANGUAGE') or 'und'
2283                 subtitles.setdefault(lang, []).append(sub_info)
2284             if media_type not in ('VIDEO', 'AUDIO'):
2285                 return
2286             media_url = media.get('URI')
2287             if media_url:
2288                 manifest_url = format_url(media_url)
2289                 formats.extend({
2290                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2291                     'format_note': name,
2292                     'format_index': idx,
2293                     'url': manifest_url,
2294                     'manifest_url': m3u8_url,
2295                     'language': media.get('LANGUAGE'),
2296                     'ext': ext,
2297                     'protocol': entry_protocol,
2298                     'preference': preference,
2299                     'quality': quality,
2300                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2301                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2302
2303         def build_stream_name():
2304             # Despite specification does not mention NAME attribute for
2305             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2306             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2307             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2308             stream_name = last_stream_inf.get('NAME')
2309             if stream_name:
2310                 return stream_name
2311             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2312             # from corresponding rendition group
2313             stream_group_id = last_stream_inf.get('VIDEO')
2314             if not stream_group_id:
2315                 return
2316             stream_group = groups.get(stream_group_id)
2317             if not stream_group:
2318                 return stream_group_id
2319             rendition = stream_group[0]
2320             return rendition.get('NAME') or stream_group_id
2321
2322         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2323         # chance to detect video only formats when EXT-X-STREAM-INF tags
2324         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2325         for line in m3u8_doc.splitlines():
2326             if line.startswith('#EXT-X-MEDIA:'):
2327                 extract_media(line)
2328
2329         for line in m3u8_doc.splitlines():
2330             if line.startswith('#EXT-X-STREAM-INF:'):
2331                 last_stream_inf = parse_m3u8_attributes(line)
2332             elif line.startswith('#') or not line.strip():
2333                 continue
2334             else:
2335                 tbr = float_or_none(
2336                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2337                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2338                 manifest_url = format_url(line.strip())
2339
2340                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2341                     format_id = [m3u8_id, None, idx]
2342                     # Bandwidth of live streams may differ over time thus making
2343                     # format_id unpredictable. So it's better to keep provided
2344                     # format_id intact.
2345                     if not live:
2346                         stream_name = build_stream_name()
2347                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2348                     f = {
2349                         'format_id': join_nonempty(*format_id),
2350                         'format_index': idx,
2351                         'url': manifest_url,
2352                         'manifest_url': m3u8_url,
2353                         'tbr': tbr,
2354                         'ext': ext,
2355                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2356                         'protocol': entry_protocol,
2357                         'preference': preference,
2358                         'quality': quality,
2359                     }
2360                     resolution = last_stream_inf.get('RESOLUTION')
2361                     if resolution:
2362                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2363                         if mobj:
2364                             f['width'] = int(mobj.group('width'))
2365                             f['height'] = int(mobj.group('height'))
2366                     # Unified Streaming Platform
2367                     mobj = re.search(
2368                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2369                     if mobj:
2370                         abr, vbr = mobj.groups()
2371                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2372                         f.update({
2373                             'vbr': vbr,
2374                             'abr': abr,
2375                         })
2376                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2377                     f.update(codecs)
2378                     audio_group_id = last_stream_inf.get('AUDIO')
2379                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2380                     # references a rendition group MUST have a CODECS attribute.
2381                     # However, this is not always respected. E.g. [2]
2382                     # contains EXT-X-STREAM-INF tag which references AUDIO
2383                     # rendition group but does not have CODECS and despite
2384                     # referencing an audio group it represents a complete
2385                     # (with audio and video) format. So, for such cases we will
2386                     # ignore references to rendition groups and treat them
2387                     # as complete formats.
2388                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2389                         audio_group = groups.get(audio_group_id)
2390                         if audio_group and audio_group[0].get('URI'):
2391                             # TODO: update acodec for audio only formats with
2392                             # the same GROUP-ID
2393                             f['acodec'] = 'none'
2394                     if not f.get('ext'):
2395                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2396                     formats.append(f)
2397
2398                     # for DailyMotion
2399                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2400                     if progressive_uri:
2401                         http_f = f.copy()
2402                         del http_f['manifest_url']
2403                         http_f.update({
2404                             'format_id': f['format_id'].replace('hls-', 'http-'),
2405                             'protocol': 'http',
2406                             'url': progressive_uri,
2407                         })
2408                         formats.append(http_f)
2409
2410                 last_stream_inf = {}
2411         return formats, subtitles
2412
2413     def _extract_m3u8_vod_duration(
2414             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2415
2416         m3u8_vod = self._download_webpage(
2417             m3u8_vod_url, video_id,
2418             note='Downloading m3u8 VOD manifest' if note is None else note,
2419             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2420             fatal=False, data=data, headers=headers, query=query)
2421
2422         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2423
2424     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2425         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2426             return None
2427
2428         return int(sum(
2429             float(line[len('#EXTINF:'):].split(',')[0])
2430             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2431
2432     @staticmethod
2433     def _xpath_ns(path, namespace=None):
2434         if not namespace:
2435             return path
2436         out = []
2437         for c in path.split('/'):
2438             if not c or c == '.':
2439                 out.append(c)
2440             else:
2441                 out.append('{%s}%s' % (namespace, c))
2442         return '/'.join(out)
2443
2444     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2445         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2446         if res is False:
2447             assert not fatal
2448             return [], {}
2449
2450         smil, urlh = res
2451         smil_url = urlh.geturl()
2452
2453         namespace = self._parse_smil_namespace(smil)
2454
2455         fmts = self._parse_smil_formats(
2456             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2457         subs = self._parse_smil_subtitles(
2458             smil, namespace=namespace)
2459
2460         return fmts, subs
2461
2462     def _extract_smil_formats(self, *args, **kwargs):
2463         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2464         if subs:
2465             self._report_ignoring_subs('SMIL')
2466         return fmts
2467
2468     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2469         res = self._download_smil(smil_url, video_id, fatal=fatal)
2470         if res is False:
2471             return {}
2472
2473         smil, urlh = res
2474         smil_url = urlh.geturl()
2475
2476         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2477
2478     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2479         return self._download_xml_handle(
2480             smil_url, video_id, 'Downloading SMIL file',
2481             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2482
2483     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2484         namespace = self._parse_smil_namespace(smil)
2485
2486         formats = self._parse_smil_formats(
2487             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2488         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2489
2490         video_id = os.path.splitext(url_basename(smil_url))[0]
2491         title = None
2492         description = None
2493         upload_date = None
2494         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2495             name = meta.attrib.get('name')
2496             content = meta.attrib.get('content')
2497             if not name or not content:
2498                 continue
2499             if not title and name == 'title':
2500                 title = content
2501             elif not description and name in ('description', 'abstract'):
2502                 description = content
2503             elif not upload_date and name == 'date':
2504                 upload_date = unified_strdate(content)
2505
2506         thumbnails = [{
2507             'id': image.get('type'),
2508             'url': image.get('src'),
2509             'width': int_or_none(image.get('width')),
2510             'height': int_or_none(image.get('height')),
2511         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2512
2513         return {
2514             'id': video_id,
2515             'title': title or video_id,
2516             'description': description,
2517             'upload_date': upload_date,
2518             'thumbnails': thumbnails,
2519             'formats': formats,
2520             'subtitles': subtitles,
2521         }
2522
2523     def _parse_smil_namespace(self, smil):
2524         return self._search_regex(
2525             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2526
2527     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2528         base = smil_url
2529         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2530             b = meta.get('base') or meta.get('httpBase')
2531             if b:
2532                 base = b
2533                 break
2534
2535         formats = []
2536         rtmp_count = 0
2537         http_count = 0
2538         m3u8_count = 0
2539         imgs_count = 0
2540
2541         srcs = set()
2542         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2543         for medium in media:
2544             src = medium.get('src')
2545             if not src or src in srcs:
2546                 continue
2547             srcs.add(src)
2548
2549             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2550             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2551             width = int_or_none(medium.get('width'))
2552             height = int_or_none(medium.get('height'))
2553             proto = medium.get('proto')
2554             ext = medium.get('ext')
2555             src_ext = determine_ext(src)
2556             streamer = medium.get('streamer') or base
2557
2558             if proto == 'rtmp' or streamer.startswith('rtmp'):
2559                 rtmp_count += 1
2560                 formats.append({
2561                     'url': streamer,
2562                     'play_path': src,
2563                     'ext': 'flv',
2564                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2565                     'tbr': bitrate,
2566                     'filesize': filesize,
2567                     'width': width,
2568                     'height': height,
2569                 })
2570                 if transform_rtmp_url:
2571                     streamer, src = transform_rtmp_url(streamer, src)
2572                     formats[-1].update({
2573                         'url': streamer,
2574                         'play_path': src,
2575                     })
2576                 continue
2577
2578             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2579             src_url = src_url.strip()
2580
2581             if proto == 'm3u8' or src_ext == 'm3u8':
2582                 m3u8_formats = self._extract_m3u8_formats(
2583                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2584                 if len(m3u8_formats) == 1:
2585                     m3u8_count += 1
2586                     m3u8_formats[0].update({
2587                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2588                         'tbr': bitrate,
2589                         'width': width,
2590                         'height': height,
2591                     })
2592                 formats.extend(m3u8_formats)
2593             elif src_ext == 'f4m':
2594                 f4m_url = src_url
2595                 if not f4m_params:
2596                     f4m_params = {
2597                         'hdcore': '3.2.0',
2598                         'plugin': 'flowplayer-3.2.0.1',
2599                     }
2600                 f4m_url += '&' if '?' in f4m_url else '?'
2601                 f4m_url += urllib.parse.urlencode(f4m_params)
2602                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2603             elif src_ext == 'mpd':
2604                 formats.extend(self._extract_mpd_formats(
2605                     src_url, video_id, mpd_id='dash', fatal=False))
2606             elif re.search(r'\.ism/[Mm]anifest', src_url):
2607                 formats.extend(self._extract_ism_formats(
2608                     src_url, video_id, ism_id='mss', fatal=False))
2609             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2610                 http_count += 1
2611                 formats.append({
2612                     'url': src_url,
2613                     'ext': ext or src_ext or 'flv',
2614                     'format_id': 'http-%d' % (bitrate or http_count),
2615                     'tbr': bitrate,
2616                     'filesize': filesize,
2617                     'width': width,
2618                     'height': height,
2619                 })
2620
2621         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2622             src = medium.get('src')
2623             if not src or src in srcs:
2624                 continue
2625             srcs.add(src)
2626
2627             imgs_count += 1
2628             formats.append({
2629                 'format_id': 'imagestream-%d' % (imgs_count),
2630                 'url': src,
2631                 'ext': mimetype2ext(medium.get('type')),
2632                 'acodec': 'none',
2633                 'vcodec': 'none',
2634                 'width': int_or_none(medium.get('width')),
2635                 'height': int_or_none(medium.get('height')),
2636                 'format_note': 'SMIL storyboards',
2637             })
2638
2639         return formats
2640
2641     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2642         urls = []
2643         subtitles = {}
2644         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2645             src = textstream.get('src')
2646             if not src or src in urls:
2647                 continue
2648             urls.append(src)
2649             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2650             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2651             subtitles.setdefault(lang, []).append({
2652                 'url': src,
2653                 'ext': ext,
2654             })
2655         return subtitles
2656
2657     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2658         res = self._download_xml_handle(
2659             xspf_url, playlist_id, 'Downloading xpsf playlist',
2660             'Unable to download xspf manifest', fatal=fatal)
2661         if res is False:
2662             return []
2663
2664         xspf, urlh = res
2665         xspf_url = urlh.geturl()
2666
2667         return self._parse_xspf(
2668             xspf, playlist_id, xspf_url=xspf_url,
2669             xspf_base_url=base_url(xspf_url))
2670
2671     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2672         NS_MAP = {
2673             'xspf': 'http://xspf.org/ns/0/',
2674             's1': 'http://static.streamone.nl/player/ns/0',
2675         }
2676
2677         entries = []
2678         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2679             title = xpath_text(
2680                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2681             description = xpath_text(
2682                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2683             thumbnail = xpath_text(
2684                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2685             duration = float_or_none(
2686                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2687
2688             formats = []
2689             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2690                 format_url = urljoin(xspf_base_url, location.text)
2691                 if not format_url:
2692                     continue
2693                 formats.append({
2694                     'url': format_url,
2695                     'manifest_url': xspf_url,
2696                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2697                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2698                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2699                 })
2700             self._sort_formats(formats)
2701
2702             entries.append({
2703                 'id': playlist_id,
2704                 'title': title,
2705                 'description': description,
2706                 'thumbnail': thumbnail,
2707                 'duration': duration,
2708                 'formats': formats,
2709             })
2710         return entries
2711
2712     def _extract_mpd_formats(self, *args, **kwargs):
2713         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2714         if subs:
2715             self._report_ignoring_subs('DASH')
2716         return fmts
2717
2718     def _extract_mpd_formats_and_subtitles(
2719             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2720             fatal=True, data=None, headers={}, query={}):
2721         res = self._download_xml_handle(
2722             mpd_url, video_id,
2723             note='Downloading MPD manifest' if note is None else note,
2724             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2725             fatal=fatal, data=data, headers=headers, query=query)
2726         if res is False:
2727             return [], {}
2728         mpd_doc, urlh = res
2729         if mpd_doc is None:
2730             return [], {}
2731
2732         # We could have been redirected to a new url when we retrieved our mpd file.
2733         mpd_url = urlh.geturl()
2734         mpd_base_url = base_url(mpd_url)
2735
2736         return self._parse_mpd_formats_and_subtitles(
2737             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2738
2739     def _parse_mpd_formats(self, *args, **kwargs):
2740         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2741         if subs:
2742             self._report_ignoring_subs('DASH')
2743         return fmts
2744
2745     def _parse_mpd_formats_and_subtitles(
2746             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2747         """
2748         Parse formats from MPD manifest.
2749         References:
2750          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2751             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2752          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2753         """
2754         if not self.get_param('dynamic_mpd', True):
2755             if mpd_doc.get('type') == 'dynamic':
2756                 return [], {}
2757
2758         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2759
2760         def _add_ns(path):
2761             return self._xpath_ns(path, namespace)
2762
2763         def is_drm_protected(element):
2764             return element.find(_add_ns('ContentProtection')) is not None
2765
2766         def extract_multisegment_info(element, ms_parent_info):
2767             ms_info = ms_parent_info.copy()
2768
2769             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2770             # common attributes and elements.  We will only extract relevant
2771             # for us.
2772             def extract_common(source):
2773                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2774                 if segment_timeline is not None:
2775                     s_e = segment_timeline.findall(_add_ns('S'))
2776                     if s_e:
2777                         ms_info['total_number'] = 0
2778                         ms_info['s'] = []
2779                         for s in s_e:
2780                             r = int(s.get('r', 0))
2781                             ms_info['total_number'] += 1 + r
2782                             ms_info['s'].append({
2783                                 't': int(s.get('t', 0)),
2784                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2785                                 'd': int(s.attrib['d']),
2786                                 'r': r,
2787                             })
2788                 start_number = source.get('startNumber')
2789                 if start_number:
2790                     ms_info['start_number'] = int(start_number)
2791                 timescale = source.get('timescale')
2792                 if timescale:
2793                     ms_info['timescale'] = int(timescale)
2794                 segment_duration = source.get('duration')
2795                 if segment_duration:
2796                     ms_info['segment_duration'] = float(segment_duration)
2797
2798             def extract_Initialization(source):
2799                 initialization = source.find(_add_ns('Initialization'))
2800                 if initialization is not None:
2801                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2802
2803             segment_list = element.find(_add_ns('SegmentList'))
2804             if segment_list is not None:
2805                 extract_common(segment_list)
2806                 extract_Initialization(segment_list)
2807                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2808                 if segment_urls_e:
2809                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2810             else:
2811                 segment_template = element.find(_add_ns('SegmentTemplate'))
2812                 if segment_template is not None:
2813                     extract_common(segment_template)
2814                     media = segment_template.get('media')
2815                     if media:
2816                         ms_info['media'] = media
2817                     initialization = segment_template.get('initialization')
2818                     if initialization:
2819                         ms_info['initialization'] = initialization
2820                     else:
2821                         extract_Initialization(segment_template)
2822             return ms_info
2823
2824         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2825         formats, subtitles = [], {}
2826         stream_numbers = collections.defaultdict(int)
2827         for period in mpd_doc.findall(_add_ns('Period')):
2828             period_duration = parse_duration(period.get('duration')) or mpd_duration
2829             period_ms_info = extract_multisegment_info(period, {
2830                 'start_number': 1,
2831                 'timescale': 1,
2832             })
2833             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2834                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2835                 for representation in adaptation_set.findall(_add_ns('Representation')):
2836                     representation_attrib = adaptation_set.attrib.copy()
2837                     representation_attrib.update(representation.attrib)
2838                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2839                     mime_type = representation_attrib['mimeType']
2840                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2841
2842                     codec_str = representation_attrib.get('codecs', '')
2843                     # Some kind of binary subtitle found in some youtube livestreams
2844                     if mime_type == 'application/x-rawcc':
2845                         codecs = {'scodec': codec_str}
2846                     else:
2847                         codecs = parse_codecs(codec_str)
2848                     if content_type not in ('video', 'audio', 'text'):
2849                         if mime_type == 'image/jpeg':
2850                             content_type = mime_type
2851                         elif codecs.get('vcodec', 'none') != 'none':
2852                             content_type = 'video'
2853                         elif codecs.get('acodec', 'none') != 'none':
2854                             content_type = 'audio'
2855                         elif codecs.get('scodec', 'none') != 'none':
2856                             content_type = 'text'
2857                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2858                             content_type = 'text'
2859                         else:
2860                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2861                             continue
2862
2863                     base_url = ''
2864                     for element in (representation, adaptation_set, period, mpd_doc):
2865                         base_url_e = element.find(_add_ns('BaseURL'))
2866                         if try_call(lambda: base_url_e.text) is not None:
2867                             base_url = base_url_e.text + base_url
2868                             if re.match(r'^https?://', base_url):
2869                                 break
2870                     if mpd_base_url and base_url.startswith('/'):
2871                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2872                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2873                         if not mpd_base_url.endswith('/'):
2874                             mpd_base_url += '/'
2875                         base_url = mpd_base_url + base_url
2876                     representation_id = representation_attrib.get('id')
2877                     lang = representation_attrib.get('lang')
2878                     url_el = representation.find(_add_ns('BaseURL'))
2879                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2880                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2881                     if representation_id is not None:
2882                         format_id = representation_id
2883                     else:
2884                         format_id = content_type
2885                     if mpd_id:
2886                         format_id = mpd_id + '-' + format_id
2887                     if content_type in ('video', 'audio'):
2888                         f = {
2889                             'format_id': format_id,
2890                             'manifest_url': mpd_url,
2891                             'ext': mimetype2ext(mime_type),
2892                             'width': int_or_none(representation_attrib.get('width')),
2893                             'height': int_or_none(representation_attrib.get('height')),
2894                             'tbr': float_or_none(bandwidth, 1000),
2895                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2896                             'fps': int_or_none(representation_attrib.get('frameRate')),
2897                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2898                             'format_note': 'DASH %s' % content_type,
2899                             'filesize': filesize,
2900                             'container': mimetype2ext(mime_type) + '_dash',
2901                             **codecs
2902                         }
2903                     elif content_type == 'text':
2904                         f = {
2905                             'ext': mimetype2ext(mime_type),
2906                             'manifest_url': mpd_url,
2907                             'filesize': filesize,
2908                         }
2909                     elif content_type == 'image/jpeg':
2910                         # See test case in VikiIE
2911                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2912                         f = {
2913                             'format_id': format_id,
2914                             'ext': 'mhtml',
2915                             'manifest_url': mpd_url,
2916                             'format_note': 'DASH storyboards (jpeg)',
2917                             'acodec': 'none',
2918                             'vcodec': 'none',
2919                         }
2920                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2921                         f['has_drm'] = True
2922                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2923
2924                     def prepare_template(template_name, identifiers):
2925                         tmpl = representation_ms_info[template_name]
2926                         if representation_id is not None:
2927                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2928                         # First of, % characters outside $...$ templates
2929                         # must be escaped by doubling for proper processing
2930                         # by % operator string formatting used further (see
2931                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2932                         t = ''
2933                         in_template = False
2934                         for c in tmpl:
2935                             t += c
2936                             if c == '$':
2937                                 in_template = not in_template
2938                             elif c == '%' and not in_template:
2939                                 t += c
2940                         # Next, $...$ templates are translated to their
2941                         # %(...) counterparts to be used with % operator
2942                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2943                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2944                         t.replace('$$', '$')
2945                         return t
2946
2947                     # @initialization is a regular template like @media one
2948                     # so it should be handled just the same way (see
2949                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2950                     if 'initialization' in representation_ms_info:
2951                         initialization_template = prepare_template(
2952                             'initialization',
2953                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2954                             # $Time$ shall not be included for @initialization thus
2955                             # only $Bandwidth$ remains
2956                             ('Bandwidth', ))
2957                         representation_ms_info['initialization_url'] = initialization_template % {
2958                             'Bandwidth': bandwidth,
2959                         }
2960
2961                     def location_key(location):
2962                         return 'url' if re.match(r'^https?://', location) else 'path'
2963
2964                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2965
2966                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2967                         media_location_key = location_key(media_template)
2968
2969                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2970                         # can't be used at the same time
2971                         if '%(Number' in media_template and 's' not in representation_ms_info:
2972                             segment_duration = None
2973                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2974                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2975                                 representation_ms_info['total_number'] = int(math.ceil(
2976                                     float_or_none(period_duration, segment_duration, default=0)))
2977                             representation_ms_info['fragments'] = [{
2978                                 media_location_key: media_template % {
2979                                     'Number': segment_number,
2980                                     'Bandwidth': bandwidth,
2981                                 },
2982                                 'duration': segment_duration,
2983                             } for segment_number in range(
2984                                 representation_ms_info['start_number'],
2985                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2986                         else:
2987                             # $Number*$ or $Time$ in media template with S list available
2988                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2989                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2990                             representation_ms_info['fragments'] = []
2991                             segment_time = 0
2992                             segment_d = None
2993                             segment_number = representation_ms_info['start_number']
2994
2995                             def add_segment_url():
2996                                 segment_url = media_template % {
2997                                     'Time': segment_time,
2998                                     'Bandwidth': bandwidth,
2999                                     'Number': segment_number,
3000                                 }
3001                                 representation_ms_info['fragments'].append({
3002                                     media_location_key: segment_url,
3003                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
3004                                 })
3005
3006                             for num, s in enumerate(representation_ms_info['s']):
3007                                 segment_time = s.get('t') or segment_time
3008                                 segment_d = s['d']
3009                                 add_segment_url()
3010                                 segment_number += 1
3011                                 for r in range(s.get('r', 0)):
3012                                     segment_time += segment_d
3013                                     add_segment_url()
3014                                     segment_number += 1
3015                                 segment_time += segment_d
3016                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3017                         # No media template,
3018                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3019                         # or any YouTube dashsegments video
3020                         fragments = []
3021                         segment_index = 0
3022                         timescale = representation_ms_info['timescale']
3023                         for s in representation_ms_info['s']:
3024                             duration = float_or_none(s['d'], timescale)
3025                             for r in range(s.get('r', 0) + 1):
3026                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3027                                 fragments.append({
3028                                     location_key(segment_uri): segment_uri,
3029                                     'duration': duration,
3030                                 })
3031                                 segment_index += 1
3032                         representation_ms_info['fragments'] = fragments
3033                     elif 'segment_urls' in representation_ms_info:
3034                         # Segment URLs with no SegmentTimeline
3035                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3036                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3037                         fragments = []
3038                         segment_duration = float_or_none(
3039                             representation_ms_info['segment_duration'],
3040                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3041                         for segment_url in representation_ms_info['segment_urls']:
3042                             fragment = {
3043                                 location_key(segment_url): segment_url,
3044                             }
3045                             if segment_duration:
3046                                 fragment['duration'] = segment_duration
3047                             fragments.append(fragment)
3048                         representation_ms_info['fragments'] = fragments
3049                     # If there is a fragments key available then we correctly recognized fragmented media.
3050                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3051                     # assumption is not necessarily correct since we may simply have no support for
3052                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3053                     if 'fragments' in representation_ms_info:
3054                         f.update({
3055                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3056                             'url': mpd_url or base_url,
3057                             'fragment_base_url': base_url,
3058                             'fragments': [],
3059                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3060                         })
3061                         if 'initialization_url' in representation_ms_info:
3062                             initialization_url = representation_ms_info['initialization_url']
3063                             if not f.get('url'):
3064                                 f['url'] = initialization_url
3065                             f['fragments'].append({location_key(initialization_url): initialization_url})
3066                         f['fragments'].extend(representation_ms_info['fragments'])
3067                         if not period_duration:
3068                             period_duration = try_get(
3069                                 representation_ms_info,
3070                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3071                     else:
3072                         # Assuming direct URL to unfragmented media.
3073                         f['url'] = base_url
3074                     if content_type in ('video', 'audio', 'image/jpeg'):
3075                         f['manifest_stream_number'] = stream_numbers[f['url']]
3076                         stream_numbers[f['url']] += 1
3077                         formats.append(f)
3078                     elif content_type == 'text':
3079                         subtitles.setdefault(lang or 'und', []).append(f)
3080
3081         return formats, subtitles
3082
3083     def _extract_ism_formats(self, *args, **kwargs):
3084         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3085         if subs:
3086             self._report_ignoring_subs('ISM')
3087         return fmts
3088
3089     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3090         res = self._download_xml_handle(
3091             ism_url, video_id,
3092             note='Downloading ISM manifest' if note is None else note,
3093             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3094             fatal=fatal, data=data, headers=headers, query=query)
3095         if res is False:
3096             return [], {}
3097         ism_doc, urlh = res
3098         if ism_doc is None:
3099             return [], {}
3100
3101         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3102
3103     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3104         """
3105         Parse formats from ISM manifest.
3106         References:
3107          1. [MS-SSTR]: Smooth Streaming Protocol,
3108             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3109         """
3110         if ism_doc.get('IsLive') == 'TRUE':
3111             return [], {}
3112
3113         duration = int(ism_doc.attrib['Duration'])
3114         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3115
3116         formats = []
3117         subtitles = {}
3118         for stream in ism_doc.findall('StreamIndex'):
3119             stream_type = stream.get('Type')
3120             if stream_type not in ('video', 'audio', 'text'):
3121                 continue
3122             url_pattern = stream.attrib['Url']
3123             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3124             stream_name = stream.get('Name')
3125             stream_language = stream.get('Language', 'und')
3126             for track in stream.findall('QualityLevel'):
3127                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3128                 # TODO: add support for WVC1 and WMAP
3129                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3130                     self.report_warning('%s is not a supported codec' % fourcc)
3131                     continue
3132                 tbr = int(track.attrib['Bitrate']) // 1000
3133                 # [1] does not mention Width and Height attributes. However,
3134                 # they're often present while MaxWidth and MaxHeight are
3135                 # missing, so should be used as fallbacks
3136                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3137                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3138                 sampling_rate = int_or_none(track.get('SamplingRate'))
3139
3140                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3141                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3142
3143                 fragments = []
3144                 fragment_ctx = {
3145                     'time': 0,
3146                 }
3147                 stream_fragments = stream.findall('c')
3148                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3149                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3150                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3151                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3152                     if not fragment_ctx['duration']:
3153                         try:
3154                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3155                         except IndexError:
3156                             next_fragment_time = duration
3157                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3158                     for _ in range(fragment_repeat):
3159                         fragments.append({
3160                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3161                             'duration': fragment_ctx['duration'] / stream_timescale,
3162                         })
3163                         fragment_ctx['time'] += fragment_ctx['duration']
3164
3165                 if stream_type == 'text':
3166                     subtitles.setdefault(stream_language, []).append({
3167                         'ext': 'ismt',
3168                         'protocol': 'ism',
3169                         'url': ism_url,
3170                         'manifest_url': ism_url,
3171                         'fragments': fragments,
3172                         '_download_params': {
3173                             'stream_type': stream_type,
3174                             'duration': duration,
3175                             'timescale': stream_timescale,
3176                             'fourcc': fourcc,
3177                             'language': stream_language,
3178                             'codec_private_data': track.get('CodecPrivateData'),
3179                         }
3180                     })
3181                 elif stream_type in ('video', 'audio'):
3182                     formats.append({
3183                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3184                         'url': ism_url,
3185                         'manifest_url': ism_url,
3186                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3187                         'width': width,
3188                         'height': height,
3189                         'tbr': tbr,
3190                         'asr': sampling_rate,
3191                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3192                         'acodec': 'none' if stream_type == 'video' else fourcc,
3193                         'protocol': 'ism',
3194                         'fragments': fragments,
3195                         'has_drm': ism_doc.find('Protection') is not None,
3196                         '_download_params': {
3197                             'stream_type': stream_type,
3198                             'duration': duration,
3199                             'timescale': stream_timescale,
3200                             'width': width or 0,
3201                             'height': height or 0,
3202                             'fourcc': fourcc,
3203                             'language': stream_language,
3204                             'codec_private_data': track.get('CodecPrivateData'),
3205                             'sampling_rate': sampling_rate,
3206                             'channels': int_or_none(track.get('Channels', 2)),
3207                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3208                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3209                         },
3210                     })
3211         return formats, subtitles
3212
3213     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3214         def absolute_url(item_url):
3215             return urljoin(base_url, item_url)
3216
3217         def parse_content_type(content_type):
3218             if not content_type:
3219                 return {}
3220             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3221             if ctr:
3222                 mimetype, codecs = ctr.groups()
3223                 f = parse_codecs(codecs)
3224                 f['ext'] = mimetype2ext(mimetype)
3225                 return f
3226             return {}
3227
3228         def _media_formats(src, cur_media_type, type_info=None):
3229             type_info = type_info or {}
3230             full_url = absolute_url(src)
3231             ext = type_info.get('ext') or determine_ext(full_url)
3232             if ext == 'm3u8':
3233                 is_plain_url = False
3234                 formats = self._extract_m3u8_formats(
3235                     full_url, video_id, ext='mp4',
3236                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3237                     preference=preference, quality=quality, fatal=False)
3238             elif ext == 'mpd':
3239                 is_plain_url = False
3240                 formats = self._extract_mpd_formats(
3241                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3242             else:
3243                 is_plain_url = True
3244                 formats = [{
3245                     'url': full_url,
3246                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3247                     'ext': ext,
3248                 }]
3249             return is_plain_url, formats
3250
3251         entries = []
3252         # amp-video and amp-audio are very similar to their HTML5 counterparts
3253         # so we will include them right here (see
3254         # https://www.ampproject.org/docs/reference/components/amp-video)
3255         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3256         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3257         media_tags = [(media_tag, media_tag_name, media_type, '')
3258                       for media_tag, media_tag_name, media_type
3259                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3260         media_tags.extend(re.findall(
3261             # We only allow video|audio followed by a whitespace or '>'.
3262             # Allowing more characters may end up in significant slow down (see
3263             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3264             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3265             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3266         for media_tag, _, media_type, media_content in media_tags:
3267             media_info = {
3268                 'formats': [],
3269                 'subtitles': {},
3270             }
3271             media_attributes = extract_attributes(media_tag)
3272             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3273             if src:
3274                 f = parse_content_type(media_attributes.get('type'))
3275                 _, formats = _media_formats(src, media_type, f)
3276                 media_info['formats'].extend(formats)
3277             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3278             if media_content:
3279                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3280                     s_attr = extract_attributes(source_tag)
3281                     # data-video-src and data-src are non standard but seen
3282                     # several times in the wild
3283                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3284                     if not src:
3285                         continue
3286                     f = parse_content_type(s_attr.get('type'))
3287                     is_plain_url, formats = _media_formats(src, media_type, f)
3288                     if is_plain_url:
3289                         # width, height, res, label and title attributes are
3290                         # all not standard but seen several times in the wild
3291                         labels = [
3292                             s_attr.get(lbl)
3293                             for lbl in ('label', 'title')
3294                             if str_or_none(s_attr.get(lbl))
3295                         ]
3296                         width = int_or_none(s_attr.get('width'))
3297                         height = (int_or_none(s_attr.get('height'))
3298                                   or int_or_none(s_attr.get('res')))
3299                         if not width or not height:
3300                             for lbl in labels:
3301                                 resolution = parse_resolution(lbl)
3302                                 if not resolution:
3303                                     continue
3304                                 width = width or resolution.get('width')
3305                                 height = height or resolution.get('height')
3306                         for lbl in labels:
3307                             tbr = parse_bitrate(lbl)
3308                             if tbr:
3309                                 break
3310                         else:
3311                             tbr = None
3312                         f.update({
3313                             'width': width,
3314                             'height': height,
3315                             'tbr': tbr,
3316                             'format_id': s_attr.get('label') or s_attr.get('title'),
3317                         })
3318                         f.update(formats[0])
3319                         media_info['formats'].append(f)
3320                     else:
3321                         media_info['formats'].extend(formats)
3322                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3323                     track_attributes = extract_attributes(track_tag)
3324                     kind = track_attributes.get('kind')
3325                     if not kind or kind in ('subtitles', 'captions'):
3326                         src = strip_or_none(track_attributes.get('src'))
3327                         if not src:
3328                             continue
3329                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3330                         media_info['subtitles'].setdefault(lang, []).append({
3331                             'url': absolute_url(src),
3332                         })
3333             for f in media_info['formats']:
3334                 f.setdefault('http_headers', {})['Referer'] = base_url
3335             if media_info['formats'] or media_info['subtitles']:
3336                 entries.append(media_info)
3337         return entries
3338
3339     def _extract_akamai_formats(self, *args, **kwargs):
3340         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3341         if subs:
3342             self._report_ignoring_subs('akamai')
3343         return fmts
3344
3345     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3346         signed = 'hdnea=' in manifest_url
3347         if not signed:
3348             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3349             manifest_url = re.sub(
3350                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3351                 '', manifest_url).strip('?')
3352
3353         formats = []
3354         subtitles = {}
3355
3356         hdcore_sign = 'hdcore=3.7.0'
3357         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3358         hds_host = hosts.get('hds')
3359         if hds_host:
3360             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3361         if 'hdcore=' not in f4m_url:
3362             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3363         f4m_formats = self._extract_f4m_formats(
3364             f4m_url, video_id, f4m_id='hds', fatal=False)
3365         for entry in f4m_formats:
3366             entry.update({'extra_param_to_segment_url': hdcore_sign})
3367         formats.extend(f4m_formats)
3368
3369         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3370         hls_host = hosts.get('hls')
3371         if hls_host:
3372             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3373         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3374             m3u8_url, video_id, 'mp4', 'm3u8_native',
3375             m3u8_id='hls', fatal=False)
3376         formats.extend(m3u8_formats)
3377         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3378
3379         http_host = hosts.get('http')
3380         if http_host and m3u8_formats and not signed:
3381             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3382             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3383             qualities_length = len(qualities)
3384             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3385                 i = 0
3386                 for f in m3u8_formats:
3387                     if f['vcodec'] != 'none':
3388                         for protocol in ('http', 'https'):
3389                             http_f = f.copy()
3390                             del http_f['manifest_url']
3391                             http_url = re.sub(
3392                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3393                             http_f.update({
3394                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3395                                 'url': http_url,
3396                                 'protocol': protocol,
3397                             })
3398                             formats.append(http_f)
3399                         i += 1
3400
3401         return formats, subtitles
3402
3403     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3404         query = urllib.parse.urlparse(url).query
3405         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3406         mobj = re.search(
3407             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3408         url_base = mobj.group('url')
3409         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3410         formats = []
3411
3412         def manifest_url(manifest):
3413             m_url = f'{http_base_url}/{manifest}'
3414             if query:
3415                 m_url += '?%s' % query
3416             return m_url
3417
3418         if 'm3u8' not in skip_protocols:
3419             formats.extend(self._extract_m3u8_formats(
3420                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3421                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3422         if 'f4m' not in skip_protocols:
3423             formats.extend(self._extract_f4m_formats(
3424                 manifest_url('manifest.f4m'),
3425                 video_id, f4m_id='hds', fatal=False))
3426         if 'dash' not in skip_protocols:
3427             formats.extend(self._extract_mpd_formats(
3428                 manifest_url('manifest.mpd'),
3429                 video_id, mpd_id='dash', fatal=False))
3430         if re.search(r'(?:/smil:|\.smil)', url_base):
3431             if 'smil' not in skip_protocols:
3432                 rtmp_formats = self._extract_smil_formats(
3433                     manifest_url('jwplayer.smil'),
3434                     video_id, fatal=False)
3435                 for rtmp_format in rtmp_formats:
3436                     rtsp_format = rtmp_format.copy()
3437                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3438                     del rtsp_format['play_path']
3439                     del rtsp_format['ext']
3440                     rtsp_format.update({
3441                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3442                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3443                         'protocol': 'rtsp',
3444                     })
3445                     formats.extend([rtmp_format, rtsp_format])
3446         else:
3447             for protocol in ('rtmp', 'rtsp'):
3448                 if protocol not in skip_protocols:
3449                     formats.append({
3450                         'url': f'{protocol}:{url_base}',
3451                         'format_id': protocol,
3452                         'protocol': protocol,
3453                     })
3454         return formats
3455
3456     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3457         mobj = re.search(
3458             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3459             webpage)
3460         if mobj:
3461             try:
3462                 jwplayer_data = self._parse_json(mobj.group('options'),
3463                                                  video_id=video_id,
3464                                                  transform_source=transform_source)
3465             except ExtractorError:
3466                 pass
3467             else:
3468                 if isinstance(jwplayer_data, dict):
3469                     return jwplayer_data
3470
3471     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3472         jwplayer_data = self._find_jwplayer_data(
3473             webpage, video_id, transform_source=js_to_json)
3474         return self._parse_jwplayer_data(
3475             jwplayer_data, video_id, *args, **kwargs)
3476
3477     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3478                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3479         # JWPlayer backward compatibility: flattened playlists
3480         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3481         if 'playlist' not in jwplayer_data:
3482             jwplayer_data = {'playlist': [jwplayer_data]}
3483
3484         entries = []
3485
3486         # JWPlayer backward compatibility: single playlist item
3487         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3488         if not isinstance(jwplayer_data['playlist'], list):
3489             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3490
3491         for video_data in jwplayer_data['playlist']:
3492             # JWPlayer backward compatibility: flattened sources
3493             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3494             if 'sources' not in video_data:
3495                 video_data['sources'] = [video_data]
3496
3497             this_video_id = video_id or video_data['mediaid']
3498
3499             formats = self._parse_jwplayer_formats(
3500                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3501                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3502
3503             subtitles = {}
3504             tracks = video_data.get('tracks')
3505             if tracks and isinstance(tracks, list):
3506                 for track in tracks:
3507                     if not isinstance(track, dict):
3508                         continue
3509                     track_kind = track.get('kind')
3510                     if not track_kind or not isinstance(track_kind, str):
3511                         continue
3512                     if track_kind.lower() not in ('captions', 'subtitles'):
3513                         continue
3514                     track_url = urljoin(base_url, track.get('file'))
3515                     if not track_url:
3516                         continue
3517                     subtitles.setdefault(track.get('label') or 'en', []).append({
3518                         'url': self._proto_relative_url(track_url)
3519                     })
3520
3521             entry = {
3522                 'id': this_video_id,
3523                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3524                 'description': clean_html(video_data.get('description')),
3525                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3526                 'timestamp': int_or_none(video_data.get('pubdate')),
3527                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3528                 'subtitles': subtitles,
3529             }
3530             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3531             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3532                 entry.update({
3533                     '_type': 'url_transparent',
3534                     'url': formats[0]['url'],
3535                 })
3536             else:
3537                 self._sort_formats(formats)
3538                 entry['formats'] = formats
3539             entries.append(entry)
3540         if len(entries) == 1:
3541             return entries[0]
3542         else:
3543             return self.playlist_result(entries)
3544
3545     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3546                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3547         urls = []
3548         formats = []
3549         for source in jwplayer_sources_data:
3550             if not isinstance(source, dict):
3551                 continue
3552             source_url = urljoin(
3553                 base_url, self._proto_relative_url(source.get('file')))
3554             if not source_url or source_url in urls:
3555                 continue
3556             urls.append(source_url)
3557             source_type = source.get('type') or ''
3558             ext = mimetype2ext(source_type) or determine_ext(source_url)
3559             if source_type == 'hls' or ext == 'm3u8':
3560                 formats.extend(self._extract_m3u8_formats(
3561                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3562                     m3u8_id=m3u8_id, fatal=False))
3563             elif source_type == 'dash' or ext == 'mpd':
3564                 formats.extend(self._extract_mpd_formats(
3565                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3566             elif ext == 'smil':
3567                 formats.extend(self._extract_smil_formats(
3568                     source_url, video_id, fatal=False))
3569             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3570             elif source_type.startswith('audio') or ext in (
3571                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3572                 formats.append({
3573                     'url': source_url,
3574                     'vcodec': 'none',
3575                     'ext': ext,
3576                 })
3577             else:
3578                 height = int_or_none(source.get('height'))
3579                 if height is None:
3580                     # Often no height is provided but there is a label in
3581                     # format like "1080p", "720p SD", or 1080.
3582                     height = int_or_none(self._search_regex(
3583                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3584                         'height', default=None))
3585                 a_format = {
3586                     'url': source_url,
3587                     'width': int_or_none(source.get('width')),
3588                     'height': height,
3589                     'tbr': int_or_none(source.get('bitrate')),
3590                     'ext': ext,
3591                 }
3592                 if source_url.startswith('rtmp'):
3593                     a_format['ext'] = 'flv'
3594                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3595                     # of jwplayer.flash.swf
3596                     rtmp_url_parts = re.split(
3597                         r'((?:mp4|mp3|flv):)', source_url, 1)
3598                     if len(rtmp_url_parts) == 3:
3599                         rtmp_url, prefix, play_path = rtmp_url_parts
3600                         a_format.update({
3601                             'url': rtmp_url,
3602                             'play_path': prefix + play_path,
3603                         })
3604                     if rtmp_params:
3605                         a_format.update(rtmp_params)
3606                 formats.append(a_format)
3607         return formats
3608
3609     def _live_title(self, name):
3610         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3611         return name
3612
3613     def _int(self, v, name, fatal=False, **kwargs):
3614         res = int_or_none(v, **kwargs)
3615         if res is None:
3616             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3617             if fatal:
3618                 raise ExtractorError(msg)
3619             else:
3620                 self.report_warning(msg)
3621         return res
3622
3623     def _float(self, v, name, fatal=False, **kwargs):
3624         res = float_or_none(v, **kwargs)
3625         if res is None:
3626             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3627             if fatal:
3628                 raise ExtractorError(msg)
3629             else:
3630                 self.report_warning(msg)
3631         return res
3632
3633     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3634                     path='/', secure=False, discard=False, rest={}, **kwargs):
3635         cookie = http.cookiejar.Cookie(
3636             0, name, value, port, port is not None, domain, True,
3637             domain.startswith('.'), path, True, secure, expire_time,
3638             discard, None, None, rest)
3639         self.cookiejar.set_cookie(cookie)
3640
3641     def _get_cookies(self, url):
3642         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3643         return LenientSimpleCookie(self._downloader._calc_cookies(url))
3644
3645     def _apply_first_set_cookie_header(self, url_handle, cookie):
3646         """
3647         Apply first Set-Cookie header instead of the last. Experimental.
3648
3649         Some sites (e.g. [1-3]) may serve two cookies under the same name
3650         in Set-Cookie header and expect the first (old) one to be set rather
3651         than second (new). However, as of RFC6265 the newer one cookie
3652         should be set into cookie store what actually happens.
3653         We will workaround this issue by resetting the cookie to
3654         the first one manually.
3655         1. https://new.vk.com/
3656         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3657         3. https://learning.oreilly.com/
3658         """
3659         for header, cookies in url_handle.headers.items():
3660             if header.lower() != 'set-cookie':
3661                 continue
3662             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3663             cookie_value = re.search(
3664                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3665             if cookie_value:
3666                 value, domain = cookie_value.groups()
3667                 self._set_cookie(domain, cookie, value)
3668                 break
3669
3670     @classmethod
3671     def get_testcases(cls, include_onlymatching=False):
3672         t = getattr(cls, '_TEST', None)
3673         if t:
3674             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3675             tests = [t]
3676         else:
3677             tests = getattr(cls, '_TESTS', [])
3678         for t in tests:
3679             if not include_onlymatching and t.get('only_matching', False):
3680                 continue
3681             t['name'] = cls.ie_key()
3682             yield t
3683
3684     @classmethod
3685     def get_webpage_testcases(cls):
3686         tests = getattr(cls, '_WEBPAGE_TESTS', [])
3687         for t in tests:
3688             t['name'] = cls.ie_key()
3689         return tests
3690
3691     @classproperty
3692     def age_limit(cls):
3693         """Get age limit from the testcases"""
3694         return max(traverse_obj(
3695             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3696             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3697
3698     @classmethod
3699     def is_suitable(cls, age_limit):
3700         """Test whether the extractor is generally suitable for the given age limit"""
3701         return not age_restricted(cls.age_limit, age_limit)
3702
3703     @classmethod
3704     def description(cls, *, markdown=True, search_examples=None):
3705         """Description of the extractor"""
3706         desc = ''
3707         if cls._NETRC_MACHINE:
3708             if markdown:
3709                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3710             else:
3711                 desc += f' [{cls._NETRC_MACHINE}]'
3712         if cls.IE_DESC is False:
3713             desc += ' [HIDDEN]'
3714         elif cls.IE_DESC:
3715             desc += f' {cls.IE_DESC}'
3716         if cls.SEARCH_KEY:
3717             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3718             if search_examples:
3719                 _COUNTS = ('', '5', '10', 'all')
3720                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3721         if not cls.working():
3722             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3723
3724         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3725         return f'{name}:{desc}' if desc else name
3726
3727     def extract_subtitles(self, *args, **kwargs):
3728         if (self.get_param('writesubtitles', False)
3729                 or self.get_param('listsubtitles')):
3730             return self._get_subtitles(*args, **kwargs)
3731         return {}
3732
3733     def _get_subtitles(self, *args, **kwargs):
3734         raise NotImplementedError('This method must be implemented by subclasses')
3735
3736     def extract_comments(self, *args, **kwargs):
3737         if not self.get_param('getcomments'):
3738             return None
3739         generator = self._get_comments(*args, **kwargs)
3740
3741         def extractor():
3742             comments = []
3743             interrupted = True
3744             try:
3745                 while True:
3746                     comments.append(next(generator))
3747             except StopIteration:
3748                 interrupted = False
3749             except KeyboardInterrupt:
3750                 self.to_screen('Interrupted by user')
3751             except Exception as e:
3752                 if self.get_param('ignoreerrors') is not True:
3753                     raise
3754                 self._downloader.report_error(e)
3755             comment_count = len(comments)
3756             self.to_screen(f'Extracted {comment_count} comments')
3757             return {
3758                 'comments': comments,
3759                 'comment_count': None if interrupted else comment_count
3760             }
3761         return extractor
3762
3763     def _get_comments(self, *args, **kwargs):
3764         raise NotImplementedError('This method must be implemented by subclasses')
3765
3766     @staticmethod
3767     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3768         """ Merge subtitle items for one language. Items with duplicated URLs/data
3769         will be dropped. """
3770         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3771         ret = list(subtitle_list1)
3772         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3773         return ret
3774
3775     @classmethod
3776     def _merge_subtitles(cls, *dicts, target=None):
3777         """ Merge subtitle dictionaries, language by language. """
3778         if target is None:
3779             target = {}
3780         for d in dicts:
3781             for lang, subs in d.items():
3782                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3783         return target
3784
3785     def extract_automatic_captions(self, *args, **kwargs):
3786         if (self.get_param('writeautomaticsub', False)
3787                 or self.get_param('listsubtitles')):
3788             return self._get_automatic_captions(*args, **kwargs)
3789         return {}
3790
3791     def _get_automatic_captions(self, *args, **kwargs):
3792         raise NotImplementedError('This method must be implemented by subclasses')
3793
3794     @functools.cached_property
3795     def _cookies_passed(self):
3796         """Whether cookies have been passed to YoutubeDL"""
3797         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3798
3799     def mark_watched(self, *args, **kwargs):
3800         if not self.get_param('mark_watched', False):
3801             return
3802         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3803             self._mark_watched(*args, **kwargs)
3804
3805     def _mark_watched(self, *args, **kwargs):
3806         raise NotImplementedError('This method must be implemented by subclasses')
3807
3808     def geo_verification_headers(self):
3809         headers = {}
3810         geo_verification_proxy = self.get_param('geo_verification_proxy')
3811         if geo_verification_proxy:
3812             headers['Ytdl-request-proxy'] = geo_verification_proxy
3813         return headers
3814
3815     @staticmethod
3816     def _generic_id(url):
3817         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3818
3819     @staticmethod
3820     def _generic_title(url):
3821         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3822
3823     @staticmethod
3824     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3825         all_known = all(map(
3826             lambda x: x is not None,
3827             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3828         return (
3829             'private' if is_private
3830             else 'premium_only' if needs_premium
3831             else 'subscriber_only' if needs_subscription
3832             else 'needs_auth' if needs_auth
3833             else 'unlisted' if is_unlisted
3834             else 'public' if all_known
3835             else None)
3836
3837     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3838         '''
3839         @returns            A list of values for the extractor argument given by "key"
3840                             or "default" if no such key is present
3841         @param default      The default value to return when the key is not present (default: [])
3842         @param casesense    When false, the values are converted to lower case
3843         '''
3844         val = traverse_obj(
3845             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3846         if val is None:
3847             return [] if default is NO_DEFAULT else default
3848         return list(val) if casesense else [x.lower() for x in val]
3849
3850     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3851         if not playlist_id or not video_id:
3852             return not video_id
3853
3854         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3855         if no_playlist is not None:
3856             return not no_playlist
3857
3858         video_id = '' if video_id is True else f' {video_id}'
3859         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3860         if self.get_param('noplaylist'):
3861             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3862             return False
3863         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3864         return True
3865
3866     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3867         RetryManager.report_retry(
3868             err, _count or int(fatal), _retries,
3869             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3870             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3871
3872     def RetryManager(self, **kwargs):
3873         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3874
3875     @classmethod
3876     def extract_from_webpage(cls, ydl, url, webpage):
3877         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3878               else ydl.get_info_extractor(cls.ie_key()))
3879         for info in ie._extract_from_webpage(url, webpage) or []:
3880             # url = None since we do not want to set (webpage/original)_url
3881             ydl.add_default_extra_info(info, ie, None)
3882             yield info
3883
3884     @classmethod
3885     def _extract_from_webpage(cls, url, webpage):
3886         for embed_url in orderedSet(
3887                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3888             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3889
3890     @classmethod
3891     def _extract_embed_urls(cls, url, webpage):
3892         """@returns all the embed urls on the webpage"""
3893         if '_EMBED_URL_RE' not in cls.__dict__:
3894             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3895             for idx, regex in enumerate(cls._EMBED_REGEX):
3896                 assert regex.count('(?P<url>') == 1, \
3897                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3898             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3899
3900         for regex in cls._EMBED_URL_RE:
3901             for mobj in regex.finditer(webpage):
3902                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3903                 if cls._VALID_URL is False or cls.suitable(embed_url):
3904                     yield embed_url
3905
3906     class StopExtraction(Exception):
3907         pass
3908
3909     @classmethod
3910     def _extract_url(cls, webpage):  # TODO: Remove
3911         """Only for compatibility with some older extractors"""
3912         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3913
3914     @classmethod
3915     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3916         if plugin_name:
3917             mro = inspect.getmro(cls)
3918             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3919             cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
3920             while getattr(super_class, '__wrapped__', None):
3921                 super_class = super_class.__wrapped__
3922             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3923
3924         return super().__init_subclass__(**kwargs)
3925
3926
3927 class SearchInfoExtractor(InfoExtractor):
3928     """
3929     Base class for paged search queries extractors.
3930     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3931     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3932     """
3933
3934     _MAX_RESULTS = float('inf')
3935
3936     @classproperty
3937     def _VALID_URL(cls):
3938         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3939
3940     def _real_extract(self, query):
3941         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3942         if prefix == '':
3943             return self._get_n_results(query, 1)
3944         elif prefix == 'all':
3945             return self._get_n_results(query, self._MAX_RESULTS)
3946         else:
3947             n = int(prefix)
3948             if n <= 0:
3949                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3950             elif n > self._MAX_RESULTS:
3951                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3952                 n = self._MAX_RESULTS
3953             return self._get_n_results(query, n)
3954
3955     def _get_n_results(self, query, n):
3956         """Get a specified number of results for a query.
3957         Either this function or _search_results must be overridden by subclasses """
3958         return self.playlist_result(
3959             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3960             query, query)
3961
3962     def _search_results(self, query):
3963         """Returns an iterator of search results"""
3964         raise NotImplementedError('This method must be implemented by subclasses')
3965
3966     @classproperty
3967     def SEARCH_KEY(cls):
3968         return cls._SEARCH_KEY
3969
3970
3971 class UnsupportedURLIE(InfoExtractor):
3972     _VALID_URL = '.*'
3973     _ENABLED = False
3974     IE_DESC = False
3975
3976     def _real_extract(self, url):
3977         raise UnsupportedError(url)