yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader import FileDownloader
  27 from ..downloader.f4m import get_base_url, remove_encrypted_media
  28 from ..utils import (
  29     IDENTITY,
  30     JSON_LD_RE,
  31     NO_DEFAULT,
  32     ExtractorError,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     LenientJSONDecoder,
  36     RegexNotFoundError,
  37     RetryManager,
  38     UnsupportedError,
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     classproperty,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitize_url,
  68     sanitized_Request,
  69     str_or_none,
  70     str_to_int,
  71     strip_or_none,
  72     traverse_obj,
  73     try_call,
  74     try_get,
  75     unescapeHTML,
  76     unified_strdate,
  77     unified_timestamp,
  78     update_Request,
  79     update_url_query,
  80     url_basename,
  81     url_or_none,
  82     urljoin,
  83     variadic,
  84     xpath_element,
  85     xpath_text,
  86     xpath_with_ns,
  87 )
  88
  89
  90 class InfoExtractor:
  91     """Information Extractor class.
  92
  93     Information extractors are the classes that, given a URL, extract
  94     information about the video (or videos) the URL refers to. This
  95     information includes the real video URL, the video title, author and
  96     others. The information is stored in a dictionary which is then
  97     passed to the YoutubeDL. The YoutubeDL processes this
  98     information possibly downloading the video to the file system, among
  99     other possible outcomes.
 100
 101     The type field determines the type of the result.
 102     By far the most common value (and the default if _type is missing) is
 103     "video", which indicates a single video.
 104
 105     For a video, the dictionaries must include the following fields:
 106
 107     id:             Video identifier.
 108     title:          Video title, unescaped. Set to an empty string if video has
 109                     no title as opposed to "None" which signifies that the
 110                     extractor failed to obtain a title
 111
 112     Additionally, it must contain either a formats entry or a url one:
 113
 114     formats:        A list of dictionaries for each format available, ordered
 115                     from worst to best quality.
 116
 117                     Potential fields:
 118                     * url        The mandatory URL representing the media:
 119                                    for plain file media - HTTP URL of this file,
 120                                    for RTMP - RTMP URL,
 121                                    for HLS - URL of the M3U8 media playlist,
 122                                    for HDS - URL of the F4M manifest,
 123                                    for DASH
 124                                      - HTTP URL to plain file media (in case of
 125                                        unfragmented media)
 126                                      - URL of the MPD manifest or base URL
 127                                        representing the media if MPD manifest
 128                                        is parsed from a string (in case of
 129                                        fragmented media)
 130                                    for MSS - URL of the ISM manifest.
 131                     * manifest_url
 132                                  The URL of the manifest file in case of
 133                                  fragmented media:
 134                                    for HLS - URL of the M3U8 master playlist,
 135                                    for HDS - URL of the F4M manifest,
 136                                    for DASH - URL of the MPD manifest,
 137                                    for MSS - URL of the ISM manifest.
 138                     * manifest_stream_number  (For internal use only)
 139                                  The index of the stream in the manifest file
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
  143                                  Calculated from the format_id, width, height,
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
  153                     * dynamic_range The dynamic range of the video. One of:
  154                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * audio_channels  Number of audio channels
 160                     * vbr        Average video bitrate in KBit/s
 161                     * fps        Frame rate
 162                     * vcodec     Name of the video codec in use
 163                     * container  Name of the container format
 164                     * filesize   The number of bytes, if known in advance
 165                     * filesize_approx  An estimate for the number of bytes
 166                     * player_url SWF Player URL (used for rtmpdump).
 167                     * protocol   The protocol that will be used for the actual
 168                                  download, lower-case. One of "http", "https" or
 169                                  one of the protocols defined in downloader.PROTOCOL_MAP
 170                     * fragment_base_url
 171                                  Base URL for fragments. Each fragment's path
 172                                  value (if present) will be relative to
 173                                  this URL.
 174                     * fragments  A list of fragments of a fragmented media.
 175                                  Each fragment entry must contain either an url
 176                                  or a path. If an url is present it should be
 177                                  considered by a client. Otherwise both path and
 178                                  fragment_base_url must be present. Here is
 179                                  the list of all potential fields:
 180                                  * "url" - fragment's URL
 181                                  * "path" - fragment's path relative to
 182                                             fragment_base_url
 183                                  * "duration" (optional, int or float)
 184                                  * "filesize" (optional, int)
 185                     * is_from_start  Is a live format that can be downloaded
 186                                 from the start. Boolean
 187                     * preference Order number of this format. If this field is
 188                                  present and not None, the formats get sorted
 189                                  by this field, regardless of all other values.
 190                                  -1 for default (order by other properties),
 191                                  -2 or smaller for less than default.
 192                                  < -1000 to hide the format (if there is
 193                                     another one which is strictly better)
 194                     * language   Language code, e.g. "de" or "en-US".
 195                     * language_preference  Is this in the language mentioned in
 196                                  the URL?
 197                                  10 if it's what the URL is about,
 198                                  -1 for default (don't know),
 199                                  -10 otherwise, other values reserved for now.
 200                     * quality    Order number of the video quality of this
 201                                  format, irrespective of the file format.
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                     * source_preference  Order number for this video source
 205                                   (quality takes higher priority)
 206                                  -1 for default (order by other properties),
 207                                  -2 or smaller for less than default.
 208                     * http_headers  A dictionary of additional HTTP headers
 209                                  to add to the request.
 210                     * stretched_ratio  If given and not 1, indicates that the
 211                                  video's pixels are not square.
 212                                  width : height ratio as float.
 213                     * no_resume  The server does not support resuming the
 214                                  (HTTP or RTMP) download. Boolean.
 215                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 216                     * downloader_options  A dictionary of downloader options
 217                                  (For internal use only)
 218                                  * http_chunk_size Chunk size for HTTP downloads
 219                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 220                     RTMP formats can also have the additional fields: page_url,
 221                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 222                     rtmp_protocol, rtmp_real_time
 223
 224     url:            Final video URL.
 225     ext:            Video filename extension.
 226     format:         The video format, defaults to ext (used for --get-format)
 227     player_url:     SWF Player URL (used for rtmpdump).
 228
 229     The following fields are optional:
 230
 231     direct:         True if a direct video file was given (must only be set by GenericIE)
 232     alt_title:      A secondary title of the video.
  233     display_id:     An alternative identifier for the video, not necessarily
 234                     unique, but available before title. Typically, id is
 235                     something like "4234987", title "Dancing naked mole rats",
 236                     and display_id "dancing-naked-mole-rats"
 237     thumbnails:     A list of dictionaries, with the following entries:
 238                         * "id" (optional, string) - Thumbnail format ID
 239                         * "url"
 240                         * "preference" (optional, int) - quality of the image
 241                         * "width" (optional, int)
 242                         * "height" (optional, int)
 243                         * "resolution" (optional, string "{width}x{height}",
 244                                         deprecated)
 245                         * "filesize" (optional, int)
 246                         * "http_headers" (dict) - HTTP headers for the request
 247     thumbnail:      Full URL to a video thumbnail image.
 248     description:    Full video description.
 249     uploader:       Full name of the video uploader.
 250     license:        License name the video is licensed under.
 251     creator:        The creator of the video.
 252     timestamp:      UNIX timestamp of the moment the video was uploaded
 253     upload_date:    Video upload date in UTC (YYYYMMDD).
 254                     If not explicitly set, calculated from timestamp
 255     release_timestamp: UNIX timestamp of the moment the video was released.
 256                     If it is not clear whether to use timestamp or this, use the former
 257     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 258                     If not explicitly set, calculated from release_timestamp
 259     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 260     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 261                     If not explicitly set, calculated from modified_timestamp
 262     uploader_id:    Nickname or id of the video uploader.
 263     uploader_url:   Full URL to a personal webpage of the video uploader.
 264     channel:        Full name of the channel the video is uploaded on.
 265                     Note that channel fields may or may not repeat uploader
 266                     fields. This depends on a particular extractor.
 267     channel_id:     Id of the channel.
 268     channel_url:    Full URL to a channel webpage.
 269     channel_follower_count: Number of followers of the channel.
 270     location:       Physical location where the video was filmed.
 271     subtitles:      The available subtitles as a dictionary in the format
 272                     {tag: subformats}. "tag" is usually a language code, and
 273                     "subformats" is a list sorted from lower to higher
 274                     preference, each element is a dictionary with the "ext"
 275                     entry and one of:
 276                         * "data": The subtitles file contents
 277                         * "url": A URL pointing to the subtitles file
 278                     It can optionally also have:
 279                         * "name": Name or description of the subtitles
 280                         * "http_headers": A dictionary of additional HTTP headers
 281                                   to add to the request.
 282                     "ext" will be calculated from URL if missing
 283     automatic_captions: Like 'subtitles'; contains automatically generated
 284                     captions instead of normal subtitles
 285     duration:       Length of the video in seconds, as an integer or float.
 286     view_count:     How many users have watched the video on the platform.
 287     concurrent_view_count: How many users are currently watching the video on the platform.
 288     like_count:     Number of positive ratings of the video
 289     dislike_count:  Number of negative ratings of the video
 290     repost_count:   Number of reposts of the video
  291     average_rating: Average rating given by users, the scale used depends on the webpage
 292     comment_count:  Number of comments on the video
 293     comments:       A list of comments, each with one or more of the following
 294                     properties (all but one of text or html optional):
 295                         * "author" - human-readable name of the comment author
 296                         * "author_id" - user ID of the comment author
 297                         * "author_thumbnail" - The thumbnail of the comment author
 298                         * "id" - Comment ID
 299                         * "html" - Comment as HTML
 300                         * "text" - Plain text of the comment
 301                         * "timestamp" - UNIX timestamp of comment
 302                         * "parent" - ID of the comment this one is replying to.
 303                                      Set to "root" to indicate that this is a
 304                                      comment to the original video.
 305                         * "like_count" - Number of positive ratings of the comment
 306                         * "dislike_count" - Number of negative ratings of the comment
 307                         * "is_favorited" - Whether the comment is marked as
 308                                            favorite by the video uploader
 309                         * "author_is_uploader" - Whether the comment is made by
 310                                                  the video uploader
 311     age_limit:      Age restriction for the video, as an integer (years)
 312     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 313                     should allow to get the same result again. (It will be set
 314                     by YoutubeDL if it's missing)
 315     categories:     A list of categories that the video falls in, for example
 316                     ["Sports", "Berlin"]
 317     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 318     cast:           A list of the video cast
 319     is_live:        True, False, or None (=unknown). Whether this video is a
 320                     live stream that goes on instead of a fixed-length video.
 321     was_live:       True, False, or None (=unknown). Whether this video was
 322                     originally a live stream.
 323     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 324                     or 'post_live' (was live, but VOD is not yet processed)
 325                     If absent, automatically set from is_live, was_live
 326     start_time:     Time in seconds where the reproduction should start, as
 327                     specified in the URL.
 328     end_time:       Time in seconds where the reproduction should end, as
 329                     specified in the URL.
 330     chapters:       A list of dictionaries, with the following entries:
 331                         * "start_time" - The start time of the chapter in seconds
 332                         * "end_time" - The end time of the chapter in seconds
 333                         * "title" (optional, string)
 334     playable_in_embed: Whether this video is allowed to play in embedded
 335                     players on other sites. Can be True (=always allowed),
 336                     False (=never allowed), None (=unknown), or a string
 337                     specifying the criteria for embedability; e.g. 'whitelist'
 338     availability:   Under what condition the video is available. One of
 339                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 340                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 341                     to set it
 342     _old_archive_ids: A list of old archive ids needed for backward compatibility
 343     __post_extractor: A function to be called just before the metadata is
 344                     written to either disk, logger or console. The function
 345                     must return a dict which will be added to the info_dict.
  346                     This is useful for additional information that is
 347                     time-consuming to extract. Note that the fields thus
 348                     extracted will not be available to output template and
 349                     match_filter. So, only "comments" and "comment_count" are
 350                     currently allowed to be extracted via this method.
 351
 352     The following fields should only be used when the video belongs to some logical
 353     chapter or section:
 354
 355     chapter:        Name or title of the chapter the video belongs to.
 356     chapter_number: Number of the chapter the video belongs to, as an integer.
 357     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 358
 359     The following fields should only be used when the video is an episode of some
 360     series, programme or podcast:
 361
 362     series:         Title of the series or programme the video episode belongs to.
 363     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 364     season:         Title of the season the video episode belongs to.
 365     season_number:  Number of the season the video episode belongs to, as an integer.
 366     season_id:      Id of the season the video episode belongs to, as a unicode string.
 367     episode:        Title of the video episode. Unlike mandatory video title field,
 368                     this field should denote the exact title of the video episode
 369                     without any kind of decoration.
 370     episode_number: Number of the video episode within a season, as an integer.
 371     episode_id:     Id of the video episode, as a unicode string.
 372
 373     The following fields should only be used when the media is a track or a part of
 374     a music album:
 375
 376     track:          Title of the track.
 377     track_number:   Number of the track within an album or a disc, as an integer.
 378     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 379                     as a unicode string.
 380     artist:         Artist(s) of the track.
 381     genre:          Genre(s) of the track.
 382     album:          Title of the album the track belongs to.
 383     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
  384     album_artist:   List of all artists that appeared on the album (e.g.
 385                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 386                     and compilations).
 387     disc_number:    Number of the disc or other physical medium the track belongs to,
 388                     as an integer.
 389     release_year:   Year (YYYY) when the album was released.
 390     composer:       Composer of the piece
 391
 392     The following fields should only be set for clips that should be cut from the original video:
 393
 394     section_start:  Start time of the section in seconds
 395     section_end:    End time of the section in seconds
 396
 397     The following fields should only be set for storyboards:
 398     rows:           Number of rows in each storyboard fragment, as an integer
 399     columns:        Number of columns in each storyboard fragment, as an integer
 400
 401     Unless mentioned otherwise, the fields should be Unicode strings.
 402
 403     Unless mentioned otherwise, None is equivalent to absence of information.
 404
 405
 406     _type "playlist" indicates multiple videos.
 407     There must be a key "entries", which is a list, an iterable, or a PagedList
 408     object, each element of which is a valid dictionary by this specification.
 409
 410     Additionally, playlists can have "id", "title", and any other relevant
 411     attributes with the same semantics as videos (see above).
 412
 413     It can also have the following optional fields:
 414
 415     playlist_count: The total number of videos in a playlist. If not given,
 416                     YoutubeDL tries to calculate it from "entries"
 417
 418
 419     _type "multi_video" indicates that there are multiple videos that
  420     form a single show, for example multiple acts of an opera or TV episode.
 421     It must have an entries key like a playlist and contain all the keys
 422     required for a video at the same time.
 423
 424
 425     _type "url" indicates that the video must be extracted from another
 426     location, possibly by a different extractor. Its only required key is:
 427     "url" - the next URL to extract.
 428     The key "ie_key" can be set to the class name (minus the trailing "IE",
 429     e.g. "Youtube") if the extractor class is known in advance.
 430     Additionally, the dictionary may have any properties of the resolved entity
 431     known in advance, for example "title" if the title of the referred video is
 432     known ahead of time.
 433
 434
 435     _type "url_transparent" entities have the same specification as "url", but
 436     indicate that the given additional information is more precise than the one
 437     associated with the resolved URL.
 438     This is useful when a site employs a video service that hosts the video and
 439     its technical metadata, but that video service does not embed a useful
 440     title, description etc.
 441
 442
 443     Subclasses of this should also be added to the list of extractors and
 444     should define a _VALID_URL regexp and, re-define the _real_extract() and
 445     (optionally) _real_initialize() methods.
 446
 447     Subclasses may also override suitable() if necessary, but ensure the function
 448     signature is preserved and that this function imports everything it needs
 449     (except other extractors), so that lazy_extractors works correctly.
 450
 451     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 452     the HTML of Generic webpages. It may also override _extract_embed_urls
 453     or _extract_from_webpage as necessary. While these are normally classmethods,
 454     _extract_from_webpage is allowed to be an instance method.
 455
 456     _extract_from_webpage may raise self.StopExtraction() to stop further
 457     processing of the webpage and obtain exclusive rights to it. This is useful
 458     when the extractor cannot reliably be matched using just the URL,
 459     e.g. invidious/peertube instances
 460
 461     Embed-only extractors can be defined by setting _VALID_URL = False.
 462
 463     To support username + password (or netrc) login, the extractor must define a
 464     _NETRC_MACHINE and re-define _perform_login(username, password) and
 465     (optionally) _initialize_pre_login() methods. The _perform_login method will
 466     be called between _initialize_pre_login and _real_initialize if credentials
 467     are passed by the user. In cases where it is necessary to have the login
 468     process as part of the extraction rather than initialization, _perform_login
 469     can be left undefined.
 470
 471     _GEO_BYPASS attribute may be set to False in order to disable
 472     geo restriction bypass mechanisms for a particular extractor.
 473     Though it won't disable explicit geo restriction bypass based on
 474     country code provided with geo_bypass_country.
 475
 476     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 477     countries for this extractor. One of these countries will be used by
 478     geo restriction bypass mechanism right away in order to bypass
 479     geo restriction, of course, if the mechanism is not disabled.
 480
 481     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 482     IP blocks in CIDR notation for this extractor. One of these IP blocks
 483     will be used by geo restriction bypass mechanism similarly
 484     to _GEO_COUNTRIES.
 485
 486     The _ENABLED attribute should be set to False for IEs that
 487     are disabled by default and must be explicitly enabled.
 488
 489     The _WORKING attribute should be set to False for broken IEs
 490     in order to warn the users and skip the tests.
 491     """
 492
    # Class-level defaults. See the class docstring for the meaning of the
    # _GEO_*, _ENABLED and _WORKING attributes.

    # Set to True at the end of initialize(), so its one-time setup runs only once
    _ready = False
    # Presumably assigned through set_downloader() (not visible in this section) - confirm
    _downloader = None
    # _initialize_geo_bypass() only does its work while this is unset;
    # presumably holds the faked X-Forwarded-For IP - confirm
    _x_forwarded_for_ip = None
    # False disables the geo restriction bypass mechanisms for an extractor
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # False marks a broken IE; returned by working()
    _WORKING = True
    # False marks an IE that is disabled by default and must be explicitly enabled
    _ENABLED = True
    # A truthy value makes supports_login() return True; also shown in the --netrc login hint
    _NETRC_MACHINE = None
    # initialize() skips its "password login not supported" warning when this is False
    IE_DESC = None
    SEARCH_KEY = None
    # Regexp compiled and matched by _match_valid_url(); False marks an embed-only extractor
    _VALID_URL = None
    # Regexes that are searched for in the HTML of Generic webpages
    _EMBED_REGEX = []
 506
 507     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 508         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 509         return {
 510             None: '',
 511             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 512             'password': f'Use {password_hint}',
 513             'cookies': (
 514                 'Use --cookies-from-browser or --cookies for the authentication. '
 515                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 516         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 517
 518     def __init__(self, downloader=None):
 519         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 520         If a downloader is not passed during initialization,
 521         it must be set using "set_downloader()" before "extract()" is called"""
 522         self._ready = False
 523         self._x_forwarded_for_ip = None
 524         self._printed_messages = set()
 525         self.set_downloader(downloader)
 526
 527     @classmethod
 528     def _match_valid_url(cls, url):
 529         if cls._VALID_URL is False:
 530             return None
 531         # This does not use has/getattr intentionally - we want to know whether
 532         # we have cached the regexp for *this* class, whereas getattr would also
 533         # match the superclass
 534         if '_VALID_URL_RE' not in cls.__dict__:
 535             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 536         return cls._VALID_URL_RE.match(url)
 537
 538     @classmethod
 539     def suitable(cls, url):
 540         """Receives a URL and returns True if suitable for this IE."""
 541         # This function must import everything it needs (except other extractors),
 542         # so that lazy_extractors works correctly
 543         return cls._match_valid_url(url) is not None
 544
 545     @classmethod
 546     def _match_id(cls, url):
 547         return cls._match_valid_url(url).group('id')
 548
 549     @classmethod
 550     def get_temp_id(cls, url):
 551         try:
 552             return cls._match_id(url)
 553         except (IndexError, AttributeError):
 554             return None
 555
 556     @classmethod
 557     def working(cls):
 558         """Getter method for _WORKING."""
 559         return cls._WORKING
 560
 561     @classmethod
 562     def supports_login(cls):
 563         return bool(cls._NETRC_MACHINE)
 564
 565     def initialize(self):
 566         """Initializes an instance (authentication, etc)."""
 567         self._printed_messages = set()
 568         self._initialize_geo_bypass({
 569             'countries': self._GEO_COUNTRIES,
 570             'ip_blocks': self._GEO_IP_BLOCKS,
 571         })
 572         if not self._ready:
 573             self._initialize_pre_login()
 574             if self.supports_login():
 575                 username, password = self._get_login_info()
 576                 if username:
 577                     self._perform_login(username, password)
 578             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 579                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 580             self._real_initialize()
 581             self._ready = True
 582
 583     def _initialize_geo_bypass(self, geo_bypass_context):
 584         """
 585         Initialize geo restriction bypass mechanism.
 586
 587         This method is used to initialize geo bypass mechanism based on faking
 588         X-Forwarded-For HTTP header. A random country from provided country list
 589         is selected and a random IP belonging to this country is generated. This
 590         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 591         HTTP requests.
 592
 593         This method will be used for initial geo bypass mechanism initialization
 594         during the instance initialization with _GEO_COUNTRIES and
 595         _GEO_IP_BLOCKS.
 596
 597         You may also manually call it from extractor's code if geo bypass
 598         information is not available beforehand (e.g. obtained during
 599         extraction) or due to some other reason. In this case you should pass
 600         this information in geo bypass context passed as first argument. It may
 601         contain following fields:
 602
 603         countries:  List of geo unrestricted countries (similar
 604                     to _GEO_COUNTRIES)
 605         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 606                     (similar to _GEO_IP_BLOCKS)
 607
 608         """
 609         if not self._x_forwarded_for_ip:
 610
 611             # Geo bypass mechanism is explicitly disabled by user
 612             if not self.get_param('geo_bypass', True):
 613                 return
 614
 615             if not geo_bypass_context:
 616                 geo_bypass_context = {}
 617
 618             # Backward compatibility: previously _initialize_geo_bypass
 619             # expected a list of countries, some 3rd party code may still use
 620             # it this way
 621             if isinstance(geo_bypass_context, (list, tuple)):
 622                 geo_bypass_context = {
 623                     'countries': geo_bypass_context,
 624                 }
 625
 626             # The whole point of geo bypass mechanism is to fake IP
 627             # as X-Forwarded-For HTTP header based on some IP block or
 628             # country code.
 629
 630             # Path 1: bypassing based on IP block in CIDR notation
 631
 632             # Explicit IP block specified by user, use it right away
 633             # regardless of whether extractor is geo bypassable or not
 634             ip_block = self.get_param('geo_bypass_ip_block', None)
 635
 636             # Otherwise use random IP block from geo bypass context but only
 637             # if extractor is known as geo bypassable
 638             if not ip_block:
 639                 ip_blocks = geo_bypass_context.get('ip_blocks')
 640                 if self._GEO_BYPASS and ip_blocks:
 641                     ip_block = random.choice(ip_blocks)
 642
 643             if ip_block:
 644                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 645                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 646                 return
 647
 648             # Path 2: bypassing based on country code
 649
 650             # Explicit country code specified by user, use it right away
 651             # regardless of whether extractor is geo bypassable or not
 652             country = self.get_param('geo_bypass_country', None)
 653
 654             # Otherwise use random country code from geo bypass context but
 655             # only if extractor is known as geo bypassable
 656             if not country:
 657                 countries = geo_bypass_context.get('countries')
 658                 if self._GEO_BYPASS and countries:
 659                     country = random.choice(countries)
 660
 661             if country:
 662                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 663                 self._downloader.write_debug(
 664                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 665
 666     def extract(self, url):
 667         """Extracts URL information and returns it in list of dicts."""
 668         try:
 669             for _ in range(2):
 670                 try:
 671                     self.initialize()
 672                     self.write_debug('Extracting URL: %s' % url)
 673                     ie_result = self._real_extract(url)
 674                     if ie_result is None:
 675                         return None
 676                     if self._x_forwarded_for_ip:
 677                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 678                     subtitles = ie_result.get('subtitles') or {}
 679                     if 'no-live-chat' in self.get_param('compat_opts'):
 680                         for lang in ('live_chat', 'comments', 'danmaku'):
 681                             subtitles.pop(lang, None)
 682                     return ie_result
 683                 except GeoRestrictedError as e:
 684                     if self.__maybe_fake_ip_and_retry(e.countries):
 685                         continue
 686                     raise
 687         except UnsupportedError:
 688             raise
 689         except ExtractorError as e:
 690             kwargs = {
 691                 'video_id': e.video_id or self.get_temp_id(url),
 692                 'ie': self.IE_NAME,
 693                 'tb': e.traceback or sys.exc_info()[2],
 694                 'expected': e.expected,
 695                 'cause': e.cause
 696             }
 697             if hasattr(e, 'countries'):
 698                 kwargs['countries'] = e.countries
 699             raise type(e)(e.orig_msg, **kwargs)
 700         except http.client.IncompleteRead as e:
 701             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 702         except (KeyError, StopIteration) as e:
 703             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 704
 705     def __maybe_fake_ip_and_retry(self, countries):
 706         if (not self.get_param('geo_bypass_country', None)
 707                 and self._GEO_BYPASS
 708                 and self.get_param('geo_bypass', True)
 709                 and not self._x_forwarded_for_ip
 710                 and countries):
 711             country_code = random.choice(countries)
 712             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 713             if self._x_forwarded_for_ip:
 714                 self.report_warning(
 715                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 716                     % (self._x_forwarded_for_ip, country_code.upper()))
 717                 return True
 718         return False
 719
 720     def set_downloader(self, downloader):
 721         """Sets a YoutubeDL instance as the downloader for this IE."""
 722         self._downloader = downloader
 723
 724     @property
 725     def cache(self):
 726         return self._downloader.cache
 727
 728     @property
 729     def cookiejar(self):
 730         return self._downloader.cookiejar
 731
 732     def _initialize_pre_login(self):
 733         """ Initialization before login. Redefine in subclasses."""
 734         pass
 735
 736     def _perform_login(self, username, password):
 737         """ Login with username and password. Redefine in subclasses."""
 738         pass
 739
 740     def _real_initialize(self):
 741         """Real initialization process. Redefine in subclasses."""
 742         pass
 743
 744     def _real_extract(self, url):
 745         """Real extraction process. Redefine in subclasses."""
 746         raise NotImplementedError('This method must be implemented by subclasses')
 747
 748     @classmethod
 749     def ie_key(cls):
 750         """A string for getting the InfoExtractor with get_info_extractor"""
 751         return cls.__name__[:-2]
 752
 753     @classproperty
 754     def IE_NAME(cls):
 755         return cls.__name__[:-2]
 756
 757     @staticmethod
 758     def __can_accept_status_code(err, expected_status):
 759         assert isinstance(err, urllib.error.HTTPError)
 760         if expected_status is None:
 761             return False
 762         elif callable(expected_status):
 763             return expected_status(err.code) is True
 764         else:
 765             return err.code in variadic(expected_status)
 766
 767     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 768         if isinstance(url_or_request, urllib.request.Request):
 769             return update_Request(url_or_request, data=data, headers=headers, query=query)
 770         if query:
 771             url_or_request = update_url_query(url_or_request, query)
 772         return sanitized_Request(url_or_request, data, headers or {})
 773
 774     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 775         """
 776         Return the response handle.
 777
 778         See _download_webpage docstring for arguments specification.
 779         """
 780         if not self._downloader._first_webpage_request:
 781             sleep_interval = self.get_param('sleep_interval_requests') or 0
 782             if sleep_interval > 0:
 783                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 784                 time.sleep(sleep_interval)
 785         else:
 786             self._downloader._first_webpage_request = False
 787
 788         if note is None:
 789             self.report_download_webpage(video_id)
 790         elif note is not False:
 791             if video_id is None:
 792                 self.to_screen(str(note))
 793             else:
 794                 self.to_screen(f'{video_id}: {note}')
 795
 796         # Some sites check X-Forwarded-For HTTP header in order to figure out
 797         # the origin of the client behind proxy. This allows bypassing geo
 798         # restriction by faking this header's value to IP that belongs to some
 799         # geo unrestricted country. We will do so once we encounter any
 800         # geo restriction error.
 801         if self._x_forwarded_for_ip:
 802             headers = (headers or {}).copy()
 803             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 804
 805         try:
 806             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 807         except network_exceptions as err:
 808             if isinstance(err, urllib.error.HTTPError):
 809                 if self.__can_accept_status_code(err, expected_status):
 810                     # Retain reference to error to prevent file object from
 811                     # being closed before it can be read. Works around the
 812                     # effects of <https://bugs.python.org/issue15002>
 813                     # introduced in Python 3.4.1.
 814                     err.fp._error = err
 815                     return err.fp
 816
 817             if errnote is False:
 818                 return False
 819             if errnote is None:
 820                 errnote = 'Unable to download webpage'
 821
 822             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 823             if fatal:
 824                 raise ExtractorError(errmsg, cause=err)
 825             else:
 826                 self.report_warning(errmsg)
 827                 return False
 828
 829     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 830                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 831         """
 832         Return a tuple (page content as string, URL handle).
 833
 834         Arguments:
 835         url_or_request -- plain text URL as a string or
 836             a urllib.request.Request object
 837         video_id -- Video/playlist/item identifier (string)
 838
 839         Keyword arguments:
 840         note -- note printed before downloading (string)
 841         errnote -- note printed in case of an error (string)
 842         fatal -- flag denoting whether error should be considered fatal,
 843             i.e. whether it should cause ExtractionError to be raised,
 844             otherwise a warning will be reported and extraction continued
 845         encoding -- encoding for a page content decoding, guessed automatically
 846             when not explicitly specified
 847         data -- POST data (bytes)
 848         headers -- HTTP headers (dict)
 849         query -- URL query (dict)
 850         expected_status -- allows to accept failed HTTP requests (non 2xx
 851             status code) by explicitly specifying a set of accepted status
 852             codes. Can be any of the following entities:
 853                 - an integer type specifying an exact failed status code to
 854                   accept
 855                 - a list or a tuple of integer types specifying a list of
 856                   failed status codes to accept
 857                 - a callable accepting an actual failed status code and
 858                   returning True if it should be accepted
 859             Note that this argument does not affect success status codes (2xx)
 860             which are always accepted.
 861         """
 862
 863         # Strip hashes from the URL (#1038)
 864         if isinstance(url_or_request, str):
 865             url_or_request = url_or_request.partition('#')[0]
 866
 867         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 868         if urlh is False:
 869             assert not fatal
 870             return False
 871         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 872         return (content, urlh)
 873
 874     @staticmethod
 875     def _guess_encoding_from_content(content_type, webpage_bytes):
 876         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 877         if m:
 878             encoding = m.group(1)
 879         else:
 880             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 881                           webpage_bytes[:1024])
 882             if m:
 883                 encoding = m.group(1).decode('ascii')
 884             elif webpage_bytes.startswith(b'\xff\xfe'):
 885                 encoding = 'utf-16'
 886             else:
 887                 encoding = 'utf-8'
 888
 889         return encoding
 890
 891     def __check_blocked(self, content):
 892         first_block = content[:512]
 893         if ('<title>Access to this site is blocked</title>' in content
 894                 and 'Websense' in first_block):
 895             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 896             blocked_iframe = self._html_search_regex(
 897                 r'<iframe src="([^"]+)"', content,
 898                 'Websense information URL', default=None)
 899             if blocked_iframe:
 900                 msg += ' Visit %s for more details' % blocked_iframe
 901             raise ExtractorError(msg, expected=True)
 902         if '<title>The URL you requested has been blocked</title>' in first_block:
 903             msg = (
 904                 'Access to this webpage has been blocked by Indian censorship. '
 905                 'Use a VPN or proxy server (with --proxy) to route around it.')
 906             block_msg = self._html_search_regex(
 907                 r'</h1><p>(.*?)</p>',
 908                 content, 'block message', default=None)
 909             if block_msg:
 910                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 911             raise ExtractorError(msg, expected=True)
 912         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 913                 and 'blocklist.rkn.gov.ru' in content):
 914             raise ExtractorError(
 915                 'Access to this webpage has been blocked by decision of the Russian government. '
 916                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 917                 expected=True)
 918
 919     def _request_dump_filename(self, url, video_id):
 920         basen = f'{video_id}_{url}'
 921         trim_length = self.get_param('trim_file_name') or 240
 922         if len(basen) > trim_length:
 923             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 924             basen = basen[:trim_length - len(h)] + h
 925         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 926         # Working around MAX_PATH limitation on Windows (see
 927         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 928         if compat_os_name == 'nt':
 929             absfilepath = os.path.abspath(filename)
 930             if len(absfilepath) > 259:
 931                 filename = fR'\\?\{absfilepath}'
 932         return filename
 933
 934     def __decode_webpage(self, webpage_bytes, encoding, headers):
 935         if not encoding:
 936             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 937         try:
 938             return webpage_bytes.decode(encoding, 'replace')
 939         except LookupError:
 940             return webpage_bytes.decode('utf-8', 'replace')
 941
 942     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 943         webpage_bytes = urlh.read()
 944         if prefix is not None:
 945             webpage_bytes = prefix + webpage_bytes
 946         if self.get_param('dump_intermediate_pages', False):
 947             self.to_screen('Dumping request to ' + urlh.geturl())
 948             dump = base64.b64encode(webpage_bytes).decode('ascii')
 949             self._downloader.to_screen(dump)
 950         if self.get_param('write_pages'):
 951             filename = self._request_dump_filename(urlh.geturl(), video_id)
 952             self.to_screen(f'Saving request to {filename}')
 953             with open(filename, 'wb') as outf:
 954                 outf.write(webpage_bytes)
 955
 956         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 957         self.__check_blocked(content)
 958
 959         return content
 960
 961     def __print_error(self, errnote, fatal, video_id, err):
 962         if fatal:
 963             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 964         elif errnote:
 965             self.report_warning(f'{video_id}: {errnote}: {err}')
 966
 967     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 968         if transform_source:
 969             xml_string = transform_source(xml_string)
 970         try:
 971             return compat_etree_fromstring(xml_string.encode('utf-8'))
 972         except xml.etree.ElementTree.ParseError as ve:
 973             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 974
 975     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 976         try:
 977             return json.loads(
 978                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 979         except ValueError as ve:
 980             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 981
 982     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 983         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 984
 985     def __create_download_methods(name, parser, note, errnote, return_value):
 986
 987         def parse(ie, content, *args, errnote=errnote, **kwargs):
 988             if parser is None:
 989                 return content
 990             if errnote is False:
 991                 kwargs['errnote'] = errnote
 992             # parser is fetched by name so subclasses can override it
 993             return getattr(ie, parser)(content, *args, **kwargs)
 994
 995         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
 996                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 997             res = self._download_webpage_handle(
 998                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
 999                 data=data, headers=headers, query=query, expected_status=expected_status)
1000             if res is False:
1001                 return res
1002             content, urlh = res
1003             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1004
1005         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1006                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1007             if self.get_param('load_pages'):
1008                 url_or_request = self._create_request(url_or_request, data, headers, query)
1009                 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1010                 self.to_screen(f'Loading request from {filename}')
1011                 try:
1012                     with open(filename, 'rb') as dumpf:
1013                         webpage_bytes = dumpf.read()
1014                 except OSError as e:
1015                     self.report_warning(f'Unable to load request from disk: {e}')
1016                 else:
1017                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1018                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1019             kwargs = {
1020                 'note': note,
1021                 'errnote': errnote,
1022                 'transform_source': transform_source,
1023                 'fatal': fatal,
1024                 'encoding': encoding,
1025                 'data': data,
1026                 'headers': headers,
1027                 'query': query,
1028                 'expected_status': expected_status,
1029             }
1030             if parser is None:
1031                 kwargs.pop('transform_source')
1032             # The method is fetched by name so subclasses can override _download_..._handle
1033             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1034             return res if res is False else res[0]
1035
1036         def impersonate(func, name, return_value):
1037             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1038             func.__doc__ = f'''
1039                 @param transform_source     Apply this transformation before parsing
1040                 @returns                    {return_value}
1041
1042                 See _download_webpage_handle docstring for other arguments specification
1043             '''
1044
1045         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1046         impersonate(download_content, f'_download_{name}', f'{return_value}')
1047         return download_handle, download_content
1048
1049     _download_xml_handle, _download_xml = __create_download_methods(
1050         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1051     _download_json_handle, _download_json = __create_download_methods(
1052         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1053     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1054         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1055     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1056
1057     def _download_webpage(
1058             self, url_or_request, video_id, note=None, errnote=None,
1059             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1060         """
1061         Return the data of the page as a string.
1062
1063         Keyword arguments:
1064         tries -- number of tries
1065         timeout -- sleep interval between tries
1066
1067         See _download_webpage_handle docstring for other arguments specification.
1068         """
1069
1070         R''' # NB: These are unused; should they be deprecated?
1071         if tries != 1:
1072             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1073         if timeout is NO_DEFAULT:
1074             timeout = 5
1075         else:
1076             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1077         '''
1078
1079         try_count = 0
1080         while True:
1081             try:
1082                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1083             except http.client.IncompleteRead as e:
1084                 try_count += 1
1085                 if try_count >= tries:
1086                     raise e
1087                 self._sleep(timeout, video_id)
1088
1089     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1090         idstr = format_field(video_id, None, '%s: ')
1091         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1092         if only_once:
1093             if f'WARNING: {msg}' in self._printed_messages:
1094                 return
1095             self._printed_messages.add(f'WARNING: {msg}')
1096         self._downloader.report_warning(msg, *args, **kwargs)
1097
1098     def to_screen(self, msg, *args, **kwargs):
1099         """Print msg to screen, prefixing it with '[ie_name]'"""
1100         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1101
1102     def write_debug(self, msg, *args, **kwargs):
1103         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1104
1105     def get_param(self, name, default=None, *args, **kwargs):
1106         if self._downloader:
1107             return self._downloader.params.get(name, default, *args, **kwargs)
1108         return default
1109
1110     def report_drm(self, video_id, partial=False):
1111         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1112
1113     def report_extraction(self, id_or_name):
1114         """Report information extraction."""
1115         self.to_screen('%s: Extracting information' % id_or_name)
1116
1117     def report_download_webpage(self, video_id):
1118         """Report webpage download."""
1119         self.to_screen('%s: Downloading webpage' % video_id)
1120
1121     def report_age_confirmation(self):
1122         """Report attempt to confirm age."""
1123         self.to_screen('Confirming age')
1124
1125     def report_login(self):
1126         """Report attempt to log in."""
1127         self.to_screen('Logging in')
1128
1129     def raise_login_required(
1130             self, msg='This video is only available for registered users',
1131             metadata_available=False, method=NO_DEFAULT):
1132         if metadata_available and (
1133                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1134             self.report_warning(msg)
1135             return
1136         msg += format_field(self._login_hint(method), None, '. %s')
1137         raise ExtractorError(msg, expected=True)
1138
1139     def raise_geo_restricted(
1140             self, msg='This video is not available from your location due to geo restriction',
1141             countries=None, metadata_available=False):
1142         if metadata_available and (
1143                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1144             self.report_warning(msg)
1145         else:
1146             raise GeoRestrictedError(msg, countries=countries)
1147
1148     def raise_no_formats(self, msg, expected=False, video_id=None):
1149         if expected and (
1150                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1151             self.report_warning(msg, video_id)
1152         elif isinstance(msg, ExtractorError):
1153             raise msg
1154         else:
1155             raise ExtractorError(msg, expected=expected, video_id=video_id)
1156
1157     # Methods for following #608
1158     @staticmethod
1159     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1160         """Returns a URL that points to a page that should be processed"""
1161         if ie is not None:
1162             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1163         if video_id is not None:
1164             kwargs['id'] = video_id
1165         if video_title is not None:
1166             kwargs['title'] = video_title
1167         return {
1168             **kwargs,
1169             '_type': 'url_transparent' if url_transparent else 'url',
1170             'url': url,
1171         }
1172
1173     @classmethod
1174     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1175                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1176         return cls.playlist_result(
1177             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1178             playlist_id, playlist_title, **kwargs)
1179
1180     @staticmethod
1181     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1182         """Returns a playlist"""
1183         if playlist_id:
1184             kwargs['id'] = playlist_id
1185         if playlist_title:
1186             kwargs['title'] = playlist_title
1187         if playlist_description is not None:
1188             kwargs['description'] = playlist_description
1189         return {
1190             **kwargs,
1191             '_type': 'multi_video' if multi_video else 'playlist',
1192             'entries': entries,
1193         }
1194
1195     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1196         """
1197         Perform a regex search on the given string, using a single or a list of
1198         patterns returning the first matching group.
1199         In case of failure return a default value or raise a WARNING or a
1200         RegexNotFoundError, depending on fatal, specifying the field name.
1201         """
1202         if string is None:
1203             mobj = None
1204         elif isinstance(pattern, (str, re.Pattern)):
1205             mobj = re.search(pattern, string, flags)
1206         else:
1207             for p in pattern:
1208                 mobj = re.search(p, string, flags)
1209                 if mobj:
1210                     break
1211
1212         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1213
1214         if mobj:
1215             if group is None:
1216                 # return the first matching group
1217                 return next(g for g in mobj.groups() if g is not None)
1218             elif isinstance(group, (list, tuple)):
1219                 return tuple(mobj.group(g) for g in group)
1220             else:
1221                 return mobj.group(group)
1222         elif default is not NO_DEFAULT:
1223             return default
1224         elif fatal:
1225             raise RegexNotFoundError('Unable to extract %s' % _name)
1226         else:
1227             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1228             return None
1229
1230     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1231                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1232         """Searches string for the JSON object specified by start_pattern"""
1233         # NB: end_pattern is only used to reduce the size of the initial match
1234         if default is NO_DEFAULT:
1235             default, has_default = {}, False
1236         else:
1237             fatal, has_default = False, True
1238
1239         json_string = self._search_regex(
1240             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1241             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1242         if not json_string:
1243             return default
1244
1245         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1246         try:
1247             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1248         except ExtractorError as e:
1249             if fatal:
1250                 raise ExtractorError(
1251                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1252             elif not has_default:
1253                 self.report_warning(
1254                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1255         return default
1256
1257     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1258         """
1259         Like _search_regex, but strips HTML tags and unescapes entities.
1260         """
1261         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1262         if res:
1263             return clean_html(res).strip()
1264         else:
1265             return res
1266
1267     def _get_netrc_login_info(self, netrc_machine=None):
1268         username = None
1269         password = None
1270         netrc_machine = netrc_machine or self._NETRC_MACHINE
1271
1272         if self.get_param('usenetrc', False):
1273             try:
1274                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1275                 if os.path.isdir(netrc_file):
1276                     netrc_file = os.path.join(netrc_file, '.netrc')
1277                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1278                 if info is not None:
1279                     username = info[0]
1280                     password = info[2]
1281                 else:
1282                     raise netrc.NetrcParseError(
1283                         'No authenticators for %s' % netrc_machine)
1284             except (OSError, netrc.NetrcParseError) as err:
1285                 self.report_warning(
1286                     'parsing .netrc: %s' % error_to_compat_str(err))
1287
1288         return username, password
1289
1290     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1291         """
1292         Get the login info as (username, password)
1293         First look for the manually specified credentials using username_option
1294         and password_option as keys in params dictionary. If no such credentials
1295         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1296         value.
1297         If there's no info available, return (None, None)
1298         """
1299
1300         # Attempt to use provided username and password or .netrc data
1301         username = self.get_param(username_option)
1302         if username is not None:
1303             password = self.get_param(password_option)
1304         else:
1305             username, password = self._get_netrc_login_info(netrc_machine)
1306
1307         return username, password
1308
1309     def _get_tfa_info(self, note='two-factor verification code'):
1310         """
1311         Get the two-factor authentication info
1312         TODO - asking the user will be required for sms/phone verify
1313         currently just uses the command line option
1314         If there's no info available, return None
1315         """
1316
1317         tfa = self.get_param('twofactor')
1318         if tfa is not None:
1319             return tfa
1320
1321         return getpass.getpass('Type %s and press [Return]: ' % note)
1322
1323     # Helper functions for extracting OpenGraph info
1324     @staticmethod
1325     def _og_regexes(prop):
1326         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1327         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1328                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1329         template = r'<meta[^>]+?%s[^>]+?%s'
1330         return [
1331             template % (property_re, content_re),
1332             template % (content_re, property_re),
1333         ]
1334
1335     @staticmethod
1336     def _meta_regex(prop):
1337         return r'''(?isx)<meta
1338                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1339                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1340
1341     def _og_search_property(self, prop, html, name=None, **kargs):
1342         prop = variadic(prop)
1343         if name is None:
1344             name = 'OpenGraph %s' % prop[0]
1345         og_regexes = []
1346         for p in prop:
1347             og_regexes.extend(self._og_regexes(p))
1348         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1349         if escaped is None:
1350             return None
1351         return unescapeHTML(escaped)
1352
1353     def _og_search_thumbnail(self, html, **kargs):
1354         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1355
1356     def _og_search_description(self, html, **kargs):
1357         return self._og_search_property('description', html, fatal=False, **kargs)
1358
1359     def _og_search_title(self, html, *, fatal=False, **kargs):
1360         return self._og_search_property('title', html, fatal=fatal, **kargs)
1361
1362     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1363         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1364         if secure:
1365             regexes = self._og_regexes('video:secure_url') + regexes
1366         return self._html_search_regex(regexes, html, name, **kargs)
1367
1368     def _og_search_url(self, html, **kargs):
1369         return self._og_search_property('url', html, **kargs)
1370
1371     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1372         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1373
1374     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1375         name = variadic(name)
1376         if display_name is None:
1377             display_name = name[0]
1378         return self._html_search_regex(
1379             [self._meta_regex(n) for n in name],
1380             html, display_name, fatal=fatal, group='content', **kwargs)
1381
1382     def _dc_search_uploader(self, html):
1383         return self._html_search_meta('dc.creator', html, 'uploader')
1384
1385     @staticmethod
1386     def _rta_search(html):
1387         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1388         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1389                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1390                      html):
1391             return 18
1392
1393         # And then there are the jokers who advertise that they use RTA, but actually don't.
1394         AGE_LIMIT_MARKERS = [
1395             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1396         ]
1397         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1398             return 18
1399         return 0
1400
1401     def _media_rating_search(self, html):
1402         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1403         rating = self._html_search_meta('rating', html)
1404
1405         if not rating:
1406             return None
1407
1408         RATING_TABLE = {
1409             'safe for kids': 0,
1410             'general': 8,
1411             '14 years': 14,
1412             'mature': 17,
1413             'restricted': 19,
1414         }
1415         return RATING_TABLE.get(rating.lower())
1416
1417     def _family_friendly_search(self, html):
1418         # See http://schema.org/VideoObject
1419         family_friendly = self._html_search_meta(
1420             'isFamilyFriendly', html, default=None)
1421
1422         if not family_friendly:
1423             return None
1424
1425         RATING_TABLE = {
1426             '1': 0,
1427             'true': 0,
1428             '0': 18,
1429             'false': 18,
1430         }
1431         return RATING_TABLE.get(family_friendly.lower())
1432
1433     def _twitter_search_player(self, html):
1434         return self._html_search_meta('twitter:player', html,
1435                                       'twitter card player')
1436
1437     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1438         """Yield all json ld objects in the html"""
1439         if default is not NO_DEFAULT:
1440             fatal = False
1441         for mobj in re.finditer(JSON_LD_RE, html):
1442             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1443             for json_ld in variadic(json_ld_item):
1444                 if isinstance(json_ld, dict):
1445                     yield json_ld
1446
1447     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1448         """Search for a video in any json ld in the html"""
1449         if default is not NO_DEFAULT:
1450             fatal = False
1451         info = self._json_ld(
1452             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1453             video_id, fatal=fatal, expected_type=expected_type)
1454         if info:
1455             return info
1456         if default is not NO_DEFAULT:
1457             return default
1458         elif fatal:
1459             raise RegexNotFoundError('Unable to extract JSON-LD')
1460         else:
1461             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1462             return {}
1463
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        json_ld is either a JSON string (parsed with _parse_json) or already
        parsed data: a single object or a list of objects.
        If expected_type is given, entries whose '@type' does not contain it
        are skipped and traversal stops after the first entry that is handled.
        Returns filter_dict(info); {} if json_ld is empty.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared result; filled in by the nested helpers below
        info = {}

        # Last path component of an interactionType => prefix of the
        # '<prefix>_count' field stored in info
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # Whether the '@type' of e contains any of expected_types
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # interactionType is either a dict carrying '@type' or a plain value
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Store '<kind>_count' fields from the InteractionCounter entries of e.
            # A single counter dict is treated like a one-element list
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # A count that is already set is not overwritten
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build chapters from the 'Clip' entries in hasPart
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Fill gaps from the neighbours: a falsy end_time is taken from the
            # next chapter's start_time, a falsy start_time from the previous
            # chapter's end_time (0 for the first chapter).
            # NOTE(review): zip() stops at the shortest input (chapters[1:]),
            # so the last chapter is not visited by this loop
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # info['duration'] is always present here: extract_video_object
                # sets that key before calling this helper
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copy the fields of a VideoObject/AudioObject into info.
            # info.update() replaces these keys even when the new value is None
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                # Audio only: the bitrate is also stored as audio bitrate
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level entries without '@context' are ignored
                if at_top_level and '@context' not in e:
                    continue
                # A top-level wrapper holding nothing but '@graph': process its
                # content instead and stop looking at the remaining entries
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    break
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title if none is set yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Use the first entry of 'video', else of 'subjectOf', if it is a VideoObject
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without expected_type keep merging the remaining entries,
                    # otherwise the first handled entry is final
                    if expected_type is None:
                        continue
                    else:
                        break
                # Entries of any other type may still embed a VideoObject as 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1633
1634     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1635         return self._parse_json(
1636             self._search_regex(
1637                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1638                 webpage, 'next.js data', fatal=fatal, **kw),
1639             video_id, transform_source=transform_source, fatal=fatal)
1640
1641     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1642         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1643         rectx = re.escape(context_name)
1644         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1645         js, arg_keys, arg_vals = self._search_regex(
1646             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1647             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1648
1649         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1650
1651         for key, val in args.items():
1652             if val in ('undefined', 'void 0'):
1653                 args[key] = 'null'
1654
1655         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1656         return traverse_obj(ret, traverse) or {}
1657
1658     @staticmethod
1659     def _hidden_inputs(html):
1660         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1661         hidden_inputs = {}
1662         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1663             attrs = extract_attributes(input)
1664             if not input:
1665                 continue
1666             if attrs.get('type') not in ('hidden', 'submit'):
1667                 continue
1668             name = attrs.get('name') or attrs.get('id')
1669             value = attrs.get('value')
1670             if name and value is not None:
1671                 hidden_inputs[name] = value
1672         return hidden_inputs
1673
1674     def _form_hidden_inputs(self, form_id, html):
1675         form = self._search_regex(
1676             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1677             html, '%s form' % form_id, group='form')
1678         return self._hidden_inputs(form)
1679
1680     class FormatSort:
        # Pattern for a single sort item: optional '+' (group 'reverse'), the
        # field name, and optionally a '~' or ':' separator followed by a limit
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Sort fields in order of precedence
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
                   'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field options, read through _get_field_setting(), which stores a
        # fallback value for every key that is looked up but missing.
        # NOTE: this dict is a class attribute and is modified in place
        # (by _get_field_setting, _resolve_field_value and evaluate_params)
        settings = {
            # 'ordered' fields: 'order' lists the values from most to least
            # preferred; with 'regex' set the entries are regular expressions
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            # 'order_free' replaces 'order' when free formats are preferred
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'channels': {'convert': 'float_none', 'field': 'audio_channels'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # Fields made up of several other fields
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # Actual field names
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'audio_channels': {'type': 'alias', 'field': 'channels'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1765
1766         def __init__(self, ie, field_preference):
1767             self._order = []
1768             self.ydl = ie._downloader
1769             self.evaluate_params(self.ydl.params, field_preference)
1770             if ie.get_param('verbose'):
1771                 self.print_verbose_info(self.ydl.write_debug)
1772
1773         def _get_field_setting(self, field, key):
1774             if field not in self.settings:
1775                 if key in ('forced', 'priority'):
1776                     return False
1777                 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
1778                                             'deprecated and may be removed in a future version')
1779                 self.settings[field] = {}
1780             propObj = self.settings[field]
1781             if key not in propObj:
1782                 type = propObj.get('type')
1783                 if key == 'field':
1784                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1785                 elif key == 'convert':
1786                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1787                 else:
1788                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1789                 propObj[key] = default
1790             return propObj[key]
1791
1792         def _resolve_field_value(self, field, value, convertNone=False):
1793             if value is None:
1794                 if not convertNone:
1795                     return None
1796             else:
1797                 value = value.lower()
1798             conversion = self._get_field_setting(field, 'convert')
1799             if conversion == 'ignore':
1800                 return None
1801             if conversion == 'string':
1802                 return value
1803             elif conversion == 'float_none':
1804                 return float_or_none(value)
1805             elif conversion == 'bytes':
1806                 return FileDownloader.parse_bytes(value)
1807             elif conversion == 'order':
1808                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1809                 use_regex = self._get_field_setting(field, 'regex')
1810                 list_length = len(order_list)
1811                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1812                 if use_regex and value is not None:
1813                     for i, regex in enumerate(order_list):
1814                         if regex and re.match(regex, value):
1815                             return list_length - i
1816                     return list_length - empty_pos  # not in list
1817                 else:  # not regex or  value = None
1818                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1819             else:
1820                 if value.isnumeric():
1821                     return float(value)
1822                 else:
1823                     self.settings[field]['convert'] = 'string'
1824                     return value
1825
1826         def evaluate_params(self, params, sort_extractor):
1827             self._use_free_order = params.get('prefer_free_formats', False)
1828             self._sort_user = params.get('format_sort', [])
1829             self._sort_extractor = sort_extractor
1830
1831             def add_item(field, reverse, closest, limit_text):
1832                 field = field.lower()
1833                 if field in self._order:
1834                     return
1835                 self._order.append(field)
1836                 limit = self._resolve_field_value(field, limit_text)
1837                 data = {
1838                     'reverse': reverse,
1839                     'closest': False if limit is None else closest,
1840                     'limit_text': limit_text,
1841                     'limit': limit}
1842                 if field in self.settings:
1843                     self.settings[field].update(data)
1844                 else:
1845                     self.settings[field] = data
1846
1847             sort_list = (
1848                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1849                 + (tuple() if params.get('format_sort_force', False)
1850                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1851                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1852
1853             for item in sort_list:
1854                 match = re.match(self.regex, item)
1855                 if match is None:
1856                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1857                 field = match.group('field')
1858                 if field is None:
1859                     continue
1860                 if self._get_field_setting(field, 'type') == 'alias':
1861                     alias, field = field, self._get_field_setting(field, 'field')
1862                     if self._get_field_setting(alias, 'deprecated'):
1863                         self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
1864                                                     f'be removed in a future version. Please use {field} instead')
1865                 reverse = match.group('reverse') is not None
1866                 closest = match.group('separator') == '~'
1867                 limit_text = match.group('limit')
1868
1869                 has_limit = limit_text is not None
1870                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1871                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1872
1873                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1874                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1875                 limit_count = len(limits)
1876                 for (i, f) in enumerate(fields):
1877                     add_item(f, reverse, closest,
1878                              limits[i] if i < limit_count
1879                              else limits[0] if has_limit and not has_multiple_limits
1880                              else None)
1881
1882         def print_verbose_info(self, write_debug):
1883             if self._sort_user:
1884                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1885             if self._sort_extractor:
1886                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1887             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1888                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1889                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1890                               self._get_field_setting(field, 'limit_text'),
1891                               self._get_field_setting(field, 'limit'))
1892                 if self._get_field_setting(field, 'limit_text') is not None else '')
1893                 for field in self._order if self._get_field_setting(field, 'visible')]))
1894
1895         def _calculate_field_preference_from_value(self, format, field, type, value):
1896             reverse = self._get_field_setting(field, 'reverse')
1897             closest = self._get_field_setting(field, 'closest')
1898             limit = self._get_field_setting(field, 'limit')
1899
1900             if type == 'extractor':
1901                 maximum = self._get_field_setting(field, 'max')
1902                 if value is None or (maximum is not None and value >= maximum):
1903                     value = -1
1904             elif type == 'boolean':
1905                 in_list = self._get_field_setting(field, 'in_list')
1906                 not_in_list = self._get_field_setting(field, 'not_in_list')
1907                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1908             elif type == 'ordered':
1909                 value = self._resolve_field_value(field, value, True)
1910
1911             # try to convert to number
1912             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1913             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1914             if is_num:
1915                 value = val_num
1916
1917             return ((-10, 0) if value is None
1918                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1919                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1920                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1921                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1922                     else (-1, value, 0))
1923
1924         def _calculate_field_preference(self, format, field):
1925             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1926             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1927             if type == 'multiple':
1928                 type = 'field'  # Only 'field' is allowed in multiple for now
1929                 actual_fields = self._get_field_setting(field, 'field')
1930
1931                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1932             else:
1933                 value = get_value(field)
1934             return self._calculate_field_preference_from_value(format, field, type, value)
1935
1936         def calculate_preference(self, format):
1937             # Determine missing protocol
1938             if not format.get('protocol'):
1939                 format['protocol'] = determine_protocol(format)
1940
1941             # Determine missing ext
1942             if not format.get('ext') and 'url' in format:
1943                 format['ext'] = determine_ext(format['url'])
1944             if format.get('vcodec') == 'none':
1945                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1946                 format['video_ext'] = 'none'
1947             else:
1948                 format['video_ext'] = format['ext']
1949                 format['audio_ext'] = 'none'
1950             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1951             #    format['preference'] = -1000
1952
1953             # Determine missing bitrates
1954             if format.get('tbr') is None:
1955                 if format.get('vbr') is not None and format.get('abr') is not None:
1956                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1957             else:
1958                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1959                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1960                 if format.get('acodec') != 'none' and format.get('abr') is None:
1961                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1962
1963             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1964
1965     def _sort_formats(self, formats, field_preference=[]):
1966         if not formats:
1967             return
1968         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1969
1970     def _check_formats(self, formats, video_id):
1971         if formats:
1972             formats[:] = filter(
1973                 lambda f: self._is_valid_url(
1974                     f['url'], video_id,
1975                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1976                 formats)
1977
1978     @staticmethod
1979     def _remove_duplicate_formats(formats):
1980         format_urls = set()
1981         unique_formats = []
1982         for f in formats:
1983             if f['url'] not in format_urls:
1984                 format_urls.add(f['url'])
1985                 unique_formats.append(f)
1986         formats[:] = unique_formats
1987
1988     def _is_valid_url(self, url, video_id, item='video', headers={}):
1989         url = self._proto_relative_url(url, scheme='http:')
1990         # For now assume non HTTP(S) URLs always valid
1991         if not (url.startswith('http://') or url.startswith('https://')):
1992             return True
1993         try:
1994             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1995             return True
1996         except ExtractorError as e:
1997             self.to_screen(
1998                 '%s: %s URL is invalid, skipping: %s'
1999                 % (video_id, item, error_to_compat_str(e.cause)))
2000             return False
2001
2002     def http_scheme(self):
2003         """ Either "http:" or "https:", depending on the user's preferences """
2004         return (
2005             'http:'
2006             if self.get_param('prefer_insecure', False)
2007             else 'https:')
2008
2009     def _proto_relative_url(self, url, scheme=None):
2010         scheme = scheme or self.http_scheme()
2011         assert scheme.endswith(':')
2012         return sanitize_url(url, scheme=scheme[:-1])
2013
2014     def _sleep(self, timeout, video_id, msg_template=None):
2015         if msg_template is None:
2016             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2017         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2018         self.to_screen(msg)
2019         time.sleep(timeout)
2020
2021     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2022                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2023                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2024         res = self._download_xml_handle(
2025             manifest_url, video_id, 'Downloading f4m manifest',
2026             'Unable to download f4m manifest',
2027             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2028             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2029             transform_source=transform_source,
2030             fatal=fatal, data=data, headers=headers, query=query)
2031         if res is False:
2032             return []
2033
2034         manifest, urlh = res
2035         manifest_url = urlh.geturl()
2036
2037         return self._parse_f4m_formats(
2038             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2039             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2040
2041     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2042                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2043                            fatal=True, m3u8_id=None):
2044         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2045             return []
2046
2047         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2048         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2049         if akamai_pv is not None and ';' in akamai_pv.text:
2050             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2051             if playerVerificationChallenge.strip() != '':
2052                 return []
2053
2054         formats = []
2055         manifest_version = '1.0'
2056         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2057         if not media_nodes:
2058             manifest_version = '2.0'
2059             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2060         # Remove unsupported DRM protected media from final formats
2061         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2062         media_nodes = remove_encrypted_media(media_nodes)
2063         if not media_nodes:
2064             return formats
2065
2066         manifest_base_url = get_base_url(manifest)
2067
2068         bootstrap_info = xpath_element(
2069             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2070             'bootstrap info', default=None)
2071
2072         vcodec = None
2073         mime_type = xpath_text(
2074             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2075             'base URL', default=None)
2076         if mime_type and mime_type.startswith('audio/'):
2077             vcodec = 'none'
2078
2079         for i, media_el in enumerate(media_nodes):
2080             tbr = int_or_none(media_el.attrib.get('bitrate'))
2081             width = int_or_none(media_el.attrib.get('width'))
2082             height = int_or_none(media_el.attrib.get('height'))
2083             format_id = join_nonempty(f4m_id, tbr or i)
2084             # If <bootstrapInfo> is present, the specified f4m is a
2085             # stream-level manifest, and only set-level manifests may refer to
2086             # external resources.  See section 11.4 and section 4 of F4M spec
2087             if bootstrap_info is None:
2088                 media_url = None
2089                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2090                 if manifest_version == '2.0':
2091                     media_url = media_el.attrib.get('href')
2092                 if media_url is None:
2093                     media_url = media_el.attrib.get('url')
2094                 if not media_url:
2095                     continue
2096                 manifest_url = (
2097                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2098                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2099                 # If media_url is itself a f4m manifest do the recursive extraction
2100                 # since bitrates in parent manifest (this one) and media_url manifest
2101                 # may differ leading to inability to resolve the format by requested
2102                 # bitrate in f4m downloader
2103                 ext = determine_ext(manifest_url)
2104                 if ext == 'f4m':
2105                     f4m_formats = self._extract_f4m_formats(
2106                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2107                         transform_source=transform_source, fatal=fatal)
2108                     # Sometimes stream-level manifest contains single media entry that
2109                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2110                     # At the same time parent's media entry in set-level manifest may
2111                     # contain it. We will copy it from parent in such cases.
2112                     if len(f4m_formats) == 1:
2113                         f = f4m_formats[0]
2114                         f.update({
2115                             'tbr': f.get('tbr') or tbr,
2116                             'width': f.get('width') or width,
2117                             'height': f.get('height') or height,
2118                             'format_id': f.get('format_id') if not tbr else format_id,
2119                             'vcodec': vcodec,
2120                         })
2121                     formats.extend(f4m_formats)
2122                     continue
2123                 elif ext == 'm3u8':
2124                     formats.extend(self._extract_m3u8_formats(
2125                         manifest_url, video_id, 'mp4', preference=preference,
2126                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2127                     continue
2128             formats.append({
2129                 'format_id': format_id,
2130                 'url': manifest_url,
2131                 'manifest_url': manifest_url,
2132                 'ext': 'flv' if bootstrap_info is not None else None,
2133                 'protocol': 'f4m',
2134                 'tbr': tbr,
2135                 'width': width,
2136                 'height': height,
2137                 'vcodec': vcodec,
2138                 'preference': preference,
2139                 'quality': quality,
2140             })
2141         return formats
2142
2143     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2144         return {
2145             'format_id': join_nonempty(m3u8_id, 'meta'),
2146             'url': m3u8_url,
2147             'ext': ext,
2148             'protocol': 'm3u8',
2149             'preference': preference - 100 if preference else -100,
2150             'quality': quality,
2151             'resolution': 'multiple',
2152             'format_note': 'Quality selection URL',
2153         }
2154
2155     def _report_ignoring_subs(self, name):
2156         self.report_warning(bug_reports_message(
2157             f'Ignoring subtitle tracks found in the {name} manifest; '
2158             'if any subtitle tracks are missing,'
2159         ), only_once=True)
2160
2161     def _extract_m3u8_formats(self, *args, **kwargs):
2162         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2163         if subs:
2164             self._report_ignoring_subs('HLS')
2165         return fmts
2166
2167     def _extract_m3u8_formats_and_subtitles(
2168             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2169             preference=None, quality=None, m3u8_id=None, note=None,
2170             errnote=None, fatal=True, live=False, data=None, headers={},
2171             query={}):
2172
2173         res = self._download_webpage_handle(
2174             m3u8_url, video_id,
2175             note='Downloading m3u8 information' if note is None else note,
2176             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2177             fatal=fatal, data=data, headers=headers, query=query)
2178
2179         if res is False:
2180             return [], {}
2181
2182         m3u8_doc, urlh = res
2183         m3u8_url = urlh.geturl()
2184
2185         return self._parse_m3u8_formats_and_subtitles(
2186             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2187             preference=preference, quality=quality, m3u8_id=m3u8_id,
2188             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2189             headers=headers, query=query, video_id=video_id)
2190
2191     def _parse_m3u8_formats_and_subtitles(
2192             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2193             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2194             errnote=None, fatal=True, data=None, headers={}, query={},
2195             video_id=None):
2196         formats, subtitles = [], {}
2197
2198         has_drm = re.search('|'.join([
2199             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2200             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2201         ]), m3u8_doc)
2202
2203         def format_url(url):
2204             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2205
2206         if self.get_param('hls_split_discontinuity', False):
2207             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2208                 if not m3u8_doc:
2209                     if not manifest_url:
2210                         return []
2211                     m3u8_doc = self._download_webpage(
2212                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2213                         note=False, errnote='Failed to download m3u8 playlist information')
2214                     if m3u8_doc is False:
2215                         return []
2216                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2217
2218         else:
2219             def _extract_m3u8_playlist_indices(*args, **kwargs):
2220                 return [None]
2221
2222         # References:
2223         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2224         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2225         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2226
2227         # We should try extracting formats only from master playlists [1, 4.3.4],
2228         # i.e. playlists that describe available qualities. On the other hand
2229         # media playlists [1, 4.3.3] should be returned as is since they contain
2230         # just the media without qualities renditions.
2231         # Fortunately, master playlist can be easily distinguished from media
2232         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2233         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2234         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2235         # media playlist and MUST NOT appear in master playlist thus we can
2236         # clearly detect media playlist with this criterion.
2237
2238         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2239             formats = [{
2240                 'format_id': join_nonempty(m3u8_id, idx),
2241                 'format_index': idx,
2242                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2243                 'ext': ext,
2244                 'protocol': entry_protocol,
2245                 'preference': preference,
2246                 'quality': quality,
2247                 'has_drm': has_drm,
2248             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2249
2250             return formats, subtitles
2251
2252         groups = {}
2253         last_stream_inf = {}
2254
2255         def extract_media(x_media_line):
2256             media = parse_m3u8_attributes(x_media_line)
2257             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2258             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2259             if not (media_type and group_id and name):
2260                 return
2261             groups.setdefault(group_id, []).append(media)
2262             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2263             if media_type == 'SUBTITLES':
2264                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2265                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2266                 # However, lack of URI has been spotted in the wild.
2267                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2268                 if not media.get('URI'):
2269                     return
2270                 url = format_url(media['URI'])
2271                 sub_info = {
2272                     'url': url,
2273                     'ext': determine_ext(url),
2274                 }
2275                 if sub_info['ext'] == 'm3u8':
2276                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2277                     # files may contain is WebVTT:
2278                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2279                     sub_info['ext'] = 'vtt'
2280                     sub_info['protocol'] = 'm3u8_native'
2281                 lang = media.get('LANGUAGE') or 'und'
2282                 subtitles.setdefault(lang, []).append(sub_info)
2283             if media_type not in ('VIDEO', 'AUDIO'):
2284                 return
2285             media_url = media.get('URI')
2286             if media_url:
2287                 manifest_url = format_url(media_url)
2288                 formats.extend({
2289                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2290                     'format_note': name,
2291                     'format_index': idx,
2292                     'url': manifest_url,
2293                     'manifest_url': m3u8_url,
2294                     'language': media.get('LANGUAGE'),
2295                     'ext': ext,
2296                     'protocol': entry_protocol,
2297                     'preference': preference,
2298                     'quality': quality,
2299                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2300                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2301
2302         def build_stream_name():
2303             # Despite specification does not mention NAME attribute for
2304             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2305             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2306             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2307             stream_name = last_stream_inf.get('NAME')
2308             if stream_name:
2309                 return stream_name
2310             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2311             # from corresponding rendition group
2312             stream_group_id = last_stream_inf.get('VIDEO')
2313             if not stream_group_id:
2314                 return
2315             stream_group = groups.get(stream_group_id)
2316             if not stream_group:
2317                 return stream_group_id
2318             rendition = stream_group[0]
2319             return rendition.get('NAME') or stream_group_id
2320
2321         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2322         # chance to detect video only formats when EXT-X-STREAM-INF tags
2323         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2324         for line in m3u8_doc.splitlines():
2325             if line.startswith('#EXT-X-MEDIA:'):
2326                 extract_media(line)
2327
2328         for line in m3u8_doc.splitlines():
2329             if line.startswith('#EXT-X-STREAM-INF:'):
2330                 last_stream_inf = parse_m3u8_attributes(line)
2331             elif line.startswith('#') or not line.strip():
2332                 continue
2333             else:
2334                 tbr = float_or_none(
2335                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2336                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2337                 manifest_url = format_url(line.strip())
2338
2339                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2340                     format_id = [m3u8_id, None, idx]
2341                     # Bandwidth of live streams may differ over time thus making
2342                     # format_id unpredictable. So it's better to keep provided
2343                     # format_id intact.
2344                     if not live:
2345                         stream_name = build_stream_name()
2346                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2347                     f = {
2348                         'format_id': join_nonempty(*format_id),
2349                         'format_index': idx,
2350                         'url': manifest_url,
2351                         'manifest_url': m3u8_url,
2352                         'tbr': tbr,
2353                         'ext': ext,
2354                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2355                         'protocol': entry_protocol,
2356                         'preference': preference,
2357                         'quality': quality,
2358                     }
2359                     resolution = last_stream_inf.get('RESOLUTION')
2360                     if resolution:
2361                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2362                         if mobj:
2363                             f['width'] = int(mobj.group('width'))
2364                             f['height'] = int(mobj.group('height'))
2365                     # Unified Streaming Platform
2366                     mobj = re.search(
2367                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2368                     if mobj:
2369                         abr, vbr = mobj.groups()
2370                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2371                         f.update({
2372                             'vbr': vbr,
2373                             'abr': abr,
2374                         })
2375                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2376                     f.update(codecs)
2377                     audio_group_id = last_stream_inf.get('AUDIO')
2378                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2379                     # references a rendition group MUST have a CODECS attribute.
2380                     # However, this is not always respected. E.g. [2]
2381                     # contains EXT-X-STREAM-INF tag which references AUDIO
2382                     # rendition group but does not have CODECS and despite
2383                     # referencing an audio group it represents a complete
2384                     # (with audio and video) format. So, for such cases we will
2385                     # ignore references to rendition groups and treat them
2386                     # as complete formats.
2387                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2388                         audio_group = groups.get(audio_group_id)
2389                         if audio_group and audio_group[0].get('URI'):
2390                             # TODO: update acodec for audio only formats with
2391                             # the same GROUP-ID
2392                             f['acodec'] = 'none'
2393                     if not f.get('ext'):
2394                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2395                     formats.append(f)
2396
2397                     # for DailyMotion
2398                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2399                     if progressive_uri:
2400                         http_f = f.copy()
2401                         del http_f['manifest_url']
2402                         http_f.update({
2403                             'format_id': f['format_id'].replace('hls-', 'http-'),
2404                             'protocol': 'http',
2405                             'url': progressive_uri,
2406                         })
2407                         formats.append(http_f)
2408
2409                 last_stream_inf = {}
2410         return formats, subtitles
2411
2412     def _extract_m3u8_vod_duration(
2413             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2414
2415         m3u8_vod = self._download_webpage(
2416             m3u8_vod_url, video_id,
2417             note='Downloading m3u8 VOD manifest' if note is None else note,
2418             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2419             fatal=False, data=data, headers=headers, query=query)
2420
2421         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2422
2423     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2424         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2425             return None
2426
2427         return int(sum(
2428             float(line[len('#EXTINF:'):].split(',')[0])
2429             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2430
2431     @staticmethod
2432     def _xpath_ns(path, namespace=None):
2433         if not namespace:
2434             return path
2435         out = []
2436         for c in path.split('/'):
2437             if not c or c == '.':
2438                 out.append(c)
2439             else:
2440                 out.append('{%s}%s' % (namespace, c))
2441         return '/'.join(out)
2442
2443     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2444         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2445         if res is False:
2446             assert not fatal
2447             return [], {}
2448
2449         smil, urlh = res
2450         smil_url = urlh.geturl()
2451
2452         namespace = self._parse_smil_namespace(smil)
2453
2454         fmts = self._parse_smil_formats(
2455             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2456         subs = self._parse_smil_subtitles(
2457             smil, namespace=namespace)
2458
2459         return fmts, subs
2460
2461     def _extract_smil_formats(self, *args, **kwargs):
2462         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2463         if subs:
2464             self._report_ignoring_subs('SMIL')
2465         return fmts
2466
2467     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2468         res = self._download_smil(smil_url, video_id, fatal=fatal)
2469         if res is False:
2470             return {}
2471
2472         smil, urlh = res
2473         smil_url = urlh.geturl()
2474
2475         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2476
2477     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2478         return self._download_xml_handle(
2479             smil_url, video_id, 'Downloading SMIL file',
2480             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2481
2482     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2483         namespace = self._parse_smil_namespace(smil)
2484
2485         formats = self._parse_smil_formats(
2486             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2487         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2488
2489         video_id = os.path.splitext(url_basename(smil_url))[0]
2490         title = None
2491         description = None
2492         upload_date = None
2493         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2494             name = meta.attrib.get('name')
2495             content = meta.attrib.get('content')
2496             if not name or not content:
2497                 continue
2498             if not title and name == 'title':
2499                 title = content
2500             elif not description and name in ('description', 'abstract'):
2501                 description = content
2502             elif not upload_date and name == 'date':
2503                 upload_date = unified_strdate(content)
2504
2505         thumbnails = [{
2506             'id': image.get('type'),
2507             'url': image.get('src'),
2508             'width': int_or_none(image.get('width')),
2509             'height': int_or_none(image.get('height')),
2510         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2511
2512         return {
2513             'id': video_id,
2514             'title': title or video_id,
2515             'description': description,
2516             'upload_date': upload_date,
2517             'thumbnails': thumbnails,
2518             'formats': formats,
2519             'subtitles': subtitles,
2520         }
2521
2522     def _parse_smil_namespace(self, smil):
2523         return self._search_regex(
2524             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2525
2526     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2527         base = smil_url
2528         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2529             b = meta.get('base') or meta.get('httpBase')
2530             if b:
2531                 base = b
2532                 break
2533
2534         formats = []
2535         rtmp_count = 0
2536         http_count = 0
2537         m3u8_count = 0
2538         imgs_count = 0
2539
2540         srcs = set()
2541         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2542         for medium in media:
2543             src = medium.get('src')
2544             if not src or src in srcs:
2545                 continue
2546             srcs.add(src)
2547
2548             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2549             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2550             width = int_or_none(medium.get('width'))
2551             height = int_or_none(medium.get('height'))
2552             proto = medium.get('proto')
2553             ext = medium.get('ext')
2554             src_ext = determine_ext(src)
2555             streamer = medium.get('streamer') or base
2556
2557             if proto == 'rtmp' or streamer.startswith('rtmp'):
2558                 rtmp_count += 1
2559                 formats.append({
2560                     'url': streamer,
2561                     'play_path': src,
2562                     'ext': 'flv',
2563                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2564                     'tbr': bitrate,
2565                     'filesize': filesize,
2566                     'width': width,
2567                     'height': height,
2568                 })
2569                 if transform_rtmp_url:
2570                     streamer, src = transform_rtmp_url(streamer, src)
2571                     formats[-1].update({
2572                         'url': streamer,
2573                         'play_path': src,
2574                     })
2575                 continue
2576
2577             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2578             src_url = src_url.strip()
2579
2580             if proto == 'm3u8' or src_ext == 'm3u8':
2581                 m3u8_formats = self._extract_m3u8_formats(
2582                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2583                 if len(m3u8_formats) == 1:
2584                     m3u8_count += 1
2585                     m3u8_formats[0].update({
2586                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2587                         'tbr': bitrate,
2588                         'width': width,
2589                         'height': height,
2590                     })
2591                 formats.extend(m3u8_formats)
2592             elif src_ext == 'f4m':
2593                 f4m_url = src_url
2594                 if not f4m_params:
2595                     f4m_params = {
2596                         'hdcore': '3.2.0',
2597                         'plugin': 'flowplayer-3.2.0.1',
2598                     }
2599                 f4m_url += '&' if '?' in f4m_url else '?'
2600                 f4m_url += urllib.parse.urlencode(f4m_params)
2601                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2602             elif src_ext == 'mpd':
2603                 formats.extend(self._extract_mpd_formats(
2604                     src_url, video_id, mpd_id='dash', fatal=False))
2605             elif re.search(r'\.ism/[Mm]anifest', src_url):
2606                 formats.extend(self._extract_ism_formats(
2607                     src_url, video_id, ism_id='mss', fatal=False))
2608             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2609                 http_count += 1
2610                 formats.append({
2611                     'url': src_url,
2612                     'ext': ext or src_ext or 'flv',
2613                     'format_id': 'http-%d' % (bitrate or http_count),
2614                     'tbr': bitrate,
2615                     'filesize': filesize,
2616                     'width': width,
2617                     'height': height,
2618                 })
2619
2620         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2621             src = medium.get('src')
2622             if not src or src in srcs:
2623                 continue
2624             srcs.add(src)
2625
2626             imgs_count += 1
2627             formats.append({
2628                 'format_id': 'imagestream-%d' % (imgs_count),
2629                 'url': src,
2630                 'ext': mimetype2ext(medium.get('type')),
2631                 'acodec': 'none',
2632                 'vcodec': 'none',
2633                 'width': int_or_none(medium.get('width')),
2634                 'height': int_or_none(medium.get('height')),
2635                 'format_note': 'SMIL storyboards',
2636             })
2637
2638         return formats
2639
2640     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2641         urls = []
2642         subtitles = {}
2643         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2644             src = textstream.get('src')
2645             if not src or src in urls:
2646                 continue
2647             urls.append(src)
2648             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2649             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2650             subtitles.setdefault(lang, []).append({
2651                 'url': src,
2652                 'ext': ext,
2653             })
2654         return subtitles
2655
2656     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2657         res = self._download_xml_handle(
2658             xspf_url, playlist_id, 'Downloading xpsf playlist',
2659             'Unable to download xspf manifest', fatal=fatal)
2660         if res is False:
2661             return []
2662
2663         xspf, urlh = res
2664         xspf_url = urlh.geturl()
2665
2666         return self._parse_xspf(
2667             xspf, playlist_id, xspf_url=xspf_url,
2668             xspf_base_url=base_url(xspf_url))
2669
2670     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2671         NS_MAP = {
2672             'xspf': 'http://xspf.org/ns/0/',
2673             's1': 'http://static.streamone.nl/player/ns/0',
2674         }
2675
2676         entries = []
2677         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2678             title = xpath_text(
2679                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2680             description = xpath_text(
2681                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2682             thumbnail = xpath_text(
2683                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2684             duration = float_or_none(
2685                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2686
2687             formats = []
2688             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2689                 format_url = urljoin(xspf_base_url, location.text)
2690                 if not format_url:
2691                     continue
2692                 formats.append({
2693                     'url': format_url,
2694                     'manifest_url': xspf_url,
2695                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2696                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2697                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2698                 })
2699             self._sort_formats(formats)
2700
2701             entries.append({
2702                 'id': playlist_id,
2703                 'title': title,
2704                 'description': description,
2705                 'thumbnail': thumbnail,
2706                 'duration': duration,
2707                 'formats': formats,
2708             })
2709         return entries
2710
2711     def _extract_mpd_formats(self, *args, **kwargs):
2712         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2713         if subs:
2714             self._report_ignoring_subs('DASH')
2715         return fmts
2716
2717     def _extract_mpd_formats_and_subtitles(
2718             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2719             fatal=True, data=None, headers={}, query={}):
2720         res = self._download_xml_handle(
2721             mpd_url, video_id,
2722             note='Downloading MPD manifest' if note is None else note,
2723             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2724             fatal=fatal, data=data, headers=headers, query=query)
2725         if res is False:
2726             return [], {}
2727         mpd_doc, urlh = res
2728         if mpd_doc is None:
2729             return [], {}
2730
2731         # We could have been redirected to a new url when we retrieved our mpd file.
2732         mpd_url = urlh.geturl()
2733         mpd_base_url = base_url(mpd_url)
2734
2735         return self._parse_mpd_formats_and_subtitles(
2736             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2737
2738     def _parse_mpd_formats(self, *args, **kwargs):
2739         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2740         if subs:
2741             self._report_ignoring_subs('DASH')
2742         return fmts
2743
2744     def _parse_mpd_formats_and_subtitles(
2745             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2746         """
2747         Parse formats from MPD manifest.
2748         References:
2749          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2750             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2751          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2752         """
2753         if not self.get_param('dynamic_mpd', True):
2754             if mpd_doc.get('type') == 'dynamic':
2755                 return [], {}
2756
2757         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2758
2759         def _add_ns(path):
2760             return self._xpath_ns(path, namespace)
2761
2762         def is_drm_protected(element):
2763             return element.find(_add_ns('ContentProtection')) is not None
2764
2765         def extract_multisegment_info(element, ms_parent_info):
2766             ms_info = ms_parent_info.copy()
2767
2768             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2769             # common attributes and elements.  We will only extract relevant
2770             # for us.
2771             def extract_common(source):
2772                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2773                 if segment_timeline is not None:
2774                     s_e = segment_timeline.findall(_add_ns('S'))
2775                     if s_e:
2776                         ms_info['total_number'] = 0
2777                         ms_info['s'] = []
2778                         for s in s_e:
2779                             r = int(s.get('r', 0))
2780                             ms_info['total_number'] += 1 + r
2781                             ms_info['s'].append({
2782                                 't': int(s.get('t', 0)),
2783                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2784                                 'd': int(s.attrib['d']),
2785                                 'r': r,
2786                             })
2787                 start_number = source.get('startNumber')
2788                 if start_number:
2789                     ms_info['start_number'] = int(start_number)
2790                 timescale = source.get('timescale')
2791                 if timescale:
2792                     ms_info['timescale'] = int(timescale)
2793                 segment_duration = source.get('duration')
2794                 if segment_duration:
2795                     ms_info['segment_duration'] = float(segment_duration)
2796
2797             def extract_Initialization(source):
2798                 initialization = source.find(_add_ns('Initialization'))
2799                 if initialization is not None:
2800                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2801
2802             segment_list = element.find(_add_ns('SegmentList'))
2803             if segment_list is not None:
2804                 extract_common(segment_list)
2805                 extract_Initialization(segment_list)
2806                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2807                 if segment_urls_e:
2808                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2809             else:
2810                 segment_template = element.find(_add_ns('SegmentTemplate'))
2811                 if segment_template is not None:
2812                     extract_common(segment_template)
2813                     media = segment_template.get('media')
2814                     if media:
2815                         ms_info['media'] = media
2816                     initialization = segment_template.get('initialization')
2817                     if initialization:
2818                         ms_info['initialization'] = initialization
2819                     else:
2820                         extract_Initialization(segment_template)
2821             return ms_info
2822
2823         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2824         formats, subtitles = [], {}
2825         stream_numbers = collections.defaultdict(int)
2826         for period in mpd_doc.findall(_add_ns('Period')):
2827             period_duration = parse_duration(period.get('duration')) or mpd_duration
2828             period_ms_info = extract_multisegment_info(period, {
2829                 'start_number': 1,
2830                 'timescale': 1,
2831             })
2832             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2833                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2834                 for representation in adaptation_set.findall(_add_ns('Representation')):
2835                     representation_attrib = adaptation_set.attrib.copy()
2836                     representation_attrib.update(representation.attrib)
2837                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2838                     mime_type = representation_attrib['mimeType']
2839                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2840
2841                     codec_str = representation_attrib.get('codecs', '')
2842                     # Some kind of binary subtitle found in some youtube livestreams
2843                     if mime_type == 'application/x-rawcc':
2844                         codecs = {'scodec': codec_str}
2845                     else:
2846                         codecs = parse_codecs(codec_str)
2847                     if content_type not in ('video', 'audio', 'text'):
2848                         if mime_type == 'image/jpeg':
2849                             content_type = mime_type
2850                         elif codecs.get('vcodec', 'none') != 'none':
2851                             content_type = 'video'
2852                         elif codecs.get('acodec', 'none') != 'none':
2853                             content_type = 'audio'
2854                         elif codecs.get('scodec', 'none') != 'none':
2855                             content_type = 'text'
2856                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2857                             content_type = 'text'
2858                         else:
2859                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2860                             continue
2861
2862                     base_url = ''
2863                     for element in (representation, adaptation_set, period, mpd_doc):
2864                         base_url_e = element.find(_add_ns('BaseURL'))
2865                         if try_call(lambda: base_url_e.text) is not None:
2866                             base_url = base_url_e.text + base_url
2867                             if re.match(r'^https?://', base_url):
2868                                 break
2869                     if mpd_base_url and base_url.startswith('/'):
2870                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2871                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2872                         if not mpd_base_url.endswith('/'):
2873                             mpd_base_url += '/'
2874                         base_url = mpd_base_url + base_url
2875                     representation_id = representation_attrib.get('id')
2876                     lang = representation_attrib.get('lang')
2877                     url_el = representation.find(_add_ns('BaseURL'))
2878                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2879                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2880                     if representation_id is not None:
2881                         format_id = representation_id
2882                     else:
2883                         format_id = content_type
2884                     if mpd_id:
2885                         format_id = mpd_id + '-' + format_id
2886                     if content_type in ('video', 'audio'):
2887                         f = {
2888                             'format_id': format_id,
2889                             'manifest_url': mpd_url,
2890                             'ext': mimetype2ext(mime_type),
2891                             'width': int_or_none(representation_attrib.get('width')),
2892                             'height': int_or_none(representation_attrib.get('height')),
2893                             'tbr': float_or_none(bandwidth, 1000),
2894                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2895                             'fps': int_or_none(representation_attrib.get('frameRate')),
2896                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2897                             'format_note': 'DASH %s' % content_type,
2898                             'filesize': filesize,
2899                             'container': mimetype2ext(mime_type) + '_dash',
2900                             **codecs
2901                         }
2902                     elif content_type == 'text':
2903                         f = {
2904                             'ext': mimetype2ext(mime_type),
2905                             'manifest_url': mpd_url,
2906                             'filesize': filesize,
2907                         }
2908                     elif content_type == 'image/jpeg':
2909                         # See test case in VikiIE
2910                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2911                         f = {
2912                             'format_id': format_id,
2913                             'ext': 'mhtml',
2914                             'manifest_url': mpd_url,
2915                             'format_note': 'DASH storyboards (jpeg)',
2916                             'acodec': 'none',
2917                             'vcodec': 'none',
2918                         }
2919                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2920                         f['has_drm'] = True
2921                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2922
2923                     def prepare_template(template_name, identifiers):
2924                         tmpl = representation_ms_info[template_name]
2925                         if representation_id is not None:
2926                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2927                         # First of, % characters outside $...$ templates
2928                         # must be escaped by doubling for proper processing
2929                         # by % operator string formatting used further (see
2930                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2931                         t = ''
2932                         in_template = False
2933                         for c in tmpl:
2934                             t += c
2935                             if c == '$':
2936                                 in_template = not in_template
2937                             elif c == '%' and not in_template:
2938                                 t += c
2939                         # Next, $...$ templates are translated to their
2940                         # %(...) counterparts to be used with % operator
2941                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2942                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2943                         t.replace('$$', '$')
2944                         return t
2945
2946                     # @initialization is a regular template like @media one
2947                     # so it should be handled just the same way (see
2948                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2949                     if 'initialization' in representation_ms_info:
2950                         initialization_template = prepare_template(
2951                             'initialization',
2952                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2953                             # $Time$ shall not be included for @initialization thus
2954                             # only $Bandwidth$ remains
2955                             ('Bandwidth', ))
2956                         representation_ms_info['initialization_url'] = initialization_template % {
2957                             'Bandwidth': bandwidth,
2958                         }
2959
2960                     def location_key(location):
2961                         return 'url' if re.match(r'^https?://', location) else 'path'
2962
2963                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2964
2965                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2966                         media_location_key = location_key(media_template)
2967
2968                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2969                         # can't be used at the same time
2970                         if '%(Number' in media_template and 's' not in representation_ms_info:
2971                             segment_duration = None
2972                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2973                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2974                                 representation_ms_info['total_number'] = int(math.ceil(
2975                                     float_or_none(period_duration, segment_duration, default=0)))
2976                             representation_ms_info['fragments'] = [{
2977                                 media_location_key: media_template % {
2978                                     'Number': segment_number,
2979                                     'Bandwidth': bandwidth,
2980                                 },
2981                                 'duration': segment_duration,
2982                             } for segment_number in range(
2983                                 representation_ms_info['start_number'],
2984                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2985                         else:
2986                             # $Number*$ or $Time$ in media template with S list available
2987                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2988                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2989                             representation_ms_info['fragments'] = []
2990                             segment_time = 0
2991                             segment_d = None
2992                             segment_number = representation_ms_info['start_number']
2993
2994                             def add_segment_url():
2995                                 segment_url = media_template % {
2996                                     'Time': segment_time,
2997                                     'Bandwidth': bandwidth,
2998                                     'Number': segment_number,
2999                                 }
3000                                 representation_ms_info['fragments'].append({
3001                                     media_location_key: segment_url,
3002                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
3003                                 })
3004
3005                             for num, s in enumerate(representation_ms_info['s']):
3006                                 segment_time = s.get('t') or segment_time
3007                                 segment_d = s['d']
3008                                 add_segment_url()
3009                                 segment_number += 1
3010                                 for r in range(s.get('r', 0)):
3011                                     segment_time += segment_d
3012                                     add_segment_url()
3013                                     segment_number += 1
3014                                 segment_time += segment_d
3015                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3016                         # No media template,
3017                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3018                         # or any YouTube dashsegments video
3019                         fragments = []
3020                         segment_index = 0
3021                         timescale = representation_ms_info['timescale']
3022                         for s in representation_ms_info['s']:
3023                             duration = float_or_none(s['d'], timescale)
3024                             for r in range(s.get('r', 0) + 1):
3025                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3026                                 fragments.append({
3027                                     location_key(segment_uri): segment_uri,
3028                                     'duration': duration,
3029                                 })
3030                                 segment_index += 1
3031                         representation_ms_info['fragments'] = fragments
3032                     elif 'segment_urls' in representation_ms_info:
3033                         # Segment URLs with no SegmentTimeline
3034                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3035                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3036                         fragments = []
3037                         segment_duration = float_or_none(
3038                             representation_ms_info['segment_duration'],
3039                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3040                         for segment_url in representation_ms_info['segment_urls']:
3041                             fragment = {
3042                                 location_key(segment_url): segment_url,
3043                             }
3044                             if segment_duration:
3045                                 fragment['duration'] = segment_duration
3046                             fragments.append(fragment)
3047                         representation_ms_info['fragments'] = fragments
3048                     # If there is a fragments key available then we correctly recognized fragmented media.
3049                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3050                     # assumption is not necessarily correct since we may simply have no support for
3051                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3052                     if 'fragments' in representation_ms_info:
3053                         f.update({
3054                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3055                             'url': mpd_url or base_url,
3056                             'fragment_base_url': base_url,
3057                             'fragments': [],
3058                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3059                         })
3060                         if 'initialization_url' in representation_ms_info:
3061                             initialization_url = representation_ms_info['initialization_url']
3062                             if not f.get('url'):
3063                                 f['url'] = initialization_url
3064                             f['fragments'].append({location_key(initialization_url): initialization_url})
3065                         f['fragments'].extend(representation_ms_info['fragments'])
3066                         if not period_duration:
3067                             period_duration = try_get(
3068                                 representation_ms_info,
3069                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3070                     else:
3071                         # Assuming direct URL to unfragmented media.
3072                         f['url'] = base_url
3073                     if content_type in ('video', 'audio', 'image/jpeg'):
3074                         f['manifest_stream_number'] = stream_numbers[f['url']]
3075                         stream_numbers[f['url']] += 1
3076                         formats.append(f)
3077                     elif content_type == 'text':
3078                         subtitles.setdefault(lang or 'und', []).append(f)
3079
3080         return formats, subtitles
3081
3082     def _extract_ism_formats(self, *args, **kwargs):
3083         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3084         if subs:
3085             self._report_ignoring_subs('ISM')
3086         return fmts
3087
3088     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3089         res = self._download_xml_handle(
3090             ism_url, video_id,
3091             note='Downloading ISM manifest' if note is None else note,
3092             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3093             fatal=fatal, data=data, headers=headers, query=query)
3094         if res is False:
3095             return [], {}
3096         ism_doc, urlh = res
3097         if ism_doc is None:
3098             return [], {}
3099
3100         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3101
3102     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3103         """
3104         Parse formats from ISM manifest.
3105         References:
3106          1. [MS-SSTR]: Smooth Streaming Protocol,
3107             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3108         """
3109         if ism_doc.get('IsLive') == 'TRUE':
3110             return [], {}
3111
3112         duration = int(ism_doc.attrib['Duration'])
3113         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3114
3115         formats = []
3116         subtitles = {}
3117         for stream in ism_doc.findall('StreamIndex'):
3118             stream_type = stream.get('Type')
3119             if stream_type not in ('video', 'audio', 'text'):
3120                 continue
3121             url_pattern = stream.attrib['Url']
3122             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3123             stream_name = stream.get('Name')
3124             stream_language = stream.get('Language', 'und')
3125             for track in stream.findall('QualityLevel'):
3126                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3127                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3128                 # TODO: add support for WVC1 and WMAP
3129                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3130                     self.report_warning('%s is not a supported codec' % fourcc)
3131                     continue
3132                 tbr = int(track.attrib['Bitrate']) // 1000
3133                 # [1] does not mention Width and Height attributes. However,
3134                 # they're often present while MaxWidth and MaxHeight are
3135                 # missing, so should be used as fallbacks
3136                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3137                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3138                 sampling_rate = int_or_none(track.get('SamplingRate'))
3139
3140                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3141                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3142
3143                 fragments = []
3144                 fragment_ctx = {
3145                     'time': 0,
3146                 }
3147                 stream_fragments = stream.findall('c')
3148                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3149                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3150                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3151                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3152                     if not fragment_ctx['duration']:
3153                         try:
3154                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3155                         except IndexError:
3156                             next_fragment_time = duration
3157                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3158                     for _ in range(fragment_repeat):
3159                         fragments.append({
3160                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3161                             'duration': fragment_ctx['duration'] / stream_timescale,
3162                         })
3163                         fragment_ctx['time'] += fragment_ctx['duration']
3164
3165                 if stream_type == 'text':
3166                     subtitles.setdefault(stream_language, []).append({
3167                         'ext': 'ismt',
3168                         'protocol': 'ism',
3169                         'url': ism_url,
3170                         'manifest_url': ism_url,
3171                         'fragments': fragments,
3172                         '_download_params': {
3173                             'stream_type': stream_type,
3174                             'duration': duration,
3175                             'timescale': stream_timescale,
3176                             'fourcc': fourcc,
3177                             'language': stream_language,
3178                             'codec_private_data': track.get('CodecPrivateData'),
3179                         }
3180                     })
3181                 elif stream_type in ('video', 'audio'):
3182                     formats.append({
3183                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3184                         'url': ism_url,
3185                         'manifest_url': ism_url,
3186                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3187                         'width': width,
3188                         'height': height,
3189                         'tbr': tbr,
3190                         'asr': sampling_rate,
3191                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3192                         'acodec': 'none' if stream_type == 'video' else fourcc,
3193                         'protocol': 'ism',
3194                         'fragments': fragments,
3195                         'has_drm': ism_doc.find('Protection') is not None,
3196                         '_download_params': {
3197                             'stream_type': stream_type,
3198                             'duration': duration,
3199                             'timescale': stream_timescale,
3200                             'width': width or 0,
3201                             'height': height or 0,
3202                             'fourcc': fourcc,
3203                             'language': stream_language,
3204                             'codec_private_data': track.get('CodecPrivateData'),
3205                             'sampling_rate': sampling_rate,
3206                             'channels': int_or_none(track.get('Channels', 2)),
3207                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3208                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3209                         },
3210                     })
3211         return formats, subtitles
3212
3213     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3214         def absolute_url(item_url):
3215             return urljoin(base_url, item_url)
3216
3217         def parse_content_type(content_type):
3218             if not content_type:
3219                 return {}
3220             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3221             if ctr:
3222                 mimetype, codecs = ctr.groups()
3223                 f = parse_codecs(codecs)
3224                 f['ext'] = mimetype2ext(mimetype)
3225                 return f
3226             return {}
3227
3228         def _media_formats(src, cur_media_type, type_info=None):
3229             type_info = type_info or {}
3230             full_url = absolute_url(src)
3231             ext = type_info.get('ext') or determine_ext(full_url)
3232             if ext == 'm3u8':
3233                 is_plain_url = False
3234                 formats = self._extract_m3u8_formats(
3235                     full_url, video_id, ext='mp4',
3236                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3237                     preference=preference, quality=quality, fatal=False)
3238             elif ext == 'mpd':
3239                 is_plain_url = False
3240                 formats = self._extract_mpd_formats(
3241                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3242             else:
3243                 is_plain_url = True
3244                 formats = [{
3245                     'url': full_url,
3246                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3247                     'ext': ext,
3248                 }]
3249             return is_plain_url, formats
3250
3251         entries = []
3252         # amp-video and amp-audio are very similar to their HTML5 counterparts
3253         # so we will include them right here (see
3254         # https://www.ampproject.org/docs/reference/components/amp-video)
3255         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3256         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3257         media_tags = [(media_tag, media_tag_name, media_type, '')
3258                       for media_tag, media_tag_name, media_type
3259                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3260         media_tags.extend(re.findall(
3261             # We only allow video|audio followed by a whitespace or '>'.
3262             # Allowing more characters may end up in significant slow down (see
3263             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3264             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3265             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3266         for media_tag, _, media_type, media_content in media_tags:
3267             media_info = {
3268                 'formats': [],
3269                 'subtitles': {},
3270             }
3271             media_attributes = extract_attributes(media_tag)
3272             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3273             if src:
3274                 f = parse_content_type(media_attributes.get('type'))
3275                 _, formats = _media_formats(src, media_type, f)
3276                 media_info['formats'].extend(formats)
3277             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3278             if media_content:
3279                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3280                     s_attr = extract_attributes(source_tag)
3281                     # data-video-src and data-src are non standard but seen
3282                     # several times in the wild
3283                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3284                     if not src:
3285                         continue
3286                     f = parse_content_type(s_attr.get('type'))
3287                     is_plain_url, formats = _media_formats(src, media_type, f)
3288                     if is_plain_url:
3289                         # width, height, res, label and title attributes are
3290                         # all not standard but seen several times in the wild
3291                         labels = [
3292                             s_attr.get(lbl)
3293                             for lbl in ('label', 'title')
3294                             if str_or_none(s_attr.get(lbl))
3295                         ]
3296                         width = int_or_none(s_attr.get('width'))
3297                         height = (int_or_none(s_attr.get('height'))
3298                                   or int_or_none(s_attr.get('res')))
3299                         if not width or not height:
3300                             for lbl in labels:
3301                                 resolution = parse_resolution(lbl)
3302                                 if not resolution:
3303                                     continue
3304                                 width = width or resolution.get('width')
3305                                 height = height or resolution.get('height')
3306                         for lbl in labels:
3307                             tbr = parse_bitrate(lbl)
3308                             if tbr:
3309                                 break
3310                         else:
3311                             tbr = None
3312                         f.update({
3313                             'width': width,
3314                             'height': height,
3315                             'tbr': tbr,
3316                             'format_id': s_attr.get('label') or s_attr.get('title'),
3317                         })
3318                         f.update(formats[0])
3319                         media_info['formats'].append(f)
3320                     else:
3321                         media_info['formats'].extend(formats)
3322                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3323                     track_attributes = extract_attributes(track_tag)
3324                     kind = track_attributes.get('kind')
3325                     if not kind or kind in ('subtitles', 'captions'):
3326                         src = strip_or_none(track_attributes.get('src'))
3327                         if not src:
3328                             continue
3329                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3330                         media_info['subtitles'].setdefault(lang, []).append({
3331                             'url': absolute_url(src),
3332                         })
3333             for f in media_info['formats']:
3334                 f.setdefault('http_headers', {})['Referer'] = base_url
3335             if media_info['formats'] or media_info['subtitles']:
3336                 entries.append(media_info)
3337         return entries
3338
3339     def _extract_akamai_formats(self, *args, **kwargs):
3340         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3341         if subs:
3342             self._report_ignoring_subs('akamai')
3343         return fmts
3344
3345     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3346         signed = 'hdnea=' in manifest_url
3347         if not signed:
3348             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3349             manifest_url = re.sub(
3350                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3351                 '', manifest_url).strip('?')
3352
3353         formats = []
3354         subtitles = {}
3355
3356         hdcore_sign = 'hdcore=3.7.0'
3357         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3358         hds_host = hosts.get('hds')
3359         if hds_host:
3360             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3361         if 'hdcore=' not in f4m_url:
3362             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3363         f4m_formats = self._extract_f4m_formats(
3364             f4m_url, video_id, f4m_id='hds', fatal=False)
3365         for entry in f4m_formats:
3366             entry.update({'extra_param_to_segment_url': hdcore_sign})
3367         formats.extend(f4m_formats)
3368
3369         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3370         hls_host = hosts.get('hls')
3371         if hls_host:
3372             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3373         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3374             m3u8_url, video_id, 'mp4', 'm3u8_native',
3375             m3u8_id='hls', fatal=False)
3376         formats.extend(m3u8_formats)
3377         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3378
3379         http_host = hosts.get('http')
3380         if http_host and m3u8_formats and not signed:
3381             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3382             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3383             qualities_length = len(qualities)
3384             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3385                 i = 0
3386                 for f in m3u8_formats:
3387                     if f['vcodec'] != 'none':
3388                         for protocol in ('http', 'https'):
3389                             http_f = f.copy()
3390                             del http_f['manifest_url']
3391                             http_url = re.sub(
3392                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3393                             http_f.update({
3394                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3395                                 'url': http_url,
3396                                 'protocol': protocol,
3397                             })
3398                             formats.append(http_f)
3399                         i += 1
3400
3401         return formats, subtitles
3402
3403     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3404         query = urllib.parse.urlparse(url).query
3405         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3406         mobj = re.search(
3407             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3408         url_base = mobj.group('url')
3409         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3410         formats = []
3411
3412         def manifest_url(manifest):
3413             m_url = f'{http_base_url}/{manifest}'
3414             if query:
3415                 m_url += '?%s' % query
3416             return m_url
3417
3418         if 'm3u8' not in skip_protocols:
3419             formats.extend(self._extract_m3u8_formats(
3420                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3421                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3422         if 'f4m' not in skip_protocols:
3423             formats.extend(self._extract_f4m_formats(
3424                 manifest_url('manifest.f4m'),
3425                 video_id, f4m_id='hds', fatal=False))
3426         if 'dash' not in skip_protocols:
3427             formats.extend(self._extract_mpd_formats(
3428                 manifest_url('manifest.mpd'),
3429                 video_id, mpd_id='dash', fatal=False))
3430         if re.search(r'(?:/smil:|\.smil)', url_base):
3431             if 'smil' not in skip_protocols:
3432                 rtmp_formats = self._extract_smil_formats(
3433                     manifest_url('jwplayer.smil'),
3434                     video_id, fatal=False)
3435                 for rtmp_format in rtmp_formats:
3436                     rtsp_format = rtmp_format.copy()
3437                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3438                     del rtsp_format['play_path']
3439                     del rtsp_format['ext']
3440                     rtsp_format.update({
3441                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3442                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3443                         'protocol': 'rtsp',
3444                     })
3445                     formats.extend([rtmp_format, rtsp_format])
3446         else:
3447             for protocol in ('rtmp', 'rtsp'):
3448                 if protocol not in skip_protocols:
3449                     formats.append({
3450                         'url': f'{protocol}:{url_base}',
3451                         'format_id': protocol,
3452                         'protocol': protocol,
3453                     })
3454         return formats
3455
3456     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3457         mobj = re.search(
3458             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3459             webpage)
3460         if mobj:
3461             try:
3462                 jwplayer_data = self._parse_json(mobj.group('options'),
3463                                                  video_id=video_id,
3464                                                  transform_source=transform_source)
3465             except ExtractorError:
3466                 pass
3467             else:
3468                 if isinstance(jwplayer_data, dict):
3469                     return jwplayer_data
3470
3471     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3472         jwplayer_data = self._find_jwplayer_data(
3473             webpage, video_id, transform_source=js_to_json)
3474         return self._parse_jwplayer_data(
3475             jwplayer_data, video_id, *args, **kwargs)
3476
3477     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3478                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3479         # JWPlayer backward compatibility: flattened playlists
3480         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3481         if 'playlist' not in jwplayer_data:
3482             jwplayer_data = {'playlist': [jwplayer_data]}
3483
3484         entries = []
3485
3486         # JWPlayer backward compatibility: single playlist item
3487         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3488         if not isinstance(jwplayer_data['playlist'], list):
3489             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3490
3491         for video_data in jwplayer_data['playlist']:
3492             # JWPlayer backward compatibility: flattened sources
3493             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3494             if 'sources' not in video_data:
3495                 video_data['sources'] = [video_data]
3496
3497             this_video_id = video_id or video_data['mediaid']
3498
3499             formats = self._parse_jwplayer_formats(
3500                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3501                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3502
3503             subtitles = {}
3504             tracks = video_data.get('tracks')
3505             if tracks and isinstance(tracks, list):
3506                 for track in tracks:
3507                     if not isinstance(track, dict):
3508                         continue
3509                     track_kind = track.get('kind')
3510                     if not track_kind or not isinstance(track_kind, str):
3511                         continue
3512                     if track_kind.lower() not in ('captions', 'subtitles'):
3513                         continue
3514                     track_url = urljoin(base_url, track.get('file'))
3515                     if not track_url:
3516                         continue
3517                     subtitles.setdefault(track.get('label') or 'en', []).append({
3518                         'url': self._proto_relative_url(track_url)
3519                     })
3520
3521             entry = {
3522                 'id': this_video_id,
3523                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3524                 'description': clean_html(video_data.get('description')),
3525                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3526                 'timestamp': int_or_none(video_data.get('pubdate')),
3527                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3528                 'subtitles': subtitles,
3529             }
3530             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3531             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3532                 entry.update({
3533                     '_type': 'url_transparent',
3534                     'url': formats[0]['url'],
3535                 })
3536             else:
3537                 self._sort_formats(formats)
3538                 entry['formats'] = formats
3539             entries.append(entry)
3540         if len(entries) == 1:
3541             return entries[0]
3542         else:
3543             return self.playlist_result(entries)
3544
3545     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3546                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3547         urls = []
3548         formats = []
3549         for source in jwplayer_sources_data:
3550             if not isinstance(source, dict):
3551                 continue
3552             source_url = urljoin(
3553                 base_url, self._proto_relative_url(source.get('file')))
3554             if not source_url or source_url in urls:
3555                 continue
3556             urls.append(source_url)
3557             source_type = source.get('type') or ''
3558             ext = mimetype2ext(source_type) or determine_ext(source_url)
3559             if source_type == 'hls' or ext == 'm3u8':
3560                 formats.extend(self._extract_m3u8_formats(
3561                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3562                     m3u8_id=m3u8_id, fatal=False))
3563             elif source_type == 'dash' or ext == 'mpd':
3564                 formats.extend(self._extract_mpd_formats(
3565                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3566             elif ext == 'smil':
3567                 formats.extend(self._extract_smil_formats(
3568                     source_url, video_id, fatal=False))
3569             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3570             elif source_type.startswith('audio') or ext in (
3571                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3572                 formats.append({
3573                     'url': source_url,
3574                     'vcodec': 'none',
3575                     'ext': ext,
3576                 })
3577             else:
3578                 height = int_or_none(source.get('height'))
3579                 if height is None:
3580                     # Often no height is provided but there is a label in
3581                     # format like "1080p", "720p SD", or 1080.
3582                     height = int_or_none(self._search_regex(
3583                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3584                         'height', default=None))
3585                 a_format = {
3586                     'url': source_url,
3587                     'width': int_or_none(source.get('width')),
3588                     'height': height,
3589                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3590                     'filesize': int_or_none(source.get('filesize')),
3591                     'ext': ext,
3592                 }
3593                 if source_url.startswith('rtmp'):
3594                     a_format['ext'] = 'flv'
3595                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3596                     # of jwplayer.flash.swf
3597                     rtmp_url_parts = re.split(
3598                         r'((?:mp4|mp3|flv):)', source_url, 1)
3599                     if len(rtmp_url_parts) == 3:
3600                         rtmp_url, prefix, play_path = rtmp_url_parts
3601                         a_format.update({
3602                             'url': rtmp_url,
3603                             'play_path': prefix + play_path,
3604                         })
3605                     if rtmp_params:
3606                         a_format.update(rtmp_params)
3607                 formats.append(a_format)
3608         return formats
3609
3610     def _live_title(self, name):
3611         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3612         return name
3613
3614     def _int(self, v, name, fatal=False, **kwargs):
3615         res = int_or_none(v, **kwargs)
3616         if res is None:
3617             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3618             if fatal:
3619                 raise ExtractorError(msg)
3620             else:
3621                 self.report_warning(msg)
3622         return res
3623
3624     def _float(self, v, name, fatal=False, **kwargs):
3625         res = float_or_none(v, **kwargs)
3626         if res is None:
3627             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3628             if fatal:
3629                 raise ExtractorError(msg)
3630             else:
3631                 self.report_warning(msg)
3632         return res
3633
3634     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3635                     path='/', secure=False, discard=False, rest={}, **kwargs):
3636         cookie = http.cookiejar.Cookie(
3637             0, name, value, port, port is not None, domain, True,
3638             domain.startswith('.'), path, True, secure, expire_time,
3639             discard, None, None, rest)
3640         self.cookiejar.set_cookie(cookie)
3641
3642     def _get_cookies(self, url):
3643         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3644         return LenientSimpleCookie(self._downloader._calc_cookies(url))
3645
3646     def _apply_first_set_cookie_header(self, url_handle, cookie):
3647         """
3648         Apply first Set-Cookie header instead of the last. Experimental.
3649
3650         Some sites (e.g. [1-3]) may serve two cookies under the same name
3651         in Set-Cookie header and expect the first (old) one to be set rather
3652         than second (new). However, as of RFC6265 the newer one cookie
3653         should be set into cookie store what actually happens.
3654         We will workaround this issue by resetting the cookie to
3655         the first one manually.
3656         1. https://new.vk.com/
3657         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3658         3. https://learning.oreilly.com/
3659         """
3660         for header, cookies in url_handle.headers.items():
3661             if header.lower() != 'set-cookie':
3662                 continue
3663             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3664             cookie_value = re.search(
3665                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3666             if cookie_value:
3667                 value, domain = cookie_value.groups()
3668                 self._set_cookie(domain, cookie, value)
3669                 break
3670
3671     @classmethod
3672     def get_testcases(cls, include_onlymatching=False):
3673         t = getattr(cls, '_TEST', None)
3674         if t:
3675             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3676             tests = [t]
3677         else:
3678             tests = getattr(cls, '_TESTS', [])
3679         for t in tests:
3680             if not include_onlymatching and t.get('only_matching', False):
3681                 continue
3682             t['name'] = cls.ie_key()
3683             yield t
3684
3685     @classmethod
3686     def get_webpage_testcases(cls):
3687         tests = getattr(cls, '_WEBPAGE_TESTS', [])
3688         for t in tests:
3689             t['name'] = cls.ie_key()
3690         return tests
3691
3692     @classproperty
3693     def age_limit(cls):
3694         """Get age limit from the testcases"""
3695         return max(traverse_obj(
3696             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3697             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3698
3699     @classmethod
3700     def is_suitable(cls, age_limit):
3701         """Test whether the extractor is generally suitable for the given age limit"""
3702         return not age_restricted(cls.age_limit, age_limit)
3703
3704     @classmethod
3705     def description(cls, *, markdown=True, search_examples=None):
3706         """Description of the extractor"""
3707         desc = ''
3708         if cls._NETRC_MACHINE:
3709             if markdown:
3710                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3711             else:
3712                 desc += f' [{cls._NETRC_MACHINE}]'
3713         if cls.IE_DESC is False:
3714             desc += ' [HIDDEN]'
3715         elif cls.IE_DESC:
3716             desc += f' {cls.IE_DESC}'
3717         if cls.SEARCH_KEY:
3718             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3719             if search_examples:
3720                 _COUNTS = ('', '5', '10', 'all')
3721                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3722         if not cls.working():
3723             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3724
3725         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3726         return f'{name}:{desc}' if desc else name
3727
3728     def extract_subtitles(self, *args, **kwargs):
3729         if (self.get_param('writesubtitles', False)
3730                 or self.get_param('listsubtitles')):
3731             return self._get_subtitles(*args, **kwargs)
3732         return {}
3733
3734     def _get_subtitles(self, *args, **kwargs):
3735         raise NotImplementedError('This method must be implemented by subclasses')
3736
3737     def extract_comments(self, *args, **kwargs):
3738         if not self.get_param('getcomments'):
3739             return None
3740         generator = self._get_comments(*args, **kwargs)
3741
3742         def extractor():
3743             comments = []
3744             interrupted = True
3745             try:
3746                 while True:
3747                     comments.append(next(generator))
3748             except StopIteration:
3749                 interrupted = False
3750             except KeyboardInterrupt:
3751                 self.to_screen('Interrupted by user')
3752             except Exception as e:
3753                 if self.get_param('ignoreerrors') is not True:
3754                     raise
3755                 self._downloader.report_error(e)
3756             comment_count = len(comments)
3757             self.to_screen(f'Extracted {comment_count} comments')
3758             return {
3759                 'comments': comments,
3760                 'comment_count': None if interrupted else comment_count
3761             }
3762         return extractor
3763
3764     def _get_comments(self, *args, **kwargs):
3765         raise NotImplementedError('This method must be implemented by subclasses')
3766
3767     @staticmethod
3768     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3769         """ Merge subtitle items for one language. Items with duplicated URLs/data
3770         will be dropped. """
3771         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3772         ret = list(subtitle_list1)
3773         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3774         return ret
3775
3776     @classmethod
3777     def _merge_subtitles(cls, *dicts, target=None):
3778         """ Merge subtitle dictionaries, language by language. """
3779         if target is None:
3780             target = {}
3781         for d in dicts:
3782             for lang, subs in d.items():
3783                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3784         return target
3785
3786     def extract_automatic_captions(self, *args, **kwargs):
3787         if (self.get_param('writeautomaticsub', False)
3788                 or self.get_param('listsubtitles')):
3789             return self._get_automatic_captions(*args, **kwargs)
3790         return {}
3791
3792     def _get_automatic_captions(self, *args, **kwargs):
3793         raise NotImplementedError('This method must be implemented by subclasses')
3794
3795     @functools.cached_property
3796     def _cookies_passed(self):
3797         """Whether cookies have been passed to YoutubeDL"""
3798         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3799
3800     def mark_watched(self, *args, **kwargs):
3801         if not self.get_param('mark_watched', False):
3802             return
3803         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3804             self._mark_watched(*args, **kwargs)
3805
3806     def _mark_watched(self, *args, **kwargs):
3807         raise NotImplementedError('This method must be implemented by subclasses')
3808
3809     def geo_verification_headers(self):
3810         headers = {}
3811         geo_verification_proxy = self.get_param('geo_verification_proxy')
3812         if geo_verification_proxy:
3813             headers['Ytdl-request-proxy'] = geo_verification_proxy
3814         return headers
3815
3816     @staticmethod
3817     def _generic_id(url):
3818         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3819
3820     @staticmethod
3821     def _generic_title(url):
3822         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3823
3824     @staticmethod
3825     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3826         all_known = all(map(
3827             lambda x: x is not None,
3828             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3829         return (
3830             'private' if is_private
3831             else 'premium_only' if needs_premium
3832             else 'subscriber_only' if needs_subscription
3833             else 'needs_auth' if needs_auth
3834             else 'unlisted' if is_unlisted
3835             else 'public' if all_known
3836             else None)
3837
3838     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3839         '''
3840         @returns            A list of values for the extractor argument given by "key"
3841                             or "default" if no such key is present
3842         @param default      The default value to return when the key is not present (default: [])
3843         @param casesense    When false, the values are converted to lower case
3844         '''
3845         val = traverse_obj(
3846             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3847         if val is None:
3848             return [] if default is NO_DEFAULT else default
3849         return list(val) if casesense else [x.lower() for x in val]
3850
3851     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3852         if not playlist_id or not video_id:
3853             return not video_id
3854
3855         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3856         if no_playlist is not None:
3857             return not no_playlist
3858
3859         video_id = '' if video_id is True else f' {video_id}'
3860         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3861         if self.get_param('noplaylist'):
3862             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3863             return False
3864         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3865         return True
3866
3867     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3868         RetryManager.report_retry(
3869             err, _count or int(fatal), _retries,
3870             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3871             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3872
3873     def RetryManager(self, **kwargs):
3874         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3875
3876     @classmethod
3877     def extract_from_webpage(cls, ydl, url, webpage):
3878         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3879               else ydl.get_info_extractor(cls.ie_key()))
3880         for info in ie._extract_from_webpage(url, webpage) or []:
3881             # url = None since we do not want to set (webpage/original)_url
3882             ydl.add_default_extra_info(info, ie, None)
3883             yield info
3884
3885     @classmethod
3886     def _extract_from_webpage(cls, url, webpage):
3887         for embed_url in orderedSet(
3888                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3889             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3890
3891     @classmethod
3892     def _extract_embed_urls(cls, url, webpage):
3893         """@returns all the embed urls on the webpage"""
3894         if '_EMBED_URL_RE' not in cls.__dict__:
3895             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3896             for idx, regex in enumerate(cls._EMBED_REGEX):
3897                 assert regex.count('(?P<url>') == 1, \
3898                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3899             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3900
3901         for regex in cls._EMBED_URL_RE:
3902             for mobj in regex.finditer(webpage):
3903                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3904                 if cls._VALID_URL is False or cls.suitable(embed_url):
3905                     yield embed_url
3906
3907     class StopExtraction(Exception):
3908         pass
3909
3910     @classmethod
3911     def _extract_url(cls, webpage):  # TODO: Remove
3912         """Only for compatibility with some older extractors"""
3913         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3914
    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """
        Subclass hook. When a subclass is declared with plugin_name=..., it
        takes over the name of the class it extends in that class's module.

        @param plugin_name  Optional name; appended to the parent's IE_NAME
                            as "<parent IE_NAME>+<plugin_name>"
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class following cls in its MRO is remembered as __wrapped__
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            # The new class keeps the parent's ie_key
            cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
            # Follow the __wrapped__ chain down to the innermost class ...
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # ... and rebind that class's name in its defining module to cls
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)

        return super().__init_subclass__(**kwargs)
3926
3927
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept queries of the form "<_SEARCH_KEY><prefix>:<query>" where the
    prefix is empty (first result only), a positive integer N (first N
    results, capped at _MAX_RESULTS) or "all" (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classproperty
    def _VALID_URL(cls):
        # Built from the subclass's _SEARCH_KEY each time it is accessed
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the prefix from the query and fetch the requested number of results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        if prefix == '':
            return self._get_n_results(query, 1)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError(f'invalid download number {requested} for query "{query}"')
        if requested > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice cannot take infinity as a bound; None means "no limit"
        limit = None if n == float('inf') else n
        results = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(results, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for _SEARCH_KEY"""
        return cls._SEARCH_KEY
3970
3971
class UnsupportedURLIE(InfoExtractor):
    # Matches every URL; extraction always fails with UnsupportedError
    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        unsupported = UnsupportedError(url)
        raise unsupported