yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader import FileDownloader
  27 from ..downloader.f4m import get_base_url, remove_encrypted_media
  28 from ..utils import (
  29     IDENTITY,
  30     JSON_LD_RE,
  31     NO_DEFAULT,
  32     ExtractorError,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     LenientJSONDecoder,
  36     RegexNotFoundError,
  37     RetryManager,
  38     UnsupportedError,
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     classproperty,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitize_url,
  68     sanitized_Request,
  69     smuggle_url,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     traverse_obj,
  74     try_call,
  75     try_get,
  76     unescapeHTML,
  77     unified_strdate,
  78     unified_timestamp,
  79     update_Request,
  80     update_url_query,
  81     url_basename,
  82     url_or_none,
  83     urljoin,
  84     variadic,
  85     xpath_element,
  86     xpath_text,
  87     xpath_with_ns,
  88 )
  89
  90
  91 class InfoExtractor:
  92     """Information Extractor class.
  93
  94     Information extractors are the classes that, given a URL, extract
  95     information about the video (or videos) the URL refers to. This
  96     information includes the real video URL, the video title, author and
  97     others. The information is stored in a dictionary which is then
  98     passed to the YoutubeDL. The YoutubeDL processes this
  99     information possibly downloading the video to the file system, among
 100     other possible outcomes.
 101
 102     The type field determines the type of the result.
 103     By far the most common value (and the default if _type is missing) is
 104     "video", which indicates a single video.
 105
 106     For a video, the dictionaries must include the following fields:
 107
 108     id:             Video identifier.
 109     title:          Video title, unescaped. Set to an empty string if video has
 110                     no title as opposed to "None" which signifies that the
 111                     extractor failed to obtain a title
 112
 113     Additionally, it must contain either a formats entry or a url one:
 114
 115     formats:        A list of dictionaries for each format available, ordered
 116                     from worst to best quality.
 117
 118                     Potential fields:
 119                     * url        The mandatory URL representing the media:
 120                                    for plain file media - HTTP URL of this file,
 121                                    for RTMP - RTMP URL,
 122                                    for HLS - URL of the M3U8 media playlist,
 123                                    for HDS - URL of the F4M manifest,
 124                                    for DASH
 125                                      - HTTP URL to plain file media (in case of
 126                                        unfragmented media)
 127                                      - URL of the MPD manifest or base URL
 128                                        representing the media if MPD manifest
 129                                        is parsed from a string (in case of
 130                                        fragmented media)
 131                                    for MSS - URL of the ISM manifest.
 132                     * manifest_url
 133                                  The URL of the manifest file in case of
 134                                  fragmented media:
 135                                    for HLS - URL of the M3U8 master playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH - URL of the MPD manifest,
 138                                    for MSS - URL of the ISM manifest.
 139                     * manifest_stream_number  (For internal use only)
 140                                  The index of the stream in the manifest file
 141                     * ext        Will be calculated from URL if missing
 142                     * format     A human-readable description of the format
 143                                  ("mp4 container with h264/opus").
  144                                  Calculated from the format_id, width, height,
 145                                  and format_note fields if missing.
 146                     * format_id  A short description of the format
 147                                  ("mp4_h264_opus" or "19").
 148                                 Technically optional, but strongly recommended.
 149                     * format_note Additional info about the format
 150                                  ("3D" or "DASH video")
 151                     * width      Width of the video, if known
 152                     * height     Height of the video, if known
 153                     * resolution Textual description of width and height
  154                     * dynamic_range The dynamic range of the video. One of:
  155                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 156                     * tbr        Average bitrate of audio and video in KBit/s
 157                     * abr        Average audio bitrate in KBit/s
 158                     * acodec     Name of the audio codec in use
 159                     * asr        Audio sampling rate in Hertz
 160                     * audio_channels  Number of audio channels
 161                     * vbr        Average video bitrate in KBit/s
 162                     * fps        Frame rate
 163                     * vcodec     Name of the video codec in use
 164                     * container  Name of the container format
 165                     * filesize   The number of bytes, if known in advance
 166                     * filesize_approx  An estimate for the number of bytes
 167                     * player_url SWF Player URL (used for rtmpdump).
 168                     * protocol   The protocol that will be used for the actual
 169                                  download, lower-case. One of "http", "https" or
 170                                  one of the protocols defined in downloader.PROTOCOL_MAP
 171                     * fragment_base_url
 172                                  Base URL for fragments. Each fragment's path
 173                                  value (if present) will be relative to
 174                                  this URL.
 175                     * fragments  A list of fragments of a fragmented media.
 176                                  Each fragment entry must contain either an url
 177                                  or a path. If an url is present it should be
 178                                  considered by a client. Otherwise both path and
 179                                  fragment_base_url must be present. Here is
 180                                  the list of all potential fields:
 181                                  * "url" - fragment's URL
 182                                  * "path" - fragment's path relative to
 183                                             fragment_base_url
 184                                  * "duration" (optional, int or float)
 185                                  * "filesize" (optional, int)
 186                     * is_from_start  Is a live format that can be downloaded
 187                                 from the start. Boolean
 188                     * preference Order number of this format. If this field is
 189                                  present and not None, the formats get sorted
 190                                  by this field, regardless of all other values.
 191                                  -1 for default (order by other properties),
 192                                  -2 or smaller for less than default.
 193                                  < -1000 to hide the format (if there is
 194                                     another one which is strictly better)
 195                     * language   Language code, e.g. "de" or "en-US".
 196                     * language_preference  Is this in the language mentioned in
 197                                  the URL?
 198                                  10 if it's what the URL is about,
 199                                  -1 for default (don't know),
 200                                  -10 otherwise, other values reserved for now.
 201                     * quality    Order number of the video quality of this
 202                                  format, irrespective of the file format.
 203                                  -1 for default (order by other properties),
 204                                  -2 or smaller for less than default.
 205                     * source_preference  Order number for this video source
 206                                   (quality takes higher priority)
 207                                  -1 for default (order by other properties),
 208                                  -2 or smaller for less than default.
 209                     * http_headers  A dictionary of additional HTTP headers
 210                                  to add to the request.
 211                     * stretched_ratio  If given and not 1, indicates that the
 212                                  video's pixels are not square.
 213                                  width : height ratio as float.
 214                     * no_resume  The server does not support resuming the
 215                                  (HTTP or RTMP) download. Boolean.
 216                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 217                     * downloader_options  A dictionary of downloader options
 218                                  (For internal use only)
 219                                  * http_chunk_size Chunk size for HTTP downloads
 220                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 221                     RTMP formats can also have the additional fields: page_url,
 222                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 223                     rtmp_protocol, rtmp_real_time
 224
 225     url:            Final video URL.
 226     ext:            Video filename extension.
 227     format:         The video format, defaults to ext (used for --get-format)
 228     player_url:     SWF Player URL (used for rtmpdump).
 229
 230     The following fields are optional:
 231
 232     direct:         True if a direct video file was given (must only be set by GenericIE)
 233     alt_title:      A secondary title of the video.
 234     display_id      An alternative identifier for the video, not necessarily
 235                     unique, but available before title. Typically, id is
 236                     something like "4234987", title "Dancing naked mole rats",
 237                     and display_id "dancing-naked-mole-rats"
 238     thumbnails:     A list of dictionaries, with the following entries:
 239                         * "id" (optional, string) - Thumbnail format ID
 240                         * "url"
 241                         * "preference" (optional, int) - quality of the image
 242                         * "width" (optional, int)
 243                         * "height" (optional, int)
 244                         * "resolution" (optional, string "{width}x{height}",
 245                                         deprecated)
 246                         * "filesize" (optional, int)
 247                         * "http_headers" (dict) - HTTP headers for the request
 248     thumbnail:      Full URL to a video thumbnail image.
 249     description:    Full video description.
 250     uploader:       Full name of the video uploader.
 251     license:        License name the video is licensed under.
 252     creator:        The creator of the video.
 253     timestamp:      UNIX timestamp of the moment the video was uploaded
 254     upload_date:    Video upload date in UTC (YYYYMMDD).
 255                     If not explicitly set, calculated from timestamp
 256     release_timestamp: UNIX timestamp of the moment the video was released.
 257                     If it is not clear whether to use timestamp or this, use the former
 258     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 259                     If not explicitly set, calculated from release_timestamp
 260     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 261     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 262                     If not explicitly set, calculated from modified_timestamp
 263     uploader_id:    Nickname or id of the video uploader.
 264     uploader_url:   Full URL to a personal webpage of the video uploader.
 265     channel:        Full name of the channel the video is uploaded on.
 266                     Note that channel fields may or may not repeat uploader
 267                     fields. This depends on a particular extractor.
 268     channel_id:     Id of the channel.
 269     channel_url:    Full URL to a channel webpage.
 270     channel_follower_count: Number of followers of the channel.
 271     location:       Physical location where the video was filmed.
 272     subtitles:      The available subtitles as a dictionary in the format
 273                     {tag: subformats}. "tag" is usually a language code, and
 274                     "subformats" is a list sorted from lower to higher
 275                     preference, each element is a dictionary with the "ext"
 276                     entry and one of:
 277                         * "data": The subtitles file contents
 278                         * "url": A URL pointing to the subtitles file
 279                     It can optionally also have:
 280                         * "name": Name or description of the subtitles
 281                         * "http_headers": A dictionary of additional HTTP headers
 282                                   to add to the request.
 283                     "ext" will be calculated from URL if missing
 284     automatic_captions: Like 'subtitles'; contains automatically generated
 285                     captions instead of normal subtitles
 286     duration:       Length of the video in seconds, as an integer or float.
 287     view_count:     How many users have watched the video on the platform.
 288     concurrent_view_count: How many users are currently watching the video on the platform.
 289     like_count:     Number of positive ratings of the video
 290     dislike_count:  Number of negative ratings of the video
 291     repost_count:   Number of reposts of the video
  292     average_rating: Average rating given by users, the scale used depends on the webpage
 293     comment_count:  Number of comments on the video
 294     comments:       A list of comments, each with one or more of the following
 295                     properties (all but one of text or html optional):
 296                         * "author" - human-readable name of the comment author
 297                         * "author_id" - user ID of the comment author
 298                         * "author_thumbnail" - The thumbnail of the comment author
 299                         * "id" - Comment ID
 300                         * "html" - Comment as HTML
 301                         * "text" - Plain text of the comment
 302                         * "timestamp" - UNIX timestamp of comment
 303                         * "parent" - ID of the comment this one is replying to.
 304                                      Set to "root" to indicate that this is a
 305                                      comment to the original video.
 306                         * "like_count" - Number of positive ratings of the comment
 307                         * "dislike_count" - Number of negative ratings of the comment
 308                         * "is_favorited" - Whether the comment is marked as
 309                                            favorite by the video uploader
 310                         * "author_is_uploader" - Whether the comment is made by
 311                                                  the video uploader
 312     age_limit:      Age restriction for the video, as an integer (years)
 313     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 314                     should allow to get the same result again. (It will be set
 315                     by YoutubeDL if it's missing)
 316     categories:     A list of categories that the video falls in, for example
 317                     ["Sports", "Berlin"]
 318     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 319     cast:           A list of the video cast
 320     is_live:        True, False, or None (=unknown). Whether this video is a
 321                     live stream that goes on instead of a fixed-length video.
 322     was_live:       True, False, or None (=unknown). Whether this video was
 323                     originally a live stream.
 324     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 325                     or 'post_live' (was live, but VOD is not yet processed)
 326                     If absent, automatically set from is_live, was_live
 327     start_time:     Time in seconds where the reproduction should start, as
 328                     specified in the URL.
 329     end_time:       Time in seconds where the reproduction should end, as
 330                     specified in the URL.
 331     chapters:       A list of dictionaries, with the following entries:
 332                         * "start_time" - The start time of the chapter in seconds
 333                         * "end_time" - The end time of the chapter in seconds
 334                         * "title" (optional, string)
 335     playable_in_embed: Whether this video is allowed to play in embedded
 336                     players on other sites. Can be True (=always allowed),
 337                     False (=never allowed), None (=unknown), or a string
 338                     specifying the criteria for embedability; e.g. 'whitelist'
 339     availability:   Under what condition the video is available. One of
 340                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 341                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 342                     to set it
 343     _old_archive_ids: A list of old archive ids needed for backward compatibility
 344     __post_extractor: A function to be called just before the metadata is
 345                     written to either disk, logger or console. The function
 346                     must return a dict which will be added to the info_dict.
  347                     This is useful for additional information that is
 348                     time-consuming to extract. Note that the fields thus
 349                     extracted will not be available to output template and
 350                     match_filter. So, only "comments" and "comment_count" are
 351                     currently allowed to be extracted via this method.
 352
 353     The following fields should only be used when the video belongs to some logical
 354     chapter or section:
 355
 356     chapter:        Name or title of the chapter the video belongs to.
 357     chapter_number: Number of the chapter the video belongs to, as an integer.
 358     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 359
 360     The following fields should only be used when the video is an episode of some
 361     series, programme or podcast:
 362
 363     series:         Title of the series or programme the video episode belongs to.
 364     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 365     season:         Title of the season the video episode belongs to.
 366     season_number:  Number of the season the video episode belongs to, as an integer.
 367     season_id:      Id of the season the video episode belongs to, as a unicode string.
 368     episode:        Title of the video episode. Unlike mandatory video title field,
 369                     this field should denote the exact title of the video episode
 370                     without any kind of decoration.
 371     episode_number: Number of the video episode within a season, as an integer.
 372     episode_id:     Id of the video episode, as a unicode string.
 373
 374     The following fields should only be used when the media is a track or a part of
 375     a music album:
 376
 377     track:          Title of the track.
 378     track_number:   Number of the track within an album or a disc, as an integer.
 379     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 380                     as a unicode string.
 381     artist:         Artist(s) of the track.
 382     genre:          Genre(s) of the track.
 383     album:          Title of the album the track belongs to.
 384     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 385     album_artist:   List of all artists appeared on the album (e.g.
 386                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 387                     and compilations).
 388     disc_number:    Number of the disc or other physical medium the track belongs to,
 389                     as an integer.
 390     release_year:   Year (YYYY) when the album was released.
 391     composer:       Composer of the piece
 392
 393     The following fields should only be set for clips that should be cut from the original video:
 394
 395     section_start:  Start time of the section in seconds
 396     section_end:    End time of the section in seconds
 397
 398     The following fields should only be set for storyboards:
 399     rows:           Number of rows in each storyboard fragment, as an integer
 400     columns:        Number of columns in each storyboard fragment, as an integer
 401
 402     Unless mentioned otherwise, the fields should be Unicode strings.
 403
 404     Unless mentioned otherwise, None is equivalent to absence of information.
 405
 406
 407     _type "playlist" indicates multiple videos.
 408     There must be a key "entries", which is a list, an iterable, or a PagedList
 409     object, each element of which is a valid dictionary by this specification.
 410
 411     Additionally, playlists can have "id", "title", and any other relevant
 412     attributes with the same semantics as videos (see above).
 413
 414     It can also have the following optional fields:
 415
 416     playlist_count: The total number of videos in a playlist. If not given,
 417                     YoutubeDL tries to calculate it from "entries"
 418
 419
 420     _type "multi_video" indicates that there are multiple videos that
  421     form a single show, for example multiple acts of an opera or TV episode.
 422     It must have an entries key like a playlist and contain all the keys
 423     required for a video at the same time.
 424
 425
 426     _type "url" indicates that the video must be extracted from another
 427     location, possibly by a different extractor. Its only required key is:
 428     "url" - the next URL to extract.
 429     The key "ie_key" can be set to the class name (minus the trailing "IE",
 430     e.g. "Youtube") if the extractor class is known in advance.
 431     Additionally, the dictionary may have any properties of the resolved entity
 432     known in advance, for example "title" if the title of the referred video is
 433     known ahead of time.
 434
 435
 436     _type "url_transparent" entities have the same specification as "url", but
 437     indicate that the given additional information is more precise than the one
 438     associated with the resolved URL.
 439     This is useful when a site employs a video service that hosts the video and
 440     its technical metadata, but that video service does not embed a useful
 441     title, description etc.
 442
 443
 444     Subclasses of this should also be added to the list of extractors and
  445     should define a _VALID_URL regexp and re-define the _real_extract() and
 446     (optionally) _real_initialize() methods.
 447
 448     Subclasses may also override suitable() if necessary, but ensure the function
 449     signature is preserved and that this function imports everything it needs
 450     (except other extractors), so that lazy_extractors works correctly.
 451
 452     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 453     the HTML of Generic webpages. It may also override _extract_embed_urls
 454     or _extract_from_webpage as necessary. While these are normally classmethods,
 455     _extract_from_webpage is allowed to be an instance method.
 456
 457     _extract_from_webpage may raise self.StopExtraction() to stop further
 458     processing of the webpage and obtain exclusive rights to it. This is useful
 459     when the extractor cannot reliably be matched using just the URL,
 460     e.g. invidious/peertube instances
 461
 462     Embed-only extractors can be defined by setting _VALID_URL = False.
 463
 464     To support username + password (or netrc) login, the extractor must define a
 465     _NETRC_MACHINE and re-define _perform_login(username, password) and
 466     (optionally) _initialize_pre_login() methods. The _perform_login method will
 467     be called between _initialize_pre_login and _real_initialize if credentials
 468     are passed by the user. In cases where it is necessary to have the login
 469     process as part of the extraction rather than initialization, _perform_login
 470     can be left undefined.
 471
 472     _GEO_BYPASS attribute may be set to False in order to disable
 473     geo restriction bypass mechanisms for a particular extractor.
 474     Though it won't disable explicit geo restriction bypass based on
 475     country code provided with geo_bypass_country.
 476
 477     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 478     countries for this extractor. One of these countries will be used by
 479     geo restriction bypass mechanism right away in order to bypass
 480     geo restriction, of course, if the mechanism is not disabled.
 481
 482     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 483     IP blocks in CIDR notation for this extractor. One of these IP blocks
 484     will be used by geo restriction bypass mechanism similarly
 485     to _GEO_COUNTRIES.
 486
 487     The _ENABLED attribute should be set to False for IEs that
 488     are disabled by default and must be explicitly enabled.
 489
 490     The _WORKING attribute should be set to False for broken IEs
 491     in order to warn the users and skip the tests.
 492     """
 493
    # Class-level defaults. __init__() re-assigns _ready and _x_forwarded_for_ip
    # on every instance; the others are meant to be overridden by subclasses.

    # Set to True at the end of initialize() so that the one-time setup
    # (pre-login, login, _real_initialize) is not repeated
    _ready = False
    # NOTE(review): presumably assigned by set_downloader(), which is not
    # visible in this part of the file - confirm
    _downloader = None
    # Checked by _initialize_geo_bypass(), which only runs its setup while this is unset
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms for an extractor
    _GEO_BYPASS = True
    # List of presumably geo unrestricted countries for the extractor
    _GEO_COUNTRIES = None
    # List of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
    # Set to False for IEs that are disabled by default and must be explicitly enabled
    _ENABLED = True
    # Must be defined to support username + password (or netrc) login;
    # supports_login() reports the truthiness of this value
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the extractor.
    # initialize() skips its "login not supported" warning when this is False
    IE_DESC = None
    # NOTE(review): not used in the visible part of the file; presumably only
    # relevant to search extractors - confirm
    SEARCH_KEY = None
    # Regexp matched against URLs by _match_valid_url(); False marks an
    # embed-only extractor that never matches a URL directly
    _VALID_URL = None
    # Regexes that will be searched for in the HTML of Generic webpages
    _EMBED_REGEX = []
 507
 508     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 509         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 510         return {
 511             None: '',
 512             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 513             'password': f'Use {password_hint}',
 514             'cookies': (
 515                 'Use --cookies-from-browser or --cookies for the authentication. '
 516                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 517         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 518
 519     def __init__(self, downloader=None):
 520         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 521         If a downloader is not passed during initialization,
 522         it must be set using "set_downloader()" before "extract()" is called"""
 523         self._ready = False
 524         self._x_forwarded_for_ip = None
 525         self._printed_messages = set()
 526         self.set_downloader(downloader)
 527
 528     @classmethod
 529     def _match_valid_url(cls, url):
 530         if cls._VALID_URL is False:
 531             return None
 532         # This does not use has/getattr intentionally - we want to know whether
 533         # we have cached the regexp for *this* class, whereas getattr would also
 534         # match the superclass
 535         if '_VALID_URL_RE' not in cls.__dict__:
 536             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 537         return cls._VALID_URL_RE.match(url)
 538
 539     @classmethod
 540     def suitable(cls, url):
 541         """Receives a URL and returns True if suitable for this IE."""
 542         # This function must import everything it needs (except other extractors),
 543         # so that lazy_extractors works correctly
 544         return cls._match_valid_url(url) is not None
 545
 546     @classmethod
 547     def _match_id(cls, url):
 548         return cls._match_valid_url(url).group('id')
 549
 550     @classmethod
 551     def get_temp_id(cls, url):
 552         try:
 553             return cls._match_id(url)
 554         except (IndexError, AttributeError):
 555             return None
 556
 557     @classmethod
 558     def working(cls):
 559         """Getter method for _WORKING."""
 560         return cls._WORKING
 561
 562     @classmethod
 563     def supports_login(cls):
 564         return bool(cls._NETRC_MACHINE)
 565
 566     def initialize(self):
 567         """Initializes an instance (authentication, etc)."""
 568         self._printed_messages = set()
 569         self._initialize_geo_bypass({
 570             'countries': self._GEO_COUNTRIES,
 571             'ip_blocks': self._GEO_IP_BLOCKS,
 572         })
 573         if not self._ready:
 574             self._initialize_pre_login()
 575             if self.supports_login():
 576                 username, password = self._get_login_info()
 577                 if username:
 578                     self._perform_login(username, password)
 579             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 580                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 581             self._real_initialize()
 582             self._ready = True
 583
 584     def _initialize_geo_bypass(self, geo_bypass_context):
 585         """
 586         Initialize geo restriction bypass mechanism.
 587
 588         This method is used to initialize geo bypass mechanism based on faking
 589         X-Forwarded-For HTTP header. A random country from provided country list
 590         is selected and a random IP belonging to this country is generated. This
 591         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 592         HTTP requests.
 593
 594         This method will be used for initial geo bypass mechanism initialization
 595         during the instance initialization with _GEO_COUNTRIES and
 596         _GEO_IP_BLOCKS.
 597
 598         You may also manually call it from extractor's code if geo bypass
 599         information is not available beforehand (e.g. obtained during
 600         extraction) or due to some other reason. In this case you should pass
 601         this information in geo bypass context passed as first argument. It may
 602         contain following fields:
 603
 604         countries:  List of geo unrestricted countries (similar
 605                     to _GEO_COUNTRIES)
 606         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 607                     (similar to _GEO_IP_BLOCKS)
 608
 609         """
 610         if not self._x_forwarded_for_ip:
 611
 612             # Geo bypass mechanism is explicitly disabled by user
 613             if not self.get_param('geo_bypass', True):
 614                 return
 615
 616             if not geo_bypass_context:
 617                 geo_bypass_context = {}
 618
 619             # Backward compatibility: previously _initialize_geo_bypass
 620             # expected a list of countries, some 3rd party code may still use
 621             # it this way
 622             if isinstance(geo_bypass_context, (list, tuple)):
 623                 geo_bypass_context = {
 624                     'countries': geo_bypass_context,
 625                 }
 626
 627             # The whole point of geo bypass mechanism is to fake IP
 628             # as X-Forwarded-For HTTP header based on some IP block or
 629             # country code.
 630
 631             # Path 1: bypassing based on IP block in CIDR notation
 632
 633             # Explicit IP block specified by user, use it right away
 634             # regardless of whether extractor is geo bypassable or not
 635             ip_block = self.get_param('geo_bypass_ip_block', None)
 636
 637             # Otherwise use random IP block from geo bypass context but only
 638             # if extractor is known as geo bypassable
 639             if not ip_block:
 640                 ip_blocks = geo_bypass_context.get('ip_blocks')
 641                 if self._GEO_BYPASS and ip_blocks:
 642                     ip_block = random.choice(ip_blocks)
 643
 644             if ip_block:
 645                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 646                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 647                 return
 648
 649             # Path 2: bypassing based on country code
 650
 651             # Explicit country code specified by user, use it right away
 652             # regardless of whether extractor is geo bypassable or not
 653             country = self.get_param('geo_bypass_country', None)
 654
 655             # Otherwise use random country code from geo bypass context but
 656             # only if extractor is known as geo bypassable
 657             if not country:
 658                 countries = geo_bypass_context.get('countries')
 659                 if self._GEO_BYPASS and countries:
 660                     country = random.choice(countries)
 661
 662             if country:
 663                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 664                 self._downloader.write_debug(
 665                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 666
 667     def extract(self, url):
 668         """Extracts URL information and returns it in list of dicts."""
 669         try:
 670             for _ in range(2):
 671                 try:
 672                     self.initialize()
 673                     self.write_debug('Extracting URL: %s' % url)
 674                     ie_result = self._real_extract(url)
 675                     if ie_result is None:
 676                         return None
 677                     if self._x_forwarded_for_ip:
 678                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 679                     subtitles = ie_result.get('subtitles') or {}
 680                     if 'no-live-chat' in self.get_param('compat_opts'):
 681                         for lang in ('live_chat', 'comments', 'danmaku'):
 682                             subtitles.pop(lang, None)
 683                     return ie_result
 684                 except GeoRestrictedError as e:
 685                     if self.__maybe_fake_ip_and_retry(e.countries):
 686                         continue
 687                     raise
 688         except UnsupportedError:
 689             raise
 690         except ExtractorError as e:
 691             kwargs = {
 692                 'video_id': e.video_id or self.get_temp_id(url),
 693                 'ie': self.IE_NAME,
 694                 'tb': e.traceback or sys.exc_info()[2],
 695                 'expected': e.expected,
 696                 'cause': e.cause
 697             }
 698             if hasattr(e, 'countries'):
 699                 kwargs['countries'] = e.countries
 700             raise type(e)(e.orig_msg, **kwargs)
 701         except http.client.IncompleteRead as e:
 702             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 703         except (KeyError, StopIteration) as e:
 704             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 705
 706     def __maybe_fake_ip_and_retry(self, countries):
 707         if (not self.get_param('geo_bypass_country', None)
 708                 and self._GEO_BYPASS
 709                 and self.get_param('geo_bypass', True)
 710                 and not self._x_forwarded_for_ip
 711                 and countries):
 712             country_code = random.choice(countries)
 713             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 714             if self._x_forwarded_for_ip:
 715                 self.report_warning(
 716                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 717                     % (self._x_forwarded_for_ip, country_code.upper()))
 718                 return True
 719         return False
 720
 721     def set_downloader(self, downloader):
 722         """Sets a YoutubeDL instance as the downloader for this IE."""
 723         self._downloader = downloader
 724
 725     @property
 726     def cache(self):
 727         return self._downloader.cache
 728
 729     @property
 730     def cookiejar(self):
 731         return self._downloader.cookiejar
 732
 733     def _initialize_pre_login(self):
 734         """ Initialization before login. Redefine in subclasses."""
 735         pass
 736
 737     def _perform_login(self, username, password):
 738         """ Login with username and password. Redefine in subclasses."""
 739         pass
 740
 741     def _real_initialize(self):
 742         """Real initialization process. Redefine in subclasses."""
 743         pass
 744
 745     def _real_extract(self, url):
 746         """Real extraction process. Redefine in subclasses."""
 747         raise NotImplementedError('This method must be implemented by subclasses')
 748
 749     @classmethod
 750     def ie_key(cls):
 751         """A string for getting the InfoExtractor with get_info_extractor"""
 752         return cls.__name__[:-2]
 753
 754     @classproperty
 755     def IE_NAME(cls):
 756         return cls.__name__[:-2]
 757
 758     @staticmethod
 759     def __can_accept_status_code(err, expected_status):
 760         assert isinstance(err, urllib.error.HTTPError)
 761         if expected_status is None:
 762             return False
 763         elif callable(expected_status):
 764             return expected_status(err.code) is True
 765         else:
 766             return err.code in variadic(expected_status)
 767
 768     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 769         if isinstance(url_or_request, urllib.request.Request):
 770             return update_Request(url_or_request, data=data, headers=headers, query=query)
 771         if query:
 772             url_or_request = update_url_query(url_or_request, query)
 773         return sanitized_Request(url_or_request, data, headers or {})
 774
 775     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 776         """
 777         Return the response handle.
 778
 779         See _download_webpage docstring for arguments specification.
 780         """
 781         if not self._downloader._first_webpage_request:
 782             sleep_interval = self.get_param('sleep_interval_requests') or 0
 783             if sleep_interval > 0:
 784                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 785                 time.sleep(sleep_interval)
 786         else:
 787             self._downloader._first_webpage_request = False
 788
 789         if note is None:
 790             self.report_download_webpage(video_id)
 791         elif note is not False:
 792             if video_id is None:
 793                 self.to_screen(str(note))
 794             else:
 795                 self.to_screen(f'{video_id}: {note}')
 796
 797         # Some sites check X-Forwarded-For HTTP header in order to figure out
 798         # the origin of the client behind proxy. This allows bypassing geo
 799         # restriction by faking this header's value to IP that belongs to some
 800         # geo unrestricted country. We will do so once we encounter any
 801         # geo restriction error.
 802         if self._x_forwarded_for_ip:
 803             headers = (headers or {}).copy()
 804             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 805
 806         try:
 807             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 808         except network_exceptions as err:
 809             if isinstance(err, urllib.error.HTTPError):
 810                 if self.__can_accept_status_code(err, expected_status):
 811                     # Retain reference to error to prevent file object from
 812                     # being closed before it can be read. Works around the
 813                     # effects of <https://bugs.python.org/issue15002>
 814                     # introduced in Python 3.4.1.
 815                     err.fp._error = err
 816                     return err.fp
 817
 818             if errnote is False:
 819                 return False
 820             if errnote is None:
 821                 errnote = 'Unable to download webpage'
 822
 823             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 824             if fatal:
 825                 raise ExtractorError(errmsg, cause=err)
 826             else:
 827                 self.report_warning(errmsg)
 828                 return False
 829
 830     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 831                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 832         """
 833         Return a tuple (page content as string, URL handle).
 834
 835         Arguments:
 836         url_or_request -- plain text URL as a string or
 837             a urllib.request.Request object
 838         video_id -- Video/playlist/item identifier (string)
 839
 840         Keyword arguments:
 841         note -- note printed before downloading (string)
 842         errnote -- note printed in case of an error (string)
 843         fatal -- flag denoting whether error should be considered fatal,
 844             i.e. whether it should cause ExtractionError to be raised,
 845             otherwise a warning will be reported and extraction continued
 846         encoding -- encoding for a page content decoding, guessed automatically
 847             when not explicitly specified
 848         data -- POST data (bytes)
 849         headers -- HTTP headers (dict)
 850         query -- URL query (dict)
 851         expected_status -- allows to accept failed HTTP requests (non 2xx
 852             status code) by explicitly specifying a set of accepted status
 853             codes. Can be any of the following entities:
 854                 - an integer type specifying an exact failed status code to
 855                   accept
 856                 - a list or a tuple of integer types specifying a list of
 857                   failed status codes to accept
 858                 - a callable accepting an actual failed status code and
 859                   returning True if it should be accepted
 860             Note that this argument does not affect success status codes (2xx)
 861             which are always accepted.
 862         """
 863
 864         # Strip hashes from the URL (#1038)
 865         if isinstance(url_or_request, str):
 866             url_or_request = url_or_request.partition('#')[0]
 867
 868         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 869         if urlh is False:
 870             assert not fatal
 871             return False
 872         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 873         return (content, urlh)
 874
 875     @staticmethod
 876     def _guess_encoding_from_content(content_type, webpage_bytes):
 877         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 878         if m:
 879             encoding = m.group(1)
 880         else:
 881             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 882                           webpage_bytes[:1024])
 883             if m:
 884                 encoding = m.group(1).decode('ascii')
 885             elif webpage_bytes.startswith(b'\xff\xfe'):
 886                 encoding = 'utf-16'
 887             else:
 888                 encoding = 'utf-8'
 889
 890         return encoding
 891
 892     def __check_blocked(self, content):
 893         first_block = content[:512]
 894         if ('<title>Access to this site is blocked</title>' in content
 895                 and 'Websense' in first_block):
 896             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 897             blocked_iframe = self._html_search_regex(
 898                 r'<iframe src="([^"]+)"', content,
 899                 'Websense information URL', default=None)
 900             if blocked_iframe:
 901                 msg += ' Visit %s for more details' % blocked_iframe
 902             raise ExtractorError(msg, expected=True)
 903         if '<title>The URL you requested has been blocked</title>' in first_block:
 904             msg = (
 905                 'Access to this webpage has been blocked by Indian censorship. '
 906                 'Use a VPN or proxy server (with --proxy) to route around it.')
 907             block_msg = self._html_search_regex(
 908                 r'</h1><p>(.*?)</p>',
 909                 content, 'block message', default=None)
 910             if block_msg:
 911                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 912             raise ExtractorError(msg, expected=True)
 913         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 914                 and 'blocklist.rkn.gov.ru' in content):
 915             raise ExtractorError(
 916                 'Access to this webpage has been blocked by decision of the Russian government. '
 917                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 918                 expected=True)
 919
 920     def _request_dump_filename(self, url, video_id):
 921         basen = f'{video_id}_{url}'
 922         trim_length = self.get_param('trim_file_name') or 240
 923         if len(basen) > trim_length:
 924             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 925             basen = basen[:trim_length - len(h)] + h
 926         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 927         # Working around MAX_PATH limitation on Windows (see
 928         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 929         if compat_os_name == 'nt':
 930             absfilepath = os.path.abspath(filename)
 931             if len(absfilepath) > 259:
 932                 filename = fR'\\?\{absfilepath}'
 933         return filename
 934
 935     def __decode_webpage(self, webpage_bytes, encoding, headers):
 936         if not encoding:
 937             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 938         try:
 939             return webpage_bytes.decode(encoding, 'replace')
 940         except LookupError:
 941             return webpage_bytes.decode('utf-8', 'replace')
 942
 943     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 944         webpage_bytes = urlh.read()
 945         if prefix is not None:
 946             webpage_bytes = prefix + webpage_bytes
 947         if self.get_param('dump_intermediate_pages', False):
 948             self.to_screen('Dumping request to ' + urlh.geturl())
 949             dump = base64.b64encode(webpage_bytes).decode('ascii')
 950             self._downloader.to_screen(dump)
 951         if self.get_param('write_pages'):
 952             filename = self._request_dump_filename(urlh.geturl(), video_id)
 953             self.to_screen(f'Saving request to {filename}')
 954             with open(filename, 'wb') as outf:
 955                 outf.write(webpage_bytes)
 956
 957         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 958         self.__check_blocked(content)
 959
 960         return content
 961
 962     def __print_error(self, errnote, fatal, video_id, err):
 963         if fatal:
 964             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 965         elif errnote:
 966             self.report_warning(f'{video_id}: {errnote}: {err}')
 967
 968     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 969         if transform_source:
 970             xml_string = transform_source(xml_string)
 971         try:
 972             return compat_etree_fromstring(xml_string.encode('utf-8'))
 973         except xml.etree.ElementTree.ParseError as ve:
 974             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 975
 976     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 977         try:
 978             return json.loads(
 979                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 980         except ValueError as ve:
 981             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 982
 983     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 984         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 985
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build the method pair (_download_<name>_handle, _download_<name>).

        This is a plain function executed while the class body is evaluated;
        it is not meant to be called on instances.

        name -- infix used for the generated method names
        parser -- name of the method used to parse the downloaded content, or
            None to return the content unparsed
        note, errnote -- default note/errnote of the generated methods
        return_value -- description of the result, used in the generated
            docstrings
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            """Run the configured parser of ie on content (no-op without a parser)."""
            if parser is None:
                return content
            # errnote is only forwarded when it is False; in every other case
            # the parser's own default errnote applies
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Returns (parsed content, URL handle), or False if the download failed
        # without raising
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Returns the parsed content only, or False if the download failed
        # without raising
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With 'load_pages' set, try to serve the request from a dump file
            # before falling back to the network
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # No usable dump: warn and continue with a real download
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser there is nothing to transform, so the argument
            # is not passed on to the handle method
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            """Give func its public name, qualified name and docstring."""
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        # Rename before returning: download_content looks the handle method up
        # through download_handle.__name__
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1049
    # Generated download methods: each call yields the pair
    # (_download_<name>_handle, _download_<name>) for one content type
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For plain webpages only the content variant is kept (no parser is
    # applied); it is wrapped by _download_webpage
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1057
1058     def _download_webpage(
1059             self, url_or_request, video_id, note=None, errnote=None,
1060             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1061         """
1062         Return the data of the page as a string.
1063
1064         Keyword arguments:
1065         tries -- number of tries
1066         timeout -- sleep interval between tries
1067
1068         See _download_webpage_handle docstring for other arguments specification.
1069         """
1070
1071         R''' # NB: These are unused; should they be deprecated?
1072         if tries != 1:
1073             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1074         if timeout is NO_DEFAULT:
1075             timeout = 5
1076         else:
1077             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1078         '''
1079
1080         try_count = 0
1081         while True:
1082             try:
1083                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1084             except http.client.IncompleteRead as e:
1085                 try_count += 1
1086                 if try_count >= tries:
1087                     raise e
1088                 self._sleep(timeout, video_id)
1089
1090     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1091         idstr = format_field(video_id, None, '%s: ')
1092         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1093         if only_once:
1094             if f'WARNING: {msg}' in self._printed_messages:
1095                 return
1096             self._printed_messages.add(f'WARNING: {msg}')
1097         self._downloader.report_warning(msg, *args, **kwargs)
1098
1099     def to_screen(self, msg, *args, **kwargs):
1100         """Print msg to screen, prefixing it with '[ie_name]'"""
1101         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1102
1103     def write_debug(self, msg, *args, **kwargs):
1104         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1105
1106     def get_param(self, name, default=None, *args, **kwargs):
1107         if self._downloader:
1108             return self._downloader.params.get(name, default, *args, **kwargs)
1109         return default
1110
1111     def report_drm(self, video_id, partial=NO_DEFAULT):
1112         if partial is not NO_DEFAULT:
1113             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1114         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1115
1116     def report_extraction(self, id_or_name):
1117         """Report information extraction."""
1118         self.to_screen('%s: Extracting information' % id_or_name)
1119
1120     def report_download_webpage(self, video_id):
1121         """Report webpage download."""
1122         self.to_screen('%s: Downloading webpage' % video_id)
1123
1124     def report_age_confirmation(self):
1125         """Report attempt to confirm age."""
1126         self.to_screen('Confirming age')
1127
1128     def report_login(self):
1129         """Report attempt to log in."""
1130         self.to_screen('Logging in')
1131
1132     def raise_login_required(
1133             self, msg='This video is only available for registered users',
1134             metadata_available=False, method=NO_DEFAULT):
1135         if metadata_available and (
1136                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1137             self.report_warning(msg)
1138             return
1139         msg += format_field(self._login_hint(method), None, '. %s')
1140         raise ExtractorError(msg, expected=True)
1141
1142     def raise_geo_restricted(
1143             self, msg='This video is not available from your location due to geo restriction',
1144             countries=None, metadata_available=False):
1145         if metadata_available and (
1146                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1147             self.report_warning(msg)
1148         else:
1149             raise GeoRestrictedError(msg, countries=countries)
1150
1151     def raise_no_formats(self, msg, expected=False, video_id=None):
1152         if expected and (
1153                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1154             self.report_warning(msg, video_id)
1155         elif isinstance(msg, ExtractorError):
1156             raise msg
1157         else:
1158             raise ExtractorError(msg, expected=expected, video_id=video_id)
1159
1160     # Methods for following #608
1161     @staticmethod
1162     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1163         """Returns a URL that points to a page that should be processed"""
1164         if ie is not None:
1165             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1166         if video_id is not None:
1167             kwargs['id'] = video_id
1168         if video_title is not None:
1169             kwargs['title'] = video_title
1170         return {
1171             **kwargs,
1172             '_type': 'url_transparent' if url_transparent else 'url',
1173             'url': url,
1174         }
1175
1176     @classmethod
1177     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1178                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1179         return cls.playlist_result(
1180             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1181             playlist_id, playlist_title, **kwargs)
1182
1183     @staticmethod
1184     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1185         """Returns a playlist"""
1186         if playlist_id:
1187             kwargs['id'] = playlist_id
1188         if playlist_title:
1189             kwargs['title'] = playlist_title
1190         if playlist_description is not None:
1191             kwargs['description'] = playlist_description
1192         return {
1193             **kwargs,
1194             '_type': 'multi_video' if multi_video else 'playlist',
1195             'entries': entries,
1196         }
1197
1198     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1199         """
1200         Perform a regex search on the given string, using a single or a list of
1201         patterns returning the first matching group.
1202         In case of failure return a default value or raise a WARNING or a
1203         RegexNotFoundError, depending on fatal, specifying the field name.
1204         """
1205         if string is None:
1206             mobj = None
1207         elif isinstance(pattern, (str, re.Pattern)):
1208             mobj = re.search(pattern, string, flags)
1209         else:
1210             for p in pattern:
1211                 mobj = re.search(p, string, flags)
1212                 if mobj:
1213                     break
1214
1215         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1216
1217         if mobj:
1218             if group is None:
1219                 # return the first matching group
1220                 return next(g for g in mobj.groups() if g is not None)
1221             elif isinstance(group, (list, tuple)):
1222                 return tuple(mobj.group(g) for g in group)
1223             else:
1224                 return mobj.group(group)
1225         elif default is not NO_DEFAULT:
1226             return default
1227         elif fatal:
1228             raise RegexNotFoundError('Unable to extract %s' % _name)
1229         else:
1230             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1231             return None
1232
1233     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1234                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1235         """Searches string for the JSON object specified by start_pattern"""
1236         # NB: end_pattern is only used to reduce the size of the initial match
1237         if default is NO_DEFAULT:
1238             default, has_default = {}, False
1239         else:
1240             fatal, has_default = False, True
1241
1242         json_string = self._search_regex(
1243             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1244             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1245         if not json_string:
1246             return default
1247
1248         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1249         try:
1250             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1251         except ExtractorError as e:
1252             if fatal:
1253                 raise ExtractorError(
1254                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1255             elif not has_default:
1256                 self.report_warning(
1257                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1258         return default
1259
1260     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1261         """
1262         Like _search_regex, but strips HTML tags and unescapes entities.
1263         """
1264         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1265         if res:
1266             return clean_html(res).strip()
1267         else:
1268             return res
1269
1270     def _get_netrc_login_info(self, netrc_machine=None):
1271         username = None
1272         password = None
1273         netrc_machine = netrc_machine or self._NETRC_MACHINE
1274
1275         if self.get_param('usenetrc', False):
1276             try:
1277                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1278                 if os.path.isdir(netrc_file):
1279                     netrc_file = os.path.join(netrc_file, '.netrc')
1280                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1281                 if info is not None:
1282                     username = info[0]
1283                     password = info[2]
1284                 else:
1285                     raise netrc.NetrcParseError(
1286                         'No authenticators for %s' % netrc_machine)
1287             except (OSError, netrc.NetrcParseError) as err:
1288                 self.report_warning(
1289                     'parsing .netrc: %s' % error_to_compat_str(err))
1290
1291         return username, password
1292
1293     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1294         """
1295         Get the login info as (username, password)
1296         First look for the manually specified credentials using username_option
1297         and password_option as keys in params dictionary. If no such credentials
1298         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1299         value.
1300         If there's no info available, return (None, None)
1301         """
1302
1303         # Attempt to use provided username and password or .netrc data
1304         username = self.get_param(username_option)
1305         if username is not None:
1306             password = self.get_param(password_option)
1307         else:
1308             username, password = self._get_netrc_login_info(netrc_machine)
1309
1310         return username, password
1311
1312     def _get_tfa_info(self, note='two-factor verification code'):
1313         """
1314         Get the two-factor authentication info
1315         TODO - asking the user will be required for sms/phone verify
1316         currently just uses the command line option
1317         If there's no info available, return None
1318         """
1319
1320         tfa = self.get_param('twofactor')
1321         if tfa is not None:
1322             return tfa
1323
1324         return getpass.getpass('Type %s and press [Return]: ' % note)
1325
1326     # Helper functions for extracting OpenGraph info
1327     @staticmethod
1328     def _og_regexes(prop):
1329         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1330         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1331                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1332         template = r'<meta[^>]+?%s[^>]+?%s'
1333         return [
1334             template % (property_re, content_re),
1335             template % (content_re, property_re),
1336         ]
1337
1338     @staticmethod
1339     def _meta_regex(prop):
1340         return r'''(?isx)<meta
1341                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1342                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1343
1344     def _og_search_property(self, prop, html, name=None, **kargs):
1345         prop = variadic(prop)
1346         if name is None:
1347             name = 'OpenGraph %s' % prop[0]
1348         og_regexes = []
1349         for p in prop:
1350             og_regexes.extend(self._og_regexes(p))
1351         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1352         if escaped is None:
1353             return None
1354         return unescapeHTML(escaped)
1355
1356     def _og_search_thumbnail(self, html, **kargs):
1357         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1358
1359     def _og_search_description(self, html, **kargs):
1360         return self._og_search_property('description', html, fatal=False, **kargs)
1361
1362     def _og_search_title(self, html, *, fatal=False, **kargs):
1363         return self._og_search_property('title', html, fatal=fatal, **kargs)
1364
1365     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1366         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1367         if secure:
1368             regexes = self._og_regexes('video:secure_url') + regexes
1369         return self._html_search_regex(regexes, html, name, **kargs)
1370
1371     def _og_search_url(self, html, **kargs):
1372         return self._og_search_property('url', html, **kargs)
1373
1374     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1375         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1376
1377     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1378         name = variadic(name)
1379         if display_name is None:
1380             display_name = name[0]
1381         return self._html_search_regex(
1382             [self._meta_regex(n) for n in name],
1383             html, display_name, fatal=fatal, group='content', **kwargs)
1384
1385     def _dc_search_uploader(self, html):
1386         return self._html_search_meta('dc.creator', html, 'uploader')
1387
1388     @staticmethod
1389     def _rta_search(html):
1390         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1391         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1392                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1393                      html):
1394             return 18
1395
1396         # And then there are the jokers who advertise that they use RTA, but actually don't.
1397         AGE_LIMIT_MARKERS = [
1398             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1399         ]
1400         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1401             return 18
1402         return 0
1403
1404     def _media_rating_search(self, html):
1405         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1406         rating = self._html_search_meta('rating', html)
1407
1408         if not rating:
1409             return None
1410
1411         RATING_TABLE = {
1412             'safe for kids': 0,
1413             'general': 8,
1414             '14 years': 14,
1415             'mature': 17,
1416             'restricted': 19,
1417         }
1418         return RATING_TABLE.get(rating.lower())
1419
1420     def _family_friendly_search(self, html):
1421         # See http://schema.org/VideoObject
1422         family_friendly = self._html_search_meta(
1423             'isFamilyFriendly', html, default=None)
1424
1425         if not family_friendly:
1426             return None
1427
1428         RATING_TABLE = {
1429             '1': 0,
1430             'true': 0,
1431             '0': 18,
1432             'false': 18,
1433         }
1434         return RATING_TABLE.get(family_friendly.lower())
1435
1436     def _twitter_search_player(self, html):
1437         return self._html_search_meta('twitter:player', html,
1438                                       'twitter card player')
1439
1440     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1441         """Yield all json ld objects in the html"""
1442         if default is not NO_DEFAULT:
1443             fatal = False
1444         for mobj in re.finditer(JSON_LD_RE, html):
1445             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1446             for json_ld in variadic(json_ld_item):
1447                 if isinstance(json_ld, dict):
1448                     yield json_ld
1449
1450     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1451         """Search for a video in any json ld in the html"""
1452         if default is not NO_DEFAULT:
1453             fatal = False
1454         info = self._json_ld(
1455             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1456             video_id, fatal=fatal, expected_type=expected_type)
1457         if info:
1458             return info
1459         if default is not NO_DEFAULT:
1460             return default
1461         elif fatal:
1462             raise RegexNotFoundError('Unable to extract JSON-LD')
1463         else:
1464             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1465             return {}
1466
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        @param json_ld          A JSON string (parsed with _parse_json) or already
                                parsed JSON-LD data (a single object or several)
        @param video_id         Passed to _parse_json when json_ld is a string
        @param fatal            Passed to _parse_json when json_ld is a string
        @param expected_type    If given, objects whose '@type' does not contain it
                                are skipped and traversal stops after the first
                                object that was processed
        @returns                The collected fields passed through filter_dict,
                                or {} if there is no JSON-LD data
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared result dict; the nested helpers below fill it in place
        info = {}

        # Maps the last path component of an interactionType to the prefix of the '<kind>_count' field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # True if any of expected_types is found in the '@type' value of e
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # interactionType is either given directly or as an object carrying '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fills the '<kind>_count' fields of info from 'interactionStatistic'
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up in the map
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # The first value found for a counter wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Builds info['chapters'] from the 'Clip' parts listed in 'hasPart'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk (previous, current, next) triples to fill missing offsets from
            # the neighbouring chapters. Since zip stops at the shortest input,
            # the last chapter is not visited by this loop
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter falls back to the duration stored in info
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copies the fields of a VideoObject/AudioObject into info
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            # Must run after the update above: the chapter extraction reads info['duration']
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top level objects are only considered if they declare a '@context'
                if at_top_level and '@context' not in e:
                    continue
                # An object consisting only of '@context' and '@graph' is a container;
                # its '@graph' content is traversed without the '@context' requirement
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name is only used as title if no title was found so far
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first entry of 'video' (or else of 'subjectOf') is considered
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type all objects are merged into info,
                    # otherwise the first matching object ends the traversal
                    if expected_type is None:
                        continue
                    else:
                        break
                # Objects of other types may still embed a VideoObject under 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1636
1637     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1638         return self._parse_json(
1639             self._search_regex(
1640                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1641                 webpage, 'next.js data', fatal=fatal, **kw),
1642             video_id, transform_source=transform_source, fatal=fatal)
1643
1644     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1645         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1646         rectx = re.escape(context_name)
1647         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1648         js, arg_keys, arg_vals = self._search_regex(
1649             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1650             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1651             default=NO_DEFAULT if fatal else (None, None, None))
1652         if js is None:
1653             return {}
1654
1655         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1656
1657         for key, val in args.items():
1658             if val in ('undefined', 'void 0'):
1659                 args[key] = 'null'
1660
1661         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1662         return traverse_obj(ret, traverse) or {}
1663
1664     @staticmethod
1665     def _hidden_inputs(html):
1666         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1667         hidden_inputs = {}
1668         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1669             attrs = extract_attributes(input)
1670             if not input:
1671                 continue
1672             if attrs.get('type') not in ('hidden', 'submit'):
1673                 continue
1674             name = attrs.get('name') or attrs.get('id')
1675             value = attrs.get('value')
1676             if name and value is not None:
1677                 hidden_inputs[name] = value
1678         return hidden_inputs
1679
1680     def _form_hidden_inputs(self, form_id, html):
1681         form = self._search_regex(
1682             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1683             html, '%s form' % form_id, group='form')
1684         return self._hidden_inputs(form)
1685
1686     class FormatSort:
1687         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1688
1689         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1690                    'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
1691                    'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1692         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1693                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1694                         'fps', 'fs_approx', 'source', 'id')
1695
1696         settings = {
1697             'vcodec': {'type': 'ordered', 'regex': True,
1698                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1699             'acodec': {'type': 'ordered', 'regex': True,
1700                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1701             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1702                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1703             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1704                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1705             'vext': {'type': 'ordered', 'field': 'video_ext',
1706                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1707                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1708             'aext': {'type': 'ordered', 'field': 'audio_ext',
1709                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1710                      'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
1711             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1712             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1713                            'field': ('vcodec', 'acodec'),
1714                            'function': lambda it: int(any(v != 'none' for v in it))},
1715             'ie_pref': {'priority': True, 'type': 'extractor'},
1716             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1717             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1718             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1719             'quality': {'convert': 'float', 'default': -1},
1720             'filesize': {'convert': 'bytes'},
1721             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1722             'id': {'convert': 'string', 'field': 'format_id'},
1723             'height': {'convert': 'float_none'},
1724             'width': {'convert': 'float_none'},
1725             'fps': {'convert': 'float_none'},
1726             'channels': {'convert': 'float_none', 'field': 'audio_channels'},
1727             'tbr': {'convert': 'float_none'},
1728             'vbr': {'convert': 'float_none'},
1729             'abr': {'convert': 'float_none'},
1730             'asr': {'convert': 'float_none'},
1731             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1732
1733             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1734             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1735             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1736             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1737             'res': {'type': 'multiple', 'field': ('height', 'width'),
1738                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1739
1740             # Actual field names
1741             'format_id': {'type': 'alias', 'field': 'id'},
1742             'preference': {'type': 'alias', 'field': 'ie_pref'},
1743             'language_preference': {'type': 'alias', 'field': 'lang'},
1744             'source_preference': {'type': 'alias', 'field': 'source'},
1745             'protocol': {'type': 'alias', 'field': 'proto'},
1746             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1747             'audio_channels': {'type': 'alias', 'field': 'channels'},
1748
1749             # Deprecated
1750             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1751             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1752             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1753             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1754             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1755             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1756             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1757             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1758             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1759             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1760             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1761             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1762             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1763             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1764             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1765             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1766             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1767             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1768             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1769             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1770         }
1771
1772         def __init__(self, ie, field_preference):
1773             self._order = []
1774             self.ydl = ie._downloader
1775             self.evaluate_params(self.ydl.params, field_preference)
1776             if ie.get_param('verbose'):
1777                 self.print_verbose_info(self.ydl.write_debug)
1778
1779         def _get_field_setting(self, field, key):
1780             if field not in self.settings:
1781                 if key in ('forced', 'priority'):
1782                     return False
1783                 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
1784                                             'deprecated and may be removed in a future version')
1785                 self.settings[field] = {}
1786             propObj = self.settings[field]
1787             if key not in propObj:
1788                 type = propObj.get('type')
1789                 if key == 'field':
1790                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1791                 elif key == 'convert':
1792                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1793                 else:
1794                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1795                 propObj[key] = default
1796             return propObj[key]
1797
1798         def _resolve_field_value(self, field, value, convertNone=False):
1799             if value is None:
1800                 if not convertNone:
1801                     return None
1802             else:
1803                 value = value.lower()
1804             conversion = self._get_field_setting(field, 'convert')
1805             if conversion == 'ignore':
1806                 return None
1807             if conversion == 'string':
1808                 return value
1809             elif conversion == 'float_none':
1810                 return float_or_none(value)
1811             elif conversion == 'bytes':
1812                 return FileDownloader.parse_bytes(value)
1813             elif conversion == 'order':
1814                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1815                 use_regex = self._get_field_setting(field, 'regex')
1816                 list_length = len(order_list)
1817                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1818                 if use_regex and value is not None:
1819                     for i, regex in enumerate(order_list):
1820                         if regex and re.match(regex, value):
1821                             return list_length - i
1822                     return list_length - empty_pos  # not in list
1823                 else:  # not regex or  value = None
1824                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1825             else:
1826                 if value.isnumeric():
1827                     return float(value)
1828                 else:
1829                     self.settings[field]['convert'] = 'string'
1830                     return value
1831
1832         def evaluate_params(self, params, sort_extractor):
1833             self._use_free_order = params.get('prefer_free_formats', False)
1834             self._sort_user = params.get('format_sort', [])
1835             self._sort_extractor = sort_extractor
1836
1837             def add_item(field, reverse, closest, limit_text):
1838                 field = field.lower()
1839                 if field in self._order:
1840                     return
1841                 self._order.append(field)
1842                 limit = self._resolve_field_value(field, limit_text)
1843                 data = {
1844                     'reverse': reverse,
1845                     'closest': False if limit is None else closest,
1846                     'limit_text': limit_text,
1847                     'limit': limit}
1848                 if field in self.settings:
1849                     self.settings[field].update(data)
1850                 else:
1851                     self.settings[field] = data
1852
1853             sort_list = (
1854                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1855                 + (tuple() if params.get('format_sort_force', False)
1856                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1857                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1858
1859             for item in sort_list:
1860                 match = re.match(self.regex, item)
1861                 if match is None:
1862                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1863                 field = match.group('field')
1864                 if field is None:
1865                     continue
1866                 if self._get_field_setting(field, 'type') == 'alias':
1867                     alias, field = field, self._get_field_setting(field, 'field')
1868                     if self._get_field_setting(alias, 'deprecated'):
1869                         self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
1870                                                     f'be removed in a future version. Please use {field} instead')
1871                 reverse = match.group('reverse') is not None
1872                 closest = match.group('separator') == '~'
1873                 limit_text = match.group('limit')
1874
1875                 has_limit = limit_text is not None
1876                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1877                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1878
1879                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1880                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1881                 limit_count = len(limits)
1882                 for (i, f) in enumerate(fields):
1883                     add_item(f, reverse, closest,
1884                              limits[i] if i < limit_count
1885                              else limits[0] if has_limit and not has_multiple_limits
1886                              else None)
1887
1888         def print_verbose_info(self, write_debug):
1889             if self._sort_user:
1890                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1891             if self._sort_extractor:
1892                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1893             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1894                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1895                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1896                               self._get_field_setting(field, 'limit_text'),
1897                               self._get_field_setting(field, 'limit'))
1898                 if self._get_field_setting(field, 'limit_text') is not None else '')
1899                 for field in self._order if self._get_field_setting(field, 'visible')]))
1900
1901         def _calculate_field_preference_from_value(self, format, field, type, value):
1902             reverse = self._get_field_setting(field, 'reverse')
1903             closest = self._get_field_setting(field, 'closest')
1904             limit = self._get_field_setting(field, 'limit')
1905
1906             if type == 'extractor':
1907                 maximum = self._get_field_setting(field, 'max')
1908                 if value is None or (maximum is not None and value >= maximum):
1909                     value = -1
1910             elif type == 'boolean':
1911                 in_list = self._get_field_setting(field, 'in_list')
1912                 not_in_list = self._get_field_setting(field, 'not_in_list')
1913                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1914             elif type == 'ordered':
1915                 value = self._resolve_field_value(field, value, True)
1916
1917             # try to convert to number
1918             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1919             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1920             if is_num:
1921                 value = val_num
1922
1923             return ((-10, 0) if value is None
1924                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1925                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1926                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1927                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1928                     else (-1, value, 0))
1929
1930         def _calculate_field_preference(self, format, field):
1931             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1932             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1933             if type == 'multiple':
1934                 type = 'field'  # Only 'field' is allowed in multiple for now
1935                 actual_fields = self._get_field_setting(field, 'field')
1936
1937                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1938             else:
1939                 value = get_value(field)
1940             return self._calculate_field_preference_from_value(format, field, type, value)
1941
1942         def calculate_preference(self, format):
1943             # Determine missing protocol
1944             if not format.get('protocol'):
1945                 format['protocol'] = determine_protocol(format)
1946
1947             # Determine missing ext
1948             if not format.get('ext') and 'url' in format:
1949                 format['ext'] = determine_ext(format['url'])
1950             if format.get('vcodec') == 'none':
1951                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1952                 format['video_ext'] = 'none'
1953             else:
1954                 format['video_ext'] = format['ext']
1955                 format['audio_ext'] = 'none'
1956             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1957             #    format['preference'] = -1000
1958
1959             # Determine missing bitrates
1960             if format.get('tbr') is None:
1961                 if format.get('vbr') is not None and format.get('abr') is not None:
1962                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1963             else:
1964                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1965                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1966                 if format.get('acodec') != 'none' and format.get('abr') is None:
1967                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1968
1969             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1970
1971     def _sort_formats(self, formats, field_preference=[]):
1972         if not formats:
1973             return
1974         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1975
1976     def _check_formats(self, formats, video_id):
1977         if formats:
1978             formats[:] = filter(
1979                 lambda f: self._is_valid_url(
1980                     f['url'], video_id,
1981                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1982                 formats)
1983
1984     @staticmethod
1985     def _remove_duplicate_formats(formats):
1986         format_urls = set()
1987         unique_formats = []
1988         for f in formats:
1989             if f['url'] not in format_urls:
1990                 format_urls.add(f['url'])
1991                 unique_formats.append(f)
1992         formats[:] = unique_formats
1993
1994     def _is_valid_url(self, url, video_id, item='video', headers={}):
1995         url = self._proto_relative_url(url, scheme='http:')
1996         # For now assume non HTTP(S) URLs always valid
1997         if not (url.startswith('http://') or url.startswith('https://')):
1998             return True
1999         try:
2000             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
2001             return True
2002         except ExtractorError as e:
2003             self.to_screen(
2004                 '%s: %s URL is invalid, skipping: %s'
2005                 % (video_id, item, error_to_compat_str(e.cause)))
2006             return False
2007
2008     def http_scheme(self):
2009         """ Either "http:" or "https:", depending on the user's preferences """
2010         return (
2011             'http:'
2012             if self.get_param('prefer_insecure', False)
2013             else 'https:')
2014
2015     def _proto_relative_url(self, url, scheme=None):
2016         scheme = scheme or self.http_scheme()
2017         assert scheme.endswith(':')
2018         return sanitize_url(url, scheme=scheme[:-1])
2019
2020     def _sleep(self, timeout, video_id, msg_template=None):
2021         if msg_template is None:
2022             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2023         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2024         self.to_screen(msg)
2025         time.sleep(timeout)
2026
2027     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2028                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2029                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2030         res = self._download_xml_handle(
2031             manifest_url, video_id, 'Downloading f4m manifest',
2032             'Unable to download f4m manifest',
2033             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2034             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2035             transform_source=transform_source,
2036             fatal=fatal, data=data, headers=headers, query=query)
2037         if res is False:
2038             return []
2039
2040         manifest, urlh = res
2041         manifest_url = urlh.geturl()
2042
2043         return self._parse_f4m_formats(
2044             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2045             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2046
2047     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2048                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2049                            fatal=True, m3u8_id=None):
2050         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2051             return []
2052
2053         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2054         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2055         if akamai_pv is not None and ';' in akamai_pv.text:
2056             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2057             if playerVerificationChallenge.strip() != '':
2058                 return []
2059
2060         formats = []
2061         manifest_version = '1.0'
2062         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2063         if not media_nodes:
2064             manifest_version = '2.0'
2065             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2066         # Remove unsupported DRM protected media from final formats
2067         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2068         media_nodes = remove_encrypted_media(media_nodes)
2069         if not media_nodes:
2070             return formats
2071
2072         manifest_base_url = get_base_url(manifest)
2073
2074         bootstrap_info = xpath_element(
2075             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2076             'bootstrap info', default=None)
2077
2078         vcodec = None
2079         mime_type = xpath_text(
2080             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2081             'base URL', default=None)
2082         if mime_type and mime_type.startswith('audio/'):
2083             vcodec = 'none'
2084
2085         for i, media_el in enumerate(media_nodes):
2086             tbr = int_or_none(media_el.attrib.get('bitrate'))
2087             width = int_or_none(media_el.attrib.get('width'))
2088             height = int_or_none(media_el.attrib.get('height'))
2089             format_id = join_nonempty(f4m_id, tbr or i)
2090             # If <bootstrapInfo> is present, the specified f4m is a
2091             # stream-level manifest, and only set-level manifests may refer to
2092             # external resources.  See section 11.4 and section 4 of F4M spec
2093             if bootstrap_info is None:
2094                 media_url = None
2095                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2096                 if manifest_version == '2.0':
2097                     media_url = media_el.attrib.get('href')
2098                 if media_url is None:
2099                     media_url = media_el.attrib.get('url')
2100                 if not media_url:
2101                     continue
2102                 manifest_url = (
2103                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2104                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2105                 # If media_url is itself a f4m manifest do the recursive extraction
2106                 # since bitrates in parent manifest (this one) and media_url manifest
2107                 # may differ leading to inability to resolve the format by requested
2108                 # bitrate in f4m downloader
2109                 ext = determine_ext(manifest_url)
2110                 if ext == 'f4m':
2111                     f4m_formats = self._extract_f4m_formats(
2112                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2113                         transform_source=transform_source, fatal=fatal)
2114                     # Sometimes stream-level manifest contains single media entry that
2115                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2116                     # At the same time parent's media entry in set-level manifest may
2117                     # contain it. We will copy it from parent in such cases.
2118                     if len(f4m_formats) == 1:
2119                         f = f4m_formats[0]
2120                         f.update({
2121                             'tbr': f.get('tbr') or tbr,
2122                             'width': f.get('width') or width,
2123                             'height': f.get('height') or height,
2124                             'format_id': f.get('format_id') if not tbr else format_id,
2125                             'vcodec': vcodec,
2126                         })
2127                     formats.extend(f4m_formats)
2128                     continue
2129                 elif ext == 'm3u8':
2130                     formats.extend(self._extract_m3u8_formats(
2131                         manifest_url, video_id, 'mp4', preference=preference,
2132                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2133                     continue
2134             formats.append({
2135                 'format_id': format_id,
2136                 'url': manifest_url,
2137                 'manifest_url': manifest_url,
2138                 'ext': 'flv' if bootstrap_info is not None else None,
2139                 'protocol': 'f4m',
2140                 'tbr': tbr,
2141                 'width': width,
2142                 'height': height,
2143                 'vcodec': vcodec,
2144                 'preference': preference,
2145                 'quality': quality,
2146             })
2147         return formats
2148
2149     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2150         return {
2151             'format_id': join_nonempty(m3u8_id, 'meta'),
2152             'url': m3u8_url,
2153             'ext': ext,
2154             'protocol': 'm3u8',
2155             'preference': preference - 100 if preference else -100,
2156             'quality': quality,
2157             'resolution': 'multiple',
2158             'format_note': 'Quality selection URL',
2159         }
2160
2161     def _report_ignoring_subs(self, name):
2162         self.report_warning(bug_reports_message(
2163             f'Ignoring subtitle tracks found in the {name} manifest; '
2164             'if any subtitle tracks are missing,'
2165         ), only_once=True)
2166
2167     def _extract_m3u8_formats(self, *args, **kwargs):
2168         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2169         if subs:
2170             self._report_ignoring_subs('HLS')
2171         return fmts
2172
2173     def _extract_m3u8_formats_and_subtitles(
2174             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2175             preference=None, quality=None, m3u8_id=None, note=None,
2176             errnote=None, fatal=True, live=False, data=None, headers={},
2177             query={}):
2178
2179         res = self._download_webpage_handle(
2180             m3u8_url, video_id,
2181             note='Downloading m3u8 information' if note is None else note,
2182             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2183             fatal=fatal, data=data, headers=headers, query=query)
2184
2185         if res is False:
2186             return [], {}
2187
2188         m3u8_doc, urlh = res
2189         m3u8_url = urlh.geturl()
2190
2191         return self._parse_m3u8_formats_and_subtitles(
2192             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2193             preference=preference, quality=quality, m3u8_id=m3u8_id,
2194             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2195             headers=headers, query=query, video_id=video_id)
2196
2197     def _parse_m3u8_formats_and_subtitles(
2198             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2199             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2200             errnote=None, fatal=True, data=None, headers={}, query={},
2201             video_id=None):
2202         formats, subtitles = [], {}
2203
2204         has_drm = re.search('|'.join([
2205             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2206             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2207         ]), m3u8_doc)
2208
2209         def format_url(url):
2210             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2211
2212         if self.get_param('hls_split_discontinuity', False):
2213             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2214                 if not m3u8_doc:
2215                     if not manifest_url:
2216                         return []
2217                     m3u8_doc = self._download_webpage(
2218                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2219                         note=False, errnote='Failed to download m3u8 playlist information')
2220                     if m3u8_doc is False:
2221                         return []
2222                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2223
2224         else:
2225             def _extract_m3u8_playlist_indices(*args, **kwargs):
2226                 return [None]
2227
2228         # References:
2229         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2230         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2231         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2232
2233         # We should try extracting formats only from master playlists [1, 4.3.4],
2234         # i.e. playlists that describe available qualities. On the other hand
2235         # media playlists [1, 4.3.3] should be returned as is since they contain
2236         # just the media without qualities renditions.
2237         # Fortunately, master playlist can be easily distinguished from media
2238         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2239         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2240         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2241         # media playlist and MUST NOT appear in master playlist thus we can
2242         # clearly detect media playlist with this criterion.
2243
2244         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2245             formats = [{
2246                 'format_id': join_nonempty(m3u8_id, idx),
2247                 'format_index': idx,
2248                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2249                 'ext': ext,
2250                 'protocol': entry_protocol,
2251                 'preference': preference,
2252                 'quality': quality,
2253                 'has_drm': has_drm,
2254             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2255
2256             return formats, subtitles
2257
2258         groups = {}
2259         last_stream_inf = {}
2260
2261         def extract_media(x_media_line):
2262             media = parse_m3u8_attributes(x_media_line)
2263             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2264             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2265             if not (media_type and group_id and name):
2266                 return
2267             groups.setdefault(group_id, []).append(media)
2268             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2269             if media_type == 'SUBTITLES':
2270                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2271                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2272                 # However, lack of URI has been spotted in the wild.
2273                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2274                 if not media.get('URI'):
2275                     return
2276                 url = format_url(media['URI'])
2277                 sub_info = {
2278                     'url': url,
2279                     'ext': determine_ext(url),
2280                 }
2281                 if sub_info['ext'] == 'm3u8':
2282                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2283                     # files may contain is WebVTT:
2284                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2285                     sub_info['ext'] = 'vtt'
2286                     sub_info['protocol'] = 'm3u8_native'
2287                 lang = media.get('LANGUAGE') or 'und'
2288                 subtitles.setdefault(lang, []).append(sub_info)
2289             if media_type not in ('VIDEO', 'AUDIO'):
2290                 return
2291             media_url = media.get('URI')
2292             if media_url:
2293                 manifest_url = format_url(media_url)
2294                 formats.extend({
2295                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2296                     'format_note': name,
2297                     'format_index': idx,
2298                     'url': manifest_url,
2299                     'manifest_url': m3u8_url,
2300                     'language': media.get('LANGUAGE'),
2301                     'ext': ext,
2302                     'protocol': entry_protocol,
2303                     'preference': preference,
2304                     'quality': quality,
2305                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2306                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2307
2308         def build_stream_name():
2309             # Despite specification does not mention NAME attribute for
2310             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2311             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2312             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2313             stream_name = last_stream_inf.get('NAME')
2314             if stream_name:
2315                 return stream_name
2316             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2317             # from corresponding rendition group
2318             stream_group_id = last_stream_inf.get('VIDEO')
2319             if not stream_group_id:
2320                 return
2321             stream_group = groups.get(stream_group_id)
2322             if not stream_group:
2323                 return stream_group_id
2324             rendition = stream_group[0]
2325             return rendition.get('NAME') or stream_group_id
2326
2327         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2328         # chance to detect video only formats when EXT-X-STREAM-INF tags
2329         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2330         for line in m3u8_doc.splitlines():
2331             if line.startswith('#EXT-X-MEDIA:'):
2332                 extract_media(line)
2333
2334         for line in m3u8_doc.splitlines():
2335             if line.startswith('#EXT-X-STREAM-INF:'):
2336                 last_stream_inf = parse_m3u8_attributes(line)
2337             elif line.startswith('#') or not line.strip():
2338                 continue
2339             else:
2340                 tbr = float_or_none(
2341                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2342                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2343                 manifest_url = format_url(line.strip())
2344
2345                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2346                     format_id = [m3u8_id, None, idx]
2347                     # Bandwidth of live streams may differ over time thus making
2348                     # format_id unpredictable. So it's better to keep provided
2349                     # format_id intact.
2350                     if not live:
2351                         stream_name = build_stream_name()
2352                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2353                     f = {
2354                         'format_id': join_nonempty(*format_id),
2355                         'format_index': idx,
2356                         'url': manifest_url,
2357                         'manifest_url': m3u8_url,
2358                         'tbr': tbr,
2359                         'ext': ext,
2360                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2361                         'protocol': entry_protocol,
2362                         'preference': preference,
2363                         'quality': quality,
2364                     }
2365                     resolution = last_stream_inf.get('RESOLUTION')
2366                     if resolution:
2367                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2368                         if mobj:
2369                             f['width'] = int(mobj.group('width'))
2370                             f['height'] = int(mobj.group('height'))
2371                     # Unified Streaming Platform
2372                     mobj = re.search(
2373                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2374                     if mobj:
2375                         abr, vbr = mobj.groups()
2376                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2377                         f.update({
2378                             'vbr': vbr,
2379                             'abr': abr,
2380                         })
2381                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2382                     f.update(codecs)
2383                     audio_group_id = last_stream_inf.get('AUDIO')
2384                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2385                     # references a rendition group MUST have a CODECS attribute.
2386                     # However, this is not always respected. E.g. [2]
2387                     # contains EXT-X-STREAM-INF tag which references AUDIO
2388                     # rendition group but does not have CODECS and despite
2389                     # referencing an audio group it represents a complete
2390                     # (with audio and video) format. So, for such cases we will
2391                     # ignore references to rendition groups and treat them
2392                     # as complete formats.
2393                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2394                         audio_group = groups.get(audio_group_id)
2395                         if audio_group and audio_group[0].get('URI'):
2396                             # TODO: update acodec for audio only formats with
2397                             # the same GROUP-ID
2398                             f['acodec'] = 'none'
2399                     if not f.get('ext'):
2400                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2401                     formats.append(f)
2402
2403                     # for DailyMotion
2404                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2405                     if progressive_uri:
2406                         http_f = f.copy()
2407                         del http_f['manifest_url']
2408                         http_f.update({
2409                             'format_id': f['format_id'].replace('hls-', 'http-'),
2410                             'protocol': 'http',
2411                             'url': progressive_uri,
2412                         })
2413                         formats.append(http_f)
2414
2415                 last_stream_inf = {}
2416         return formats, subtitles
2417
2418     def _extract_m3u8_vod_duration(
2419             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2420
2421         m3u8_vod = self._download_webpage(
2422             m3u8_vod_url, video_id,
2423             note='Downloading m3u8 VOD manifest' if note is None else note,
2424             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2425             fatal=False, data=data, headers=headers, query=query)
2426
2427         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2428
2429     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2430         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2431             return None
2432
2433         return int(sum(
2434             float(line[len('#EXTINF:'):].split(',')[0])
2435             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2436
2437     @staticmethod
2438     def _xpath_ns(path, namespace=None):
2439         if not namespace:
2440             return path
2441         out = []
2442         for c in path.split('/'):
2443             if not c or c == '.':
2444                 out.append(c)
2445             else:
2446                 out.append('{%s}%s' % (namespace, c))
2447         return '/'.join(out)
2448
2449     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2450         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2451         if res is False:
2452             assert not fatal
2453             return [], {}
2454
2455         smil, urlh = res
2456         smil_url = urlh.geturl()
2457
2458         namespace = self._parse_smil_namespace(smil)
2459
2460         fmts = self._parse_smil_formats(
2461             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2462         subs = self._parse_smil_subtitles(
2463             smil, namespace=namespace)
2464
2465         return fmts, subs
2466
2467     def _extract_smil_formats(self, *args, **kwargs):
2468         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2469         if subs:
2470             self._report_ignoring_subs('SMIL')
2471         return fmts
2472
2473     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2474         res = self._download_smil(smil_url, video_id, fatal=fatal)
2475         if res is False:
2476             return {}
2477
2478         smil, urlh = res
2479         smil_url = urlh.geturl()
2480
2481         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2482
2483     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2484         return self._download_xml_handle(
2485             smil_url, video_id, 'Downloading SMIL file',
2486             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2487
2488     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2489         namespace = self._parse_smil_namespace(smil)
2490
2491         formats = self._parse_smil_formats(
2492             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2493         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2494
2495         video_id = os.path.splitext(url_basename(smil_url))[0]
2496         title = None
2497         description = None
2498         upload_date = None
2499         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2500             name = meta.attrib.get('name')
2501             content = meta.attrib.get('content')
2502             if not name or not content:
2503                 continue
2504             if not title and name == 'title':
2505                 title = content
2506             elif not description and name in ('description', 'abstract'):
2507                 description = content
2508             elif not upload_date and name == 'date':
2509                 upload_date = unified_strdate(content)
2510
2511         thumbnails = [{
2512             'id': image.get('type'),
2513             'url': image.get('src'),
2514             'width': int_or_none(image.get('width')),
2515             'height': int_or_none(image.get('height')),
2516         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2517
2518         return {
2519             'id': video_id,
2520             'title': title or video_id,
2521             'description': description,
2522             'upload_date': upload_date,
2523             'thumbnails': thumbnails,
2524             'formats': formats,
2525             'subtitles': subtitles,
2526         }
2527
2528     def _parse_smil_namespace(self, smil):
2529         return self._search_regex(
2530             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2531
2532     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2533         base = smil_url
2534         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2535             b = meta.get('base') or meta.get('httpBase')
2536             if b:
2537                 base = b
2538                 break
2539
2540         formats = []
2541         rtmp_count = 0
2542         http_count = 0
2543         m3u8_count = 0
2544         imgs_count = 0
2545
2546         srcs = set()
2547         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2548         for medium in media:
2549             src = medium.get('src')
2550             if not src or src in srcs:
2551                 continue
2552             srcs.add(src)
2553
2554             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2555             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2556             width = int_or_none(medium.get('width'))
2557             height = int_or_none(medium.get('height'))
2558             proto = medium.get('proto')
2559             ext = medium.get('ext')
2560             src_ext = determine_ext(src)
2561             streamer = medium.get('streamer') or base
2562
2563             if proto == 'rtmp' or streamer.startswith('rtmp'):
2564                 rtmp_count += 1
2565                 formats.append({
2566                     'url': streamer,
2567                     'play_path': src,
2568                     'ext': 'flv',
2569                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2570                     'tbr': bitrate,
2571                     'filesize': filesize,
2572                     'width': width,
2573                     'height': height,
2574                 })
2575                 if transform_rtmp_url:
2576                     streamer, src = transform_rtmp_url(streamer, src)
2577                     formats[-1].update({
2578                         'url': streamer,
2579                         'play_path': src,
2580                     })
2581                 continue
2582
2583             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2584             src_url = src_url.strip()
2585
2586             if proto == 'm3u8' or src_ext == 'm3u8':
2587                 m3u8_formats = self._extract_m3u8_formats(
2588                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2589                 if len(m3u8_formats) == 1:
2590                     m3u8_count += 1
2591                     m3u8_formats[0].update({
2592                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2593                         'tbr': bitrate,
2594                         'width': width,
2595                         'height': height,
2596                     })
2597                 formats.extend(m3u8_formats)
2598             elif src_ext == 'f4m':
2599                 f4m_url = src_url
2600                 if not f4m_params:
2601                     f4m_params = {
2602                         'hdcore': '3.2.0',
2603                         'plugin': 'flowplayer-3.2.0.1',
2604                     }
2605                 f4m_url += '&' if '?' in f4m_url else '?'
2606                 f4m_url += urllib.parse.urlencode(f4m_params)
2607                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2608             elif src_ext == 'mpd':
2609                 formats.extend(self._extract_mpd_formats(
2610                     src_url, video_id, mpd_id='dash', fatal=False))
2611             elif re.search(r'\.ism/[Mm]anifest', src_url):
2612                 formats.extend(self._extract_ism_formats(
2613                     src_url, video_id, ism_id='mss', fatal=False))
2614             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2615                 http_count += 1
2616                 formats.append({
2617                     'url': src_url,
2618                     'ext': ext or src_ext or 'flv',
2619                     'format_id': 'http-%d' % (bitrate or http_count),
2620                     'tbr': bitrate,
2621                     'filesize': filesize,
2622                     'width': width,
2623                     'height': height,
2624                 })
2625
2626         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2627             src = medium.get('src')
2628             if not src or src in srcs:
2629                 continue
2630             srcs.add(src)
2631
2632             imgs_count += 1
2633             formats.append({
2634                 'format_id': 'imagestream-%d' % (imgs_count),
2635                 'url': src,
2636                 'ext': mimetype2ext(medium.get('type')),
2637                 'acodec': 'none',
2638                 'vcodec': 'none',
2639                 'width': int_or_none(medium.get('width')),
2640                 'height': int_or_none(medium.get('height')),
2641                 'format_note': 'SMIL storyboards',
2642             })
2643
2644         return formats
2645
2646     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2647         urls = []
2648         subtitles = {}
2649         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2650             src = textstream.get('src')
2651             if not src or src in urls:
2652                 continue
2653             urls.append(src)
2654             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2655             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2656             subtitles.setdefault(lang, []).append({
2657                 'url': src,
2658                 'ext': ext,
2659             })
2660         return subtitles
2661
2662     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2663         res = self._download_xml_handle(
2664             xspf_url, playlist_id, 'Downloading xpsf playlist',
2665             'Unable to download xspf manifest', fatal=fatal)
2666         if res is False:
2667             return []
2668
2669         xspf, urlh = res
2670         xspf_url = urlh.geturl()
2671
2672         return self._parse_xspf(
2673             xspf, playlist_id, xspf_url=xspf_url,
2674             xspf_base_url=base_url(xspf_url))
2675
2676     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2677         NS_MAP = {
2678             'xspf': 'http://xspf.org/ns/0/',
2679             's1': 'http://static.streamone.nl/player/ns/0',
2680         }
2681
2682         entries = []
2683         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2684             title = xpath_text(
2685                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2686             description = xpath_text(
2687                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2688             thumbnail = xpath_text(
2689                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2690             duration = float_or_none(
2691                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2692
2693             formats = []
2694             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2695                 format_url = urljoin(xspf_base_url, location.text)
2696                 if not format_url:
2697                     continue
2698                 formats.append({
2699                     'url': format_url,
2700                     'manifest_url': xspf_url,
2701                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2702                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2703                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2704                 })
2705             self._sort_formats(formats)
2706
2707             entries.append({
2708                 'id': playlist_id,
2709                 'title': title,
2710                 'description': description,
2711                 'thumbnail': thumbnail,
2712                 'duration': duration,
2713                 'formats': formats,
2714             })
2715         return entries
2716
2717     def _extract_mpd_formats(self, *args, **kwargs):
2718         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2719         if subs:
2720             self._report_ignoring_subs('DASH')
2721         return fmts
2722
2723     def _extract_mpd_formats_and_subtitles(
2724             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2725             fatal=True, data=None, headers={}, query={}):
2726         res = self._download_xml_handle(
2727             mpd_url, video_id,
2728             note='Downloading MPD manifest' if note is None else note,
2729             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2730             fatal=fatal, data=data, headers=headers, query=query)
2731         if res is False:
2732             return [], {}
2733         mpd_doc, urlh = res
2734         if mpd_doc is None:
2735             return [], {}
2736
2737         # We could have been redirected to a new url when we retrieved our mpd file.
2738         mpd_url = urlh.geturl()
2739         mpd_base_url = base_url(mpd_url)
2740
2741         return self._parse_mpd_formats_and_subtitles(
2742             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2743
2744     def _parse_mpd_formats(self, *args, **kwargs):
2745         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2746         if subs:
2747             self._report_ignoring_subs('DASH')
2748         return fmts
2749
2750     def _parse_mpd_formats_and_subtitles(
2751             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2752         """
2753         Parse formats from MPD manifest.
2754         References:
2755          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2756             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2757          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2758         """
2759         if not self.get_param('dynamic_mpd', True):
2760             if mpd_doc.get('type') == 'dynamic':
2761                 return [], {}
2762
2763         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2764
2765         def _add_ns(path):
2766             return self._xpath_ns(path, namespace)
2767
2768         def is_drm_protected(element):
2769             return element.find(_add_ns('ContentProtection')) is not None
2770
2771         def extract_multisegment_info(element, ms_parent_info):
2772             ms_info = ms_parent_info.copy()
2773
2774             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2775             # common attributes and elements.  We will only extract relevant
2776             # for us.
2777             def extract_common(source):
2778                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2779                 if segment_timeline is not None:
2780                     s_e = segment_timeline.findall(_add_ns('S'))
2781                     if s_e:
2782                         ms_info['total_number'] = 0
2783                         ms_info['s'] = []
2784                         for s in s_e:
2785                             r = int(s.get('r', 0))
2786                             ms_info['total_number'] += 1 + r
2787                             ms_info['s'].append({
2788                                 't': int(s.get('t', 0)),
2789                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2790                                 'd': int(s.attrib['d']),
2791                                 'r': r,
2792                             })
2793                 start_number = source.get('startNumber')
2794                 if start_number:
2795                     ms_info['start_number'] = int(start_number)
2796                 timescale = source.get('timescale')
2797                 if timescale:
2798                     ms_info['timescale'] = int(timescale)
2799                 segment_duration = source.get('duration')
2800                 if segment_duration:
2801                     ms_info['segment_duration'] = float(segment_duration)
2802
2803             def extract_Initialization(source):
2804                 initialization = source.find(_add_ns('Initialization'))
2805                 if initialization is not None:
2806                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2807
2808             segment_list = element.find(_add_ns('SegmentList'))
2809             if segment_list is not None:
2810                 extract_common(segment_list)
2811                 extract_Initialization(segment_list)
2812                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2813                 if segment_urls_e:
2814                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2815             else:
2816                 segment_template = element.find(_add_ns('SegmentTemplate'))
2817                 if segment_template is not None:
2818                     extract_common(segment_template)
2819                     media = segment_template.get('media')
2820                     if media:
2821                         ms_info['media'] = media
2822                     initialization = segment_template.get('initialization')
2823                     if initialization:
2824                         ms_info['initialization'] = initialization
2825                     else:
2826                         extract_Initialization(segment_template)
2827             return ms_info
2828
2829         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2830         formats, subtitles = [], {}
2831         stream_numbers = collections.defaultdict(int)
2832         for period in mpd_doc.findall(_add_ns('Period')):
2833             period_duration = parse_duration(period.get('duration')) or mpd_duration
2834             period_ms_info = extract_multisegment_info(period, {
2835                 'start_number': 1,
2836                 'timescale': 1,
2837             })
2838             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2839                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2840                 for representation in adaptation_set.findall(_add_ns('Representation')):
2841                     representation_attrib = adaptation_set.attrib.copy()
2842                     representation_attrib.update(representation.attrib)
2843                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2844                     mime_type = representation_attrib['mimeType']
2845                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2846
2847                     codec_str = representation_attrib.get('codecs', '')
2848                     # Some kind of binary subtitle found in some youtube livestreams
2849                     if mime_type == 'application/x-rawcc':
2850                         codecs = {'scodec': codec_str}
2851                     else:
2852                         codecs = parse_codecs(codec_str)
2853                     if content_type not in ('video', 'audio', 'text'):
2854                         if mime_type == 'image/jpeg':
2855                             content_type = mime_type
2856                         elif codecs.get('vcodec', 'none') != 'none':
2857                             content_type = 'video'
2858                         elif codecs.get('acodec', 'none') != 'none':
2859                             content_type = 'audio'
2860                         elif codecs.get('scodec', 'none') != 'none':
2861                             content_type = 'text'
2862                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2863                             content_type = 'text'
2864                         else:
2865                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2866                             continue
2867
2868                     base_url = ''
2869                     for element in (representation, adaptation_set, period, mpd_doc):
2870                         base_url_e = element.find(_add_ns('BaseURL'))
2871                         if try_call(lambda: base_url_e.text) is not None:
2872                             base_url = base_url_e.text + base_url
2873                             if re.match(r'^https?://', base_url):
2874                                 break
2875                     if mpd_base_url and base_url.startswith('/'):
2876                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2877                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2878                         if not mpd_base_url.endswith('/'):
2879                             mpd_base_url += '/'
2880                         base_url = mpd_base_url + base_url
2881                     representation_id = representation_attrib.get('id')
2882                     lang = representation_attrib.get('lang')
2883                     url_el = representation.find(_add_ns('BaseURL'))
2884                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2885                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2886                     if representation_id is not None:
2887                         format_id = representation_id
2888                     else:
2889                         format_id = content_type
2890                     if mpd_id:
2891                         format_id = mpd_id + '-' + format_id
2892                     if content_type in ('video', 'audio'):
2893                         f = {
2894                             'format_id': format_id,
2895                             'manifest_url': mpd_url,
2896                             'ext': mimetype2ext(mime_type),
2897                             'width': int_or_none(representation_attrib.get('width')),
2898                             'height': int_or_none(representation_attrib.get('height')),
2899                             'tbr': float_or_none(bandwidth, 1000),
2900                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2901                             'fps': int_or_none(representation_attrib.get('frameRate')),
2902                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2903                             'format_note': 'DASH %s' % content_type,
2904                             'filesize': filesize,
2905                             'container': mimetype2ext(mime_type) + '_dash',
2906                             **codecs
2907                         }
2908                     elif content_type == 'text':
2909                         f = {
2910                             'ext': mimetype2ext(mime_type),
2911                             'manifest_url': mpd_url,
2912                             'filesize': filesize,
2913                         }
2914                     elif content_type == 'image/jpeg':
2915                         # See test case in VikiIE
2916                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2917                         f = {
2918                             'format_id': format_id,
2919                             'ext': 'mhtml',
2920                             'manifest_url': mpd_url,
2921                             'format_note': 'DASH storyboards (jpeg)',
2922                             'acodec': 'none',
2923                             'vcodec': 'none',
2924                         }
2925                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2926                         f['has_drm'] = True
2927                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2928
2929                     def prepare_template(template_name, identifiers):
2930                         tmpl = representation_ms_info[template_name]
2931                         if representation_id is not None:
2932                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2933                         # First of, % characters outside $...$ templates
2934                         # must be escaped by doubling for proper processing
2935                         # by % operator string formatting used further (see
2936                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2937                         t = ''
2938                         in_template = False
2939                         for c in tmpl:
2940                             t += c
2941                             if c == '$':
2942                                 in_template = not in_template
2943                             elif c == '%' and not in_template:
2944                                 t += c
2945                         # Next, $...$ templates are translated to their
2946                         # %(...) counterparts to be used with % operator
2947                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2948                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2949                         t.replace('$$', '$')
2950                         return t
2951
2952                     # @initialization is a regular template like @media one
2953                     # so it should be handled just the same way (see
2954                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2955                     if 'initialization' in representation_ms_info:
2956                         initialization_template = prepare_template(
2957                             'initialization',
2958                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2959                             # $Time$ shall not be included for @initialization thus
2960                             # only $Bandwidth$ remains
2961                             ('Bandwidth', ))
2962                         representation_ms_info['initialization_url'] = initialization_template % {
2963                             'Bandwidth': bandwidth,
2964                         }
2965
2966                     def location_key(location):
2967                         return 'url' if re.match(r'^https?://', location) else 'path'
2968
2969                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2970
2971                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2972                         media_location_key = location_key(media_template)
2973
2974                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2975                         # can't be used at the same time
2976                         if '%(Number' in media_template and 's' not in representation_ms_info:
2977                             segment_duration = None
2978                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2979                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2980                                 representation_ms_info['total_number'] = int(math.ceil(
2981                                     float_or_none(period_duration, segment_duration, default=0)))
2982                             representation_ms_info['fragments'] = [{
2983                                 media_location_key: media_template % {
2984                                     'Number': segment_number,
2985                                     'Bandwidth': bandwidth,
2986                                 },
2987                                 'duration': segment_duration,
2988                             } for segment_number in range(
2989                                 representation_ms_info['start_number'],
2990                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2991                         else:
2992                             # $Number*$ or $Time$ in media template with S list available
2993                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2994                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2995                             representation_ms_info['fragments'] = []
2996                             segment_time = 0
2997                             segment_d = None
2998                             segment_number = representation_ms_info['start_number']
2999
3000                             def add_segment_url():
3001                                 segment_url = media_template % {
3002                                     'Time': segment_time,
3003                                     'Bandwidth': bandwidth,
3004                                     'Number': segment_number,
3005                                 }
3006                                 representation_ms_info['fragments'].append({
3007                                     media_location_key: segment_url,
3008                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
3009                                 })
3010
3011                             for num, s in enumerate(representation_ms_info['s']):
3012                                 segment_time = s.get('t') or segment_time
3013                                 segment_d = s['d']
3014                                 add_segment_url()
3015                                 segment_number += 1
3016                                 for r in range(s.get('r', 0)):
3017                                     segment_time += segment_d
3018                                     add_segment_url()
3019                                     segment_number += 1
3020                                 segment_time += segment_d
3021                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3022                         # No media template,
3023                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3024                         # or any YouTube dashsegments video
3025                         fragments = []
3026                         segment_index = 0
3027                         timescale = representation_ms_info['timescale']
3028                         for s in representation_ms_info['s']:
3029                             duration = float_or_none(s['d'], timescale)
3030                             for r in range(s.get('r', 0) + 1):
3031                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3032                                 fragments.append({
3033                                     location_key(segment_uri): segment_uri,
3034                                     'duration': duration,
3035                                 })
3036                                 segment_index += 1
3037                         representation_ms_info['fragments'] = fragments
3038                     elif 'segment_urls' in representation_ms_info:
3039                         # Segment URLs with no SegmentTimeline
3040                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3041                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3042                         fragments = []
3043                         segment_duration = float_or_none(
3044                             representation_ms_info['segment_duration'],
3045                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3046                         for segment_url in representation_ms_info['segment_urls']:
3047                             fragment = {
3048                                 location_key(segment_url): segment_url,
3049                             }
3050                             if segment_duration:
3051                                 fragment['duration'] = segment_duration
3052                             fragments.append(fragment)
3053                         representation_ms_info['fragments'] = fragments
3054                     # If there is a fragments key available then we correctly recognized fragmented media.
3055                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3056                     # assumption is not necessarily correct since we may simply have no support for
3057                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3058                     if 'fragments' in representation_ms_info:
3059                         f.update({
3060                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3061                             'url': mpd_url or base_url,
3062                             'fragment_base_url': base_url,
3063                             'fragments': [],
3064                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3065                         })
3066                         if 'initialization_url' in representation_ms_info:
3067                             initialization_url = representation_ms_info['initialization_url']
3068                             if not f.get('url'):
3069                                 f['url'] = initialization_url
3070                             f['fragments'].append({location_key(initialization_url): initialization_url})
3071                         f['fragments'].extend(representation_ms_info['fragments'])
3072                         if not period_duration:
3073                             period_duration = try_get(
3074                                 representation_ms_info,
3075                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3076                     else:
3077                         # Assuming direct URL to unfragmented media.
3078                         f['url'] = base_url
3079                     if content_type in ('video', 'audio', 'image/jpeg'):
3080                         f['manifest_stream_number'] = stream_numbers[f['url']]
3081                         stream_numbers[f['url']] += 1
3082                         formats.append(f)
3083                     elif content_type == 'text':
3084                         subtitles.setdefault(lang or 'und', []).append(f)
3085
3086         return formats, subtitles
3087
3088     def _extract_ism_formats(self, *args, **kwargs):
3089         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3090         if subs:
3091             self._report_ignoring_subs('ISM')
3092         return fmts
3093
3094     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3095         res = self._download_xml_handle(
3096             ism_url, video_id,
3097             note='Downloading ISM manifest' if note is None else note,
3098             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3099             fatal=fatal, data=data, headers=headers, query=query)
3100         if res is False:
3101             return [], {}
3102         ism_doc, urlh = res
3103         if ism_doc is None:
3104             return [], {}
3105
3106         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3107
3108     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3109         """
3110         Parse formats from ISM manifest.
3111         References:
3112          1. [MS-SSTR]: Smooth Streaming Protocol,
3113             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3114         """
3115         if ism_doc.get('IsLive') == 'TRUE':
3116             return [], {}
3117
3118         duration = int(ism_doc.attrib['Duration'])
3119         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3120
3121         formats = []
3122         subtitles = {}
3123         for stream in ism_doc.findall('StreamIndex'):
3124             stream_type = stream.get('Type')
3125             if stream_type not in ('video', 'audio', 'text'):
3126                 continue
3127             url_pattern = stream.attrib['Url']
3128             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3129             stream_name = stream.get('Name')
3130             stream_language = stream.get('Language', 'und')
3131             for track in stream.findall('QualityLevel'):
3132                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3133                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3134                 # TODO: add support for WVC1 and WMAP
3135                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3136                     self.report_warning('%s is not a supported codec' % fourcc)
3137                     continue
3138                 tbr = int(track.attrib['Bitrate']) // 1000
3139                 # [1] does not mention Width and Height attributes. However,
3140                 # they're often present while MaxWidth and MaxHeight are
3141                 # missing, so should be used as fallbacks
3142                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3143                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3144                 sampling_rate = int_or_none(track.get('SamplingRate'))
3145
3146                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3147                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3148
3149                 fragments = []
3150                 fragment_ctx = {
3151                     'time': 0,
3152                 }
3153                 stream_fragments = stream.findall('c')
3154                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3155                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3156                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3157                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3158                     if not fragment_ctx['duration']:
3159                         try:
3160                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3161                         except IndexError:
3162                             next_fragment_time = duration
3163                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3164                     for _ in range(fragment_repeat):
3165                         fragments.append({
3166                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3167                             'duration': fragment_ctx['duration'] / stream_timescale,
3168                         })
3169                         fragment_ctx['time'] += fragment_ctx['duration']
3170
3171                 if stream_type == 'text':
3172                     subtitles.setdefault(stream_language, []).append({
3173                         'ext': 'ismt',
3174                         'protocol': 'ism',
3175                         'url': ism_url,
3176                         'manifest_url': ism_url,
3177                         'fragments': fragments,
3178                         '_download_params': {
3179                             'stream_type': stream_type,
3180                             'duration': duration,
3181                             'timescale': stream_timescale,
3182                             'fourcc': fourcc,
3183                             'language': stream_language,
3184                             'codec_private_data': track.get('CodecPrivateData'),
3185                         }
3186                     })
3187                 elif stream_type in ('video', 'audio'):
3188                     formats.append({
3189                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3190                         'url': ism_url,
3191                         'manifest_url': ism_url,
3192                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3193                         'width': width,
3194                         'height': height,
3195                         'tbr': tbr,
3196                         'asr': sampling_rate,
3197                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3198                         'acodec': 'none' if stream_type == 'video' else fourcc,
3199                         'protocol': 'ism',
3200                         'fragments': fragments,
3201                         'has_drm': ism_doc.find('Protection') is not None,
3202                         '_download_params': {
3203                             'stream_type': stream_type,
3204                             'duration': duration,
3205                             'timescale': stream_timescale,
3206                             'width': width or 0,
3207                             'height': height or 0,
3208                             'fourcc': fourcc,
3209                             'language': stream_language,
3210                             'codec_private_data': track.get('CodecPrivateData'),
3211                             'sampling_rate': sampling_rate,
3212                             'channels': int_or_none(track.get('Channels', 2)),
3213                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3214                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3215                         },
3216                     })
3217         return formats, subtitles
3218
3219     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3220         def absolute_url(item_url):
3221             return urljoin(base_url, item_url)
3222
3223         def parse_content_type(content_type):
3224             if not content_type:
3225                 return {}
3226             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3227             if ctr:
3228                 mimetype, codecs = ctr.groups()
3229                 f = parse_codecs(codecs)
3230                 f['ext'] = mimetype2ext(mimetype)
3231                 return f
3232             return {}
3233
3234         def _media_formats(src, cur_media_type, type_info=None):
3235             type_info = type_info or {}
3236             full_url = absolute_url(src)
3237             ext = type_info.get('ext') or determine_ext(full_url)
3238             if ext == 'm3u8':
3239                 is_plain_url = False
3240                 formats = self._extract_m3u8_formats(
3241                     full_url, video_id, ext='mp4',
3242                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3243                     preference=preference, quality=quality, fatal=False)
3244             elif ext == 'mpd':
3245                 is_plain_url = False
3246                 formats = self._extract_mpd_formats(
3247                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3248             else:
3249                 is_plain_url = True
3250                 formats = [{
3251                     'url': full_url,
3252                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3253                     'ext': ext,
3254                 }]
3255             return is_plain_url, formats
3256
3257         entries = []
3258         # amp-video and amp-audio are very similar to their HTML5 counterparts
3259         # so we will include them right here (see
3260         # https://www.ampproject.org/docs/reference/components/amp-video)
3261         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3262         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3263         media_tags = [(media_tag, media_tag_name, media_type, '')
3264                       for media_tag, media_tag_name, media_type
3265                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3266         media_tags.extend(re.findall(
3267             # We only allow video|audio followed by a whitespace or '>'.
3268             # Allowing more characters may end up in significant slow down (see
3269             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3270             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3271             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3272         for media_tag, _, media_type, media_content in media_tags:
3273             media_info = {
3274                 'formats': [],
3275                 'subtitles': {},
3276             }
3277             media_attributes = extract_attributes(media_tag)
3278             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3279             if src:
3280                 f = parse_content_type(media_attributes.get('type'))
3281                 _, formats = _media_formats(src, media_type, f)
3282                 media_info['formats'].extend(formats)
3283             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3284             if media_content:
3285                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3286                     s_attr = extract_attributes(source_tag)
3287                     # data-video-src and data-src are non standard but seen
3288                     # several times in the wild
3289                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3290                     if not src:
3291                         continue
3292                     f = parse_content_type(s_attr.get('type'))
3293                     is_plain_url, formats = _media_formats(src, media_type, f)
3294                     if is_plain_url:
3295                         # width, height, res, label and title attributes are
3296                         # all not standard but seen several times in the wild
3297                         labels = [
3298                             s_attr.get(lbl)
3299                             for lbl in ('label', 'title')
3300                             if str_or_none(s_attr.get(lbl))
3301                         ]
3302                         width = int_or_none(s_attr.get('width'))
3303                         height = (int_or_none(s_attr.get('height'))
3304                                   or int_or_none(s_attr.get('res')))
3305                         if not width or not height:
3306                             for lbl in labels:
3307                                 resolution = parse_resolution(lbl)
3308                                 if not resolution:
3309                                     continue
3310                                 width = width or resolution.get('width')
3311                                 height = height or resolution.get('height')
3312                         for lbl in labels:
3313                             tbr = parse_bitrate(lbl)
3314                             if tbr:
3315                                 break
3316                         else:
3317                             tbr = None
3318                         f.update({
3319                             'width': width,
3320                             'height': height,
3321                             'tbr': tbr,
3322                             'format_id': s_attr.get('label') or s_attr.get('title'),
3323                         })
3324                         f.update(formats[0])
3325                         media_info['formats'].append(f)
3326                     else:
3327                         media_info['formats'].extend(formats)
3328                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3329                     track_attributes = extract_attributes(track_tag)
3330                     kind = track_attributes.get('kind')
3331                     if not kind or kind in ('subtitles', 'captions'):
3332                         src = strip_or_none(track_attributes.get('src'))
3333                         if not src:
3334                             continue
3335                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3336                         media_info['subtitles'].setdefault(lang, []).append({
3337                             'url': absolute_url(src),
3338                         })
3339             for f in media_info['formats']:
3340                 f.setdefault('http_headers', {})['Referer'] = base_url
3341             if media_info['formats'] or media_info['subtitles']:
3342                 entries.append(media_info)
3343         return entries
3344
3345     def _extract_akamai_formats(self, *args, **kwargs):
3346         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3347         if subs:
3348             self._report_ignoring_subs('akamai')
3349         return fmts
3350
3351     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3352         signed = 'hdnea=' in manifest_url
3353         if not signed:
3354             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3355             manifest_url = re.sub(
3356                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3357                 '', manifest_url).strip('?')
3358
3359         formats = []
3360         subtitles = {}
3361
3362         hdcore_sign = 'hdcore=3.7.0'
3363         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3364         hds_host = hosts.get('hds')
3365         if hds_host:
3366             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3367         if 'hdcore=' not in f4m_url:
3368             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3369         f4m_formats = self._extract_f4m_formats(
3370             f4m_url, video_id, f4m_id='hds', fatal=False)
3371         for entry in f4m_formats:
3372             entry.update({'extra_param_to_segment_url': hdcore_sign})
3373         formats.extend(f4m_formats)
3374
3375         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3376         hls_host = hosts.get('hls')
3377         if hls_host:
3378             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3379         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3380             m3u8_url, video_id, 'mp4', 'm3u8_native',
3381             m3u8_id='hls', fatal=False)
3382         formats.extend(m3u8_formats)
3383         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3384
3385         http_host = hosts.get('http')
3386         if http_host and m3u8_formats and not signed:
3387             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3388             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3389             qualities_length = len(qualities)
3390             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3391                 i = 0
3392                 for f in m3u8_formats:
3393                     if f['vcodec'] != 'none':
3394                         for protocol in ('http', 'https'):
3395                             http_f = f.copy()
3396                             del http_f['manifest_url']
3397                             http_url = re.sub(
3398                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3399                             http_f.update({
3400                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3401                                 'url': http_url,
3402                                 'protocol': protocol,
3403                             })
3404                             formats.append(http_f)
3405                         i += 1
3406
3407         return formats, subtitles
3408
3409     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3410         query = urllib.parse.urlparse(url).query
3411         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3412         mobj = re.search(
3413             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3414         url_base = mobj.group('url')
3415         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3416         formats = []
3417
3418         def manifest_url(manifest):
3419             m_url = f'{http_base_url}/{manifest}'
3420             if query:
3421                 m_url += '?%s' % query
3422             return m_url
3423
3424         if 'm3u8' not in skip_protocols:
3425             formats.extend(self._extract_m3u8_formats(
3426                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3427                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3428         if 'f4m' not in skip_protocols:
3429             formats.extend(self._extract_f4m_formats(
3430                 manifest_url('manifest.f4m'),
3431                 video_id, f4m_id='hds', fatal=False))
3432         if 'dash' not in skip_protocols:
3433             formats.extend(self._extract_mpd_formats(
3434                 manifest_url('manifest.mpd'),
3435                 video_id, mpd_id='dash', fatal=False))
3436         if re.search(r'(?:/smil:|\.smil)', url_base):
3437             if 'smil' not in skip_protocols:
3438                 rtmp_formats = self._extract_smil_formats(
3439                     manifest_url('jwplayer.smil'),
3440                     video_id, fatal=False)
3441                 for rtmp_format in rtmp_formats:
3442                     rtsp_format = rtmp_format.copy()
3443                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3444                     del rtsp_format['play_path']
3445                     del rtsp_format['ext']
3446                     rtsp_format.update({
3447                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3448                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3449                         'protocol': 'rtsp',
3450                     })
3451                     formats.extend([rtmp_format, rtsp_format])
3452         else:
3453             for protocol in ('rtmp', 'rtsp'):
3454                 if protocol not in skip_protocols:
3455                     formats.append({
3456                         'url': f'{protocol}:{url_base}',
3457                         'format_id': protocol,
3458                         'protocol': protocol,
3459                     })
3460         return formats
3461
3462     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3463         mobj = re.search(
3464             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3465             webpage)
3466         if mobj:
3467             try:
3468                 jwplayer_data = self._parse_json(mobj.group('options'),
3469                                                  video_id=video_id,
3470                                                  transform_source=transform_source)
3471             except ExtractorError:
3472                 pass
3473             else:
3474                 if isinstance(jwplayer_data, dict):
3475                     return jwplayer_data
3476
3477     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3478         jwplayer_data = self._find_jwplayer_data(
3479             webpage, video_id, transform_source=js_to_json)
3480         return self._parse_jwplayer_data(
3481             jwplayer_data, video_id, *args, **kwargs)
3482
3483     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3484                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3485         # JWPlayer backward compatibility: flattened playlists
3486         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3487         if 'playlist' not in jwplayer_data:
3488             jwplayer_data = {'playlist': [jwplayer_data]}
3489
3490         entries = []
3491
3492         # JWPlayer backward compatibility: single playlist item
3493         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3494         if not isinstance(jwplayer_data['playlist'], list):
3495             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3496
3497         for video_data in jwplayer_data['playlist']:
3498             # JWPlayer backward compatibility: flattened sources
3499             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3500             if 'sources' not in video_data:
3501                 video_data['sources'] = [video_data]
3502
3503             this_video_id = video_id or video_data['mediaid']
3504
3505             formats = self._parse_jwplayer_formats(
3506                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3507                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3508
3509             subtitles = {}
3510             tracks = video_data.get('tracks')
3511             if tracks and isinstance(tracks, list):
3512                 for track in tracks:
3513                     if not isinstance(track, dict):
3514                         continue
3515                     track_kind = track.get('kind')
3516                     if not track_kind or not isinstance(track_kind, str):
3517                         continue
3518                     if track_kind.lower() not in ('captions', 'subtitles'):
3519                         continue
3520                     track_url = urljoin(base_url, track.get('file'))
3521                     if not track_url:
3522                         continue
3523                     subtitles.setdefault(track.get('label') or 'en', []).append({
3524                         'url': self._proto_relative_url(track_url)
3525                     })
3526
3527             entry = {
3528                 'id': this_video_id,
3529                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3530                 'description': clean_html(video_data.get('description')),
3531                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3532                 'timestamp': int_or_none(video_data.get('pubdate')),
3533                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3534                 'subtitles': subtitles,
3535             }
3536             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3537             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3538                 entry.update({
3539                     '_type': 'url_transparent',
3540                     'url': formats[0]['url'],
3541                 })
3542             else:
3543                 self._sort_formats(formats)
3544                 entry['formats'] = formats
3545             entries.append(entry)
3546         if len(entries) == 1:
3547             return entries[0]
3548         else:
3549             return self.playlist_result(entries)
3550
3551     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3552                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3553         urls = []
3554         formats = []
3555         for source in jwplayer_sources_data:
3556             if not isinstance(source, dict):
3557                 continue
3558             source_url = urljoin(
3559                 base_url, self._proto_relative_url(source.get('file')))
3560             if not source_url or source_url in urls:
3561                 continue
3562             urls.append(source_url)
3563             source_type = source.get('type') or ''
3564             ext = mimetype2ext(source_type) or determine_ext(source_url)
3565             if source_type == 'hls' or ext == 'm3u8':
3566                 formats.extend(self._extract_m3u8_formats(
3567                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3568                     m3u8_id=m3u8_id, fatal=False))
3569             elif source_type == 'dash' or ext == 'mpd':
3570                 formats.extend(self._extract_mpd_formats(
3571                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3572             elif ext == 'smil':
3573                 formats.extend(self._extract_smil_formats(
3574                     source_url, video_id, fatal=False))
3575             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3576             elif source_type.startswith('audio') or ext in (
3577                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3578                 formats.append({
3579                     'url': source_url,
3580                     'vcodec': 'none',
3581                     'ext': ext,
3582                 })
3583             else:
3584                 height = int_or_none(source.get('height'))
3585                 if height is None:
3586                     # Often no height is provided but there is a label in
3587                     # format like "1080p", "720p SD", or 1080.
3588                     height = int_or_none(self._search_regex(
3589                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3590                         'height', default=None))
3591                 a_format = {
3592                     'url': source_url,
3593                     'width': int_or_none(source.get('width')),
3594                     'height': height,
3595                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3596                     'filesize': int_or_none(source.get('filesize')),
3597                     'ext': ext,
3598                 }
3599                 if source_url.startswith('rtmp'):
3600                     a_format['ext'] = 'flv'
3601                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3602                     # of jwplayer.flash.swf
3603                     rtmp_url_parts = re.split(
3604                         r'((?:mp4|mp3|flv):)', source_url, 1)
3605                     if len(rtmp_url_parts) == 3:
3606                         rtmp_url, prefix, play_path = rtmp_url_parts
3607                         a_format.update({
3608                             'url': rtmp_url,
3609                             'play_path': prefix + play_path,
3610                         })
3611                     if rtmp_params:
3612                         a_format.update(rtmp_params)
3613                 formats.append(a_format)
3614         return formats
3615
3616     def _live_title(self, name):
3617         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3618         return name
3619
3620     def _int(self, v, name, fatal=False, **kwargs):
3621         res = int_or_none(v, **kwargs)
3622         if res is None:
3623             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3624             if fatal:
3625                 raise ExtractorError(msg)
3626             else:
3627                 self.report_warning(msg)
3628         return res
3629
3630     def _float(self, v, name, fatal=False, **kwargs):
3631         res = float_or_none(v, **kwargs)
3632         if res is None:
3633             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3634             if fatal:
3635                 raise ExtractorError(msg)
3636             else:
3637                 self.report_warning(msg)
3638         return res
3639
3640     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3641                     path='/', secure=False, discard=False, rest={}, **kwargs):
3642         cookie = http.cookiejar.Cookie(
3643             0, name, value, port, port is not None, domain, True,
3644             domain.startswith('.'), path, True, secure, expire_time,
3645             discard, None, None, rest)
3646         self.cookiejar.set_cookie(cookie)
3647
3648     def _get_cookies(self, url):
3649         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3650         return LenientSimpleCookie(self._downloader._calc_cookies(url))
3651
3652     def _apply_first_set_cookie_header(self, url_handle, cookie):
3653         """
3654         Apply first Set-Cookie header instead of the last. Experimental.
3655
3656         Some sites (e.g. [1-3]) may serve two cookies under the same name
3657         in Set-Cookie header and expect the first (old) one to be set rather
3658         than second (new). However, as of RFC6265 the newer one cookie
3659         should be set into cookie store what actually happens.
3660         We will workaround this issue by resetting the cookie to
3661         the first one manually.
3662         1. https://new.vk.com/
3663         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3664         3. https://learning.oreilly.com/
3665         """
3666         for header, cookies in url_handle.headers.items():
3667             if header.lower() != 'set-cookie':
3668                 continue
3669             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3670             cookie_value = re.search(
3671                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3672             if cookie_value:
3673                 value, domain = cookie_value.groups()
3674                 self._set_cookie(domain, cookie, value)
3675                 break
3676
3677     @classmethod
3678     def get_testcases(cls, include_onlymatching=False):
3679         t = getattr(cls, '_TEST', None)
3680         if t:
3681             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3682             tests = [t]
3683         else:
3684             tests = getattr(cls, '_TESTS', [])
3685         for t in tests:
3686             if not include_onlymatching and t.get('only_matching', False):
3687                 continue
3688             t['name'] = cls.ie_key()
3689             yield t
3690
3691     @classmethod
3692     def get_webpage_testcases(cls):
3693         tests = getattr(cls, '_WEBPAGE_TESTS', [])
3694         for t in tests:
3695             t['name'] = cls.ie_key()
3696         return tests
3697
3698     @classproperty
3699     def age_limit(cls):
3700         """Get age limit from the testcases"""
3701         return max(traverse_obj(
3702             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3703             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3704
3705     @classmethod
3706     def is_suitable(cls, age_limit):
3707         """Test whether the extractor is generally suitable for the given age limit"""
3708         return not age_restricted(cls.age_limit, age_limit)
3709
3710     @classmethod
3711     def description(cls, *, markdown=True, search_examples=None):
3712         """Description of the extractor"""
3713         desc = ''
3714         if cls._NETRC_MACHINE:
3715             if markdown:
3716                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3717             else:
3718                 desc += f' [{cls._NETRC_MACHINE}]'
3719         if cls.IE_DESC is False:
3720             desc += ' [HIDDEN]'
3721         elif cls.IE_DESC:
3722             desc += f' {cls.IE_DESC}'
3723         if cls.SEARCH_KEY:
3724             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3725             if search_examples:
3726                 _COUNTS = ('', '5', '10', 'all')
3727                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3728         if not cls.working():
3729             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3730
3731         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3732         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3733         return f'{name}:{desc}' if desc else name
3734
3735     def extract_subtitles(self, *args, **kwargs):
3736         if (self.get_param('writesubtitles', False)
3737                 or self.get_param('listsubtitles')):
3738             return self._get_subtitles(*args, **kwargs)
3739         return {}
3740
3741     def _get_subtitles(self, *args, **kwargs):
3742         raise NotImplementedError('This method must be implemented by subclasses')
3743
3744     class CommentsDisabled(Exception):
3745         """Raise in _get_comments if comments are disabled for the video"""
3746
3747     def extract_comments(self, *args, **kwargs):
3748         if not self.get_param('getcomments'):
3749             return None
3750         generator = self._get_comments(*args, **kwargs)
3751
3752         def extractor():
3753             comments = []
3754             interrupted = True
3755             try:
3756                 while True:
3757                     comments.append(next(generator))
3758             except StopIteration:
3759                 interrupted = False
3760             except KeyboardInterrupt:
3761                 self.to_screen('Interrupted by user')
3762             except self.CommentsDisabled:
3763                 return {'comments': None, 'comment_count': None}
3764             except Exception as e:
3765                 if self.get_param('ignoreerrors') is not True:
3766                     raise
3767                 self._downloader.report_error(e)
3768             comment_count = len(comments)
3769             self.to_screen(f'Extracted {comment_count} comments')
3770             return {
3771                 'comments': comments,
3772                 'comment_count': None if interrupted else comment_count
3773             }
3774         return extractor
3775
3776     def _get_comments(self, *args, **kwargs):
3777         raise NotImplementedError('This method must be implemented by subclasses')
3778
3779     @staticmethod
3780     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3781         """ Merge subtitle items for one language. Items with duplicated URLs/data
3782         will be dropped. """
3783         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3784         ret = list(subtitle_list1)
3785         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3786         return ret
3787
3788     @classmethod
3789     def _merge_subtitles(cls, *dicts, target=None):
3790         """ Merge subtitle dictionaries, language by language. """
3791         if target is None:
3792             target = {}
3793         for d in dicts:
3794             for lang, subs in d.items():
3795                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3796         return target
3797
3798     def extract_automatic_captions(self, *args, **kwargs):
3799         if (self.get_param('writeautomaticsub', False)
3800                 or self.get_param('listsubtitles')):
3801             return self._get_automatic_captions(*args, **kwargs)
3802         return {}
3803
3804     def _get_automatic_captions(self, *args, **kwargs):
3805         raise NotImplementedError('This method must be implemented by subclasses')
3806
3807     @functools.cached_property
3808     def _cookies_passed(self):
3809         """Whether cookies have been passed to YoutubeDL"""
3810         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3811
3812     def mark_watched(self, *args, **kwargs):
3813         if not self.get_param('mark_watched', False):
3814             return
3815         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3816             self._mark_watched(*args, **kwargs)
3817
3818     def _mark_watched(self, *args, **kwargs):
3819         raise NotImplementedError('This method must be implemented by subclasses')
3820
3821     def geo_verification_headers(self):
3822         headers = {}
3823         geo_verification_proxy = self.get_param('geo_verification_proxy')
3824         if geo_verification_proxy:
3825             headers['Ytdl-request-proxy'] = geo_verification_proxy
3826         return headers
3827
3828     @staticmethod
3829     def _generic_id(url):
3830         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3831
3832     def _generic_title(self, url='', webpage='', *, default=None):
3833         return (self._og_search_title(webpage, default=None)
3834                 or self._html_extract_title(webpage, default=None)
3835                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3836                 or default)
3837
3838     @staticmethod
3839     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3840         all_known = all(map(
3841             lambda x: x is not None,
3842             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3843         return (
3844             'private' if is_private
3845             else 'premium_only' if needs_premium
3846             else 'subscriber_only' if needs_subscription
3847             else 'needs_auth' if needs_auth
3848             else 'unlisted' if is_unlisted
3849             else 'public' if all_known
3850             else None)
3851
3852     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3853         '''
3854         @returns            A list of values for the extractor argument given by "key"
3855                             or "default" if no such key is present
3856         @param default      The default value to return when the key is not present (default: [])
3857         @param casesense    When false, the values are converted to lower case
3858         '''
3859         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3860         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3861         if val is None:
3862             return [] if default is NO_DEFAULT else default
3863         return list(val) if casesense else [x.lower() for x in val]
3864
3865     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3866         if not playlist_id or not video_id:
3867             return not video_id
3868
3869         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3870         if no_playlist is not None:
3871             return not no_playlist
3872
3873         video_id = '' if video_id is True else f' {video_id}'
3874         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3875         if self.get_param('noplaylist'):
3876             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3877             return False
3878         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3879         return True
3880
3881     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3882         RetryManager.report_retry(
3883             err, _count or int(fatal), _retries,
3884             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3885             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3886
3887     def RetryManager(self, **kwargs):
3888         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3889
3890     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3891         display_id = traverse_obj(info_dict, 'display_id', 'id')
3892         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3893         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3894             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3895
3896     @classmethod
3897     def extract_from_webpage(cls, ydl, url, webpage):
3898         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3899               else ydl.get_info_extractor(cls.ie_key()))
3900         for info in ie._extract_from_webpage(url, webpage) or []:
3901             # url = None since we do not want to set (webpage/original)_url
3902             ydl.add_default_extra_info(info, ie, None)
3903             yield info
3904
3905     @classmethod
3906     def _extract_from_webpage(cls, url, webpage):
3907         for embed_url in orderedSet(
3908                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3909             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3910
3911     @classmethod
3912     def _extract_embed_urls(cls, url, webpage):
3913         """@returns all the embed urls on the webpage"""
3914         if '_EMBED_URL_RE' not in cls.__dict__:
3915             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3916             for idx, regex in enumerate(cls._EMBED_REGEX):
3917                 assert regex.count('(?P<url>') == 1, \
3918                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3919             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3920
3921         for regex in cls._EMBED_URL_RE:
3922             for mobj in regex.finditer(webpage):
3923                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3924                 if cls._VALID_URL is False or cls.suitable(embed_url):
3925                     yield embed_url
3926
3927     class StopExtraction(Exception):
3928         pass
3929
3930     @classmethod
3931     def _extract_url(cls, webpage):  # TODO: Remove
3932         """Only for compatibility with some older extractors"""
3933         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3934
3935     @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """
        Subclass hook. When a subclass is declared with a "plugin_name"
        class keyword, it takes the place of the class it derives from.
        Remaining class keywords are passed on to the parent hook.
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class following cls in the MRO is the one being overridden;
            # it is remembered as cls.__wrapped__
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            # Name becomes "<overridden IE_NAME>+<plugin_name>"; ie_key is taken
            # from the overridden class
            cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
            # Follow the __wrapped__ chain down to the class that is not itself
            # an override
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind that class' name in its defining module to the new class
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)

        return super().__init_subclass__(**kwargs)
3946
3947
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|<positive number>):{query}
    where an empty prefix requests a single result and "all" requests
    _MAX_RESULTS results.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Translate the prefix of the search URL into a result count and fetch that many results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Cap the request at what the site can deliver
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice needs None, not infinity, to mean "no limit"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError(
            'This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3990
3991
class UnsupportedURLIE(InfoExtractor):
    """
    Extractor whose _VALID_URL matches any URL and whose extraction always
    raises UnsupportedError. It is not enabled (_ENABLED = False) and
    has no description (IE_DESC = False)
    """

    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        raise UnsupportedError(url)