yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader import FileDownloader
  27 from ..downloader.f4m import get_base_url, remove_encrypted_media
  28 from ..utils import (
  29     IDENTITY,
  30     JSON_LD_RE,
  31     NO_DEFAULT,
  32     ExtractorError,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     LenientJSONDecoder,
  36     RegexNotFoundError,
  37     RetryManager,
  38     UnsupportedError,
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     classproperty,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitize_url,
  68     sanitized_Request,
  69     str_or_none,
  70     str_to_int,
  71     strip_or_none,
  72     traverse_obj,
  73     try_call,
  74     try_get,
  75     unescapeHTML,
  76     unified_strdate,
  77     unified_timestamp,
  78     update_Request,
  79     update_url_query,
  80     url_basename,
  81     url_or_none,
  82     urljoin,
  83     variadic,
  84     xpath_element,
  85     xpath_text,
  86     xpath_with_ns,
  87 )
  88
  89
  90 class InfoExtractor:
  91     """Information Extractor class.
  92
  93     Information extractors are the classes that, given a URL, extract
  94     information about the video (or videos) the URL refers to. This
  95     information includes the real video URL, the video title, author and
  96     others. The information is stored in a dictionary which is then
  97     passed to the YoutubeDL. The YoutubeDL processes this
  98     information possibly downloading the video to the file system, among
  99     other possible outcomes.
 100
 101     The type field determines the type of the result.
 102     By far the most common value (and the default if _type is missing) is
 103     "video", which indicates a single video.
 104
 105     For a video, the dictionaries must include the following fields:
 106
 107     id:             Video identifier.
 108     title:          Video title, unescaped. Set to an empty string if video has
 109                     no title as opposed to "None" which signifies that the
 110                     extractor failed to obtain a title
 111
 112     Additionally, it must contain either a formats entry or a url one:
 113
 114     formats:        A list of dictionaries for each format available, ordered
 115                     from worst to best quality.
 116
 117                     Potential fields:
 118                     * url        The mandatory URL representing the media:
 119                                    for plain file media - HTTP URL of this file,
 120                                    for RTMP - RTMP URL,
 121                                    for HLS - URL of the M3U8 media playlist,
 122                                    for HDS - URL of the F4M manifest,
 123                                    for DASH
 124                                      - HTTP URL to plain file media (in case of
 125                                        unfragmented media)
 126                                      - URL of the MPD manifest or base URL
 127                                        representing the media if MPD manifest
 128                                        is parsed from a string (in case of
 129                                        fragmented media)
 130                                    for MSS - URL of the ISM manifest.
 131                     * manifest_url
 132                                  The URL of the manifest file in case of
 133                                  fragmented media:
 134                                    for HLS - URL of the M3U8 master playlist,
 135                                    for HDS - URL of the F4M manifest,
 136                                    for DASH - URL of the MPD manifest,
 137                                    for MSS - URL of the ISM manifest.
 138                     * manifest_stream_number  (For internal use only)
 139                                  The index of the stream in the manifest file
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
 143                                  Calculated from the format_id, width, height.
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
 153                     * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * audio_channels  Number of audio channels
 160                     * vbr        Average video bitrate in KBit/s
 161                     * fps        Frame rate
 162                     * vcodec     Name of the video codec in use
 163                     * container  Name of the container format
 164                     * filesize   The number of bytes, if known in advance
 165                     * filesize_approx  An estimate for the number of bytes
 166                     * player_url SWF Player URL (used for rtmpdump).
 167                     * protocol   The protocol that will be used for the actual
 168                                  download, lower-case. One of "http", "https" or
 169                                  one of the protocols defined in downloader.PROTOCOL_MAP
 170                     * fragment_base_url
 171                                  Base URL for fragments. Each fragment's path
 172                                  value (if present) will be relative to
 173                                  this URL.
 174                     * fragments  A list of fragments of a fragmented media.
 175                                  Each fragment entry must contain either an url
 176                                  or a path. If an url is present it should be
 177                                  considered by a client. Otherwise both path and
 178                                  fragment_base_url must be present. Here is
 179                                  the list of all potential fields:
 180                                  * "url" - fragment's URL
 181                                  * "path" - fragment's path relative to
 182                                             fragment_base_url
 183                                  * "duration" (optional, int or float)
 184                                  * "filesize" (optional, int)
 185                     * is_from_start  Is a live format that can be downloaded
 186                                 from the start. Boolean
 187                     * preference Order number of this format. If this field is
 188                                  present and not None, the formats get sorted
 189                                  by this field, regardless of all other values.
 190                                  -1 for default (order by other properties),
 191                                  -2 or smaller for less than default.
 192                                  < -1000 to hide the format (if there is
 193                                     another one which is strictly better)
 194                     * language   Language code, e.g. "de" or "en-US".
 195                     * language_preference  Is this in the language mentioned in
 196                                  the URL?
 197                                  10 if it's what the URL is about,
 198                                  -1 for default (don't know),
 199                                  -10 otherwise, other values reserved for now.
 200                     * quality    Order number of the video quality of this
 201                                  format, irrespective of the file format.
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                     * source_preference  Order number for this video source
 205                                   (quality takes higher priority)
 206                                  -1 for default (order by other properties),
 207                                  -2 or smaller for less than default.
 208                     * http_headers  A dictionary of additional HTTP headers
 209                                  to add to the request.
 210                     * stretched_ratio  If given and not 1, indicates that the
 211                                  video's pixels are not square.
 212                                  width : height ratio as float.
 213                     * no_resume  The server does not support resuming the
 214                                  (HTTP or RTMP) download. Boolean.
 215                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 216                     * downloader_options  A dictionary of downloader options
 217                                  (For internal use only)
 218                                  * http_chunk_size Chunk size for HTTP downloads
 219                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 220                     RTMP formats can also have the additional fields: page_url,
 221                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 222                     rtmp_protocol, rtmp_real_time
 223
 224     url:            Final video URL.
 225     ext:            Video filename extension.
 226     format:         The video format, defaults to ext (used for --get-format)
 227     player_url:     SWF Player URL (used for rtmpdump).
 228
 229     The following fields are optional:
 230
 231     direct:         True if a direct video file was given (must only be set by GenericIE)
 232     alt_title:      A secondary title of the video.
 233     display_id      An alternative identifier for the video, not necessarily
 234                     unique, but available before title. Typically, id is
 235                     something like "4234987", title "Dancing naked mole rats",
 236                     and display_id "dancing-naked-mole-rats"
 237     thumbnails:     A list of dictionaries, with the following entries:
 238                         * "id" (optional, string) - Thumbnail format ID
 239                         * "url"
 240                         * "preference" (optional, int) - quality of the image
 241                         * "width" (optional, int)
 242                         * "height" (optional, int)
 243                         * "resolution" (optional, string "{width}x{height}",
 244                                         deprecated)
 245                         * "filesize" (optional, int)
 246                         * "http_headers" (dict) - HTTP headers for the request
 247     thumbnail:      Full URL to a video thumbnail image.
 248     description:    Full video description.
 249     uploader:       Full name of the video uploader.
 250     license:        License name the video is licensed under.
 251     creator:        The creator of the video.
 252     timestamp:      UNIX timestamp of the moment the video was uploaded
 253     upload_date:    Video upload date in UTC (YYYYMMDD).
 254                     If not explicitly set, calculated from timestamp
 255     release_timestamp: UNIX timestamp of the moment the video was released.
 256                     If it is not clear whether to use timestamp or this, use the former
 257     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 258                     If not explicitly set, calculated from release_timestamp
 259     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 260     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 261                     If not explicitly set, calculated from modified_timestamp
 262     uploader_id:    Nickname or id of the video uploader.
 263     uploader_url:   Full URL to a personal webpage of the video uploader.
 264     channel:        Full name of the channel the video is uploaded on.
 265                     Note that channel fields may or may not repeat uploader
 266                     fields. This depends on a particular extractor.
 267     channel_id:     Id of the channel.
 268     channel_url:    Full URL to a channel webpage.
 269     channel_follower_count: Number of followers of the channel.
 270     location:       Physical location where the video was filmed.
 271     subtitles:      The available subtitles as a dictionary in the format
 272                     {tag: subformats}. "tag" is usually a language code, and
 273                     "subformats" is a list sorted from lower to higher
 274                     preference, each element is a dictionary with the "ext"
 275                     entry and one of:
 276                         * "data": The subtitles file contents
 277                         * "url": A URL pointing to the subtitles file
 278                     It can optionally also have:
 279                         * "name": Name or description of the subtitles
 280                         * "http_headers": A dictionary of additional HTTP headers
 281                                   to add to the request.
 282                     "ext" will be calculated from URL if missing
 283     automatic_captions: Like 'subtitles'; contains automatically generated
 284                     captions instead of normal subtitles
 285     duration:       Length of the video in seconds, as an integer or float.
 286     view_count:     How many users have watched the video on the platform.
 287     concurrent_view_count: How many users are currently watching the video on the platform.
 288     like_count:     Number of positive ratings of the video
 289     dislike_count:  Number of negative ratings of the video
 290     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 292     comment_count:  Number of comments on the video
 293     comments:       A list of comments, each with one or more of the following
 294                     properties (all but one of text or html optional):
 295                         * "author" - human-readable name of the comment author
 296                         * "author_id" - user ID of the comment author
 297                         * "author_thumbnail" - The thumbnail of the comment author
 298                         * "id" - Comment ID
 299                         * "html" - Comment as HTML
 300                         * "text" - Plain text of the comment
 301                         * "timestamp" - UNIX timestamp of comment
 302                         * "parent" - ID of the comment this one is replying to.
 303                                      Set to "root" to indicate that this is a
 304                                      comment to the original video.
 305                         * "like_count" - Number of positive ratings of the comment
 306                         * "dislike_count" - Number of negative ratings of the comment
 307                         * "is_favorited" - Whether the comment is marked as
 308                                            favorite by the video uploader
 309                         * "author_is_uploader" - Whether the comment is made by
 310                                                  the video uploader
 311     age_limit:      Age restriction for the video, as an integer (years)
 312     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 313                     should allow to get the same result again. (It will be set
 314                     by YoutubeDL if it's missing)
 315     categories:     A list of categories that the video falls in, for example
 316                     ["Sports", "Berlin"]
 317     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 318     cast:           A list of the video cast
 319     is_live:        True, False, or None (=unknown). Whether this video is a
 320                     live stream that goes on instead of a fixed-length video.
 321     was_live:       True, False, or None (=unknown). Whether this video was
 322                     originally a live stream.
 323     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 324                     or 'post_live' (was live, but VOD is not yet processed)
 325                     If absent, automatically set from is_live, was_live
 326     start_time:     Time in seconds where the reproduction should start, as
 327                     specified in the URL.
 328     end_time:       Time in seconds where the reproduction should end, as
 329                     specified in the URL.
 330     chapters:       A list of dictionaries, with the following entries:
 331                         * "start_time" - The start time of the chapter in seconds
 332                         * "end_time" - The end time of the chapter in seconds
 333                         * "title" (optional, string)
 334     playable_in_embed: Whether this video is allowed to play in embedded
 335                     players on other sites. Can be True (=always allowed),
 336                     False (=never allowed), None (=unknown), or a string
 337                     specifying the criteria for embedability; e.g. 'whitelist'
 338     availability:   Under what condition the video is available. One of
 339                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 340                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 341                     to set it
 342     _old_archive_ids: A list of old archive ids needed for backward compatibility
 343     __post_extractor: A function to be called just before the metadata is
 344                     written to either disk, logger or console. The function
 345                     must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
 347                     time-consuming to extract. Note that the fields thus
 348                     extracted will not be available to output template and
 349                     match_filter. So, only "comments" and "comment_count" are
 350                     currently allowed to be extracted via this method.
 351
 352     The following fields should only be used when the video belongs to some logical
 353     chapter or section:
 354
 355     chapter:        Name or title of the chapter the video belongs to.
 356     chapter_number: Number of the chapter the video belongs to, as an integer.
 357     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 358
 359     The following fields should only be used when the video is an episode of some
 360     series, programme or podcast:
 361
 362     series:         Title of the series or programme the video episode belongs to.
 363     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 364     season:         Title of the season the video episode belongs to.
 365     season_number:  Number of the season the video episode belongs to, as an integer.
 366     season_id:      Id of the season the video episode belongs to, as a unicode string.
 367     episode:        Title of the video episode. Unlike mandatory video title field,
 368                     this field should denote the exact title of the video episode
 369                     without any kind of decoration.
 370     episode_number: Number of the video episode within a season, as an integer.
 371     episode_id:     Id of the video episode, as a unicode string.
 372
 373     The following fields should only be used when the media is a track or a part of
 374     a music album:
 375
 376     track:          Title of the track.
 377     track_number:   Number of the track within an album or a disc, as an integer.
 378     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 379                     as a unicode string.
 380     artist:         Artist(s) of the track.
 381     genre:          Genre(s) of the track.
 382     album:          Title of the album the track belongs to.
 383     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 384     album_artist:   List of all artists appeared on the album (e.g.
 385                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 386                     and compilations).
 387     disc_number:    Number of the disc or other physical medium the track belongs to,
 388                     as an integer.
 389     release_year:   Year (YYYY) when the album was released.
 390     composer:       Composer of the piece
 391
 392     The following fields should only be set for clips that should be cut from the original video:
 393
 394     section_start:  Start time of the section in seconds
 395     section_end:    End time of the section in seconds
 396
 397     The following fields should only be set for storyboards:
 398     rows:           Number of rows in each storyboard fragment, as an integer
 399     columns:        Number of columns in each storyboard fragment, as an integer
 400
 401     Unless mentioned otherwise, the fields should be Unicode strings.
 402
 403     Unless mentioned otherwise, None is equivalent to absence of information.
 404
 405
 406     _type "playlist" indicates multiple videos.
 407     There must be a key "entries", which is a list, an iterable, or a PagedList
 408     object, each element of which is a valid dictionary by this specification.
 409
 410     Additionally, playlists can have "id", "title", and any other relevant
 411     attributes with the same semantics as videos (see above).
 412
 413     It can also have the following optional fields:
 414
 415     playlist_count: The total number of videos in a playlist. If not given,
 416                     YoutubeDL tries to calculate it from "entries"
 417
 418
 419     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 421     It must have an entries key like a playlist and contain all the keys
 422     required for a video at the same time.
 423
 424
 425     _type "url" indicates that the video must be extracted from another
 426     location, possibly by a different extractor. Its only required key is:
 427     "url" - the next URL to extract.
 428     The key "ie_key" can be set to the class name (minus the trailing "IE",
 429     e.g. "Youtube") if the extractor class is known in advance.
 430     Additionally, the dictionary may have any properties of the resolved entity
 431     known in advance, for example "title" if the title of the referred video is
 432     known ahead of time.
 433
 434
 435     _type "url_transparent" entities have the same specification as "url", but
 436     indicate that the given additional information is more precise than the one
 437     associated with the resolved URL.
 438     This is useful when a site employs a video service that hosts the video and
 439     its technical metadata, but that video service does not embed a useful
 440     title, description etc.
 441
 442
 443     Subclasses of this should also be added to the list of extractors and
 444     should define a _VALID_URL regexp and, re-define the _real_extract() and
 445     (optionally) _real_initialize() methods.
 446
 447     Subclasses may also override suitable() if necessary, but ensure the function
 448     signature is preserved and that this function imports everything it needs
 449     (except other extractors), so that lazy_extractors works correctly.
 450
 451     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 452     the HTML of Generic webpages. It may also override _extract_embed_urls
 453     or _extract_from_webpage as necessary. While these are normally classmethods,
 454     _extract_from_webpage is allowed to be an instance method.
 455
 456     _extract_from_webpage may raise self.StopExtraction() to stop further
 457     processing of the webpage and obtain exclusive rights to it. This is useful
 458     when the extractor cannot reliably be matched using just the URL,
 459     e.g. invidious/peertube instances
 460
 461     Embed-only extractors can be defined by setting _VALID_URL = False.
 462
 463     To support username + password (or netrc) login, the extractor must define a
 464     _NETRC_MACHINE and re-define _perform_login(username, password) and
 465     (optionally) _initialize_pre_login() methods. The _perform_login method will
 466     be called between _initialize_pre_login and _real_initialize if credentials
 467     are passed by the user. In cases where it is necessary to have the login
 468     process as part of the extraction rather than initialization, _perform_login
 469     can be left undefined.
 470
 471     _GEO_BYPASS attribute may be set to False in order to disable
 472     geo restriction bypass mechanisms for a particular extractor.
 473     Though it won't disable explicit geo restriction bypass based on
 474     country code provided with geo_bypass_country.
 475
 476     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 477     countries for this extractor. One of these countries will be used by
 478     geo restriction bypass mechanism right away in order to bypass
 479     geo restriction, of course, if the mechanism is not disabled.
 480
 481     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 482     IP blocks in CIDR notation for this extractor. One of these IP blocks
 483     will be used by geo restriction bypass mechanism similarly
 484     to _GEO_COUNTRIES.
 485
 486     The _ENABLED attribute should be set to False for IEs that
 487     are disabled by default and must be explicitly enabled.
 488
 489     The _WORKING attribute should be set to False for broken IEs
 490     in order to warn the users and skip the tests.
 491     """
 492
    # Class-level defaults. Subclasses override the upper-case ones as needed.

    # Set to True at the end of initialize() so the one-time setup
    # (pre-login, login, _real_initialize) is not repeated
    _ready = False
    # Presumably the YoutubeDL instance attached via set_downloader()
    # (set_downloader is not visible in this part of the file - confirm)
    _downloader = None
    # _initialize_geo_bypass() only does its work while this is unset
    _x_forwarded_for_ip = None
    # May be set to False to disable the geo restriction bypass mechanisms
    # for a particular extractor
    _GEO_BYPASS = True
    # List of presumably geo unrestricted countries; passed to
    # _initialize_geo_bypass() by initialize()
    _GEO_COUNTRIES = None
    # List of presumably geo unrestricted IP blocks in CIDR notation; passed to
    # _initialize_geo_bypass() by initialize()
    _GEO_IP_BLOCKS = None
    # Should be set to False for broken IEs; returned by working()
    _WORKING = True
    # Should be set to False for IEs that are disabled by default
    _ENABLED = True
    # supports_login() is true exactly when this is truthy; also used as the
    # default machine name in _login_hint()
    _NETRC_MACHINE = None
    # initialize() skips the "login not supported" warning when this is False
    IE_DESC = None
    SEARCH_KEY = None
    # Regexp matched against URLs by _match_valid_url();
    # False marks an embed-only extractor
    _VALID_URL = None
    # Regexes searched for in the HTML of Generic webpages.
    # NOTE: this list object is shared by every subclass that does not assign
    # its own, so it should be replaced rather than mutated in place
    _EMBED_REGEX = []
 506
 507     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 508         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 509         return {
 510             None: '',
 511             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 512             'password': f'Use {password_hint}',
 513             'cookies': (
 514                 'Use --cookies-from-browser or --cookies for the authentication. '
 515                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 516         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 517
 518     def __init__(self, downloader=None):
 519         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 520         If a downloader is not passed during initialization,
 521         it must be set using "set_downloader()" before "extract()" is called"""
 522         self._ready = False
 523         self._x_forwarded_for_ip = None
 524         self._printed_messages = set()
 525         self.set_downloader(downloader)
 526
 527     @classmethod
 528     def _match_valid_url(cls, url):
 529         if cls._VALID_URL is False:
 530             return None
 531         # This does not use has/getattr intentionally - we want to know whether
 532         # we have cached the regexp for *this* class, whereas getattr would also
 533         # match the superclass
 534         if '_VALID_URL_RE' not in cls.__dict__:
 535             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 536         return cls._VALID_URL_RE.match(url)
 537
 538     @classmethod
 539     def suitable(cls, url):
 540         """Receives a URL and returns True if suitable for this IE."""
 541         # This function must import everything it needs (except other extractors),
 542         # so that lazy_extractors works correctly
 543         return cls._match_valid_url(url) is not None
 544
 545     @classmethod
 546     def _match_id(cls, url):
 547         return cls._match_valid_url(url).group('id')
 548
 549     @classmethod
 550     def get_temp_id(cls, url):
 551         try:
 552             return cls._match_id(url)
 553         except (IndexError, AttributeError):
 554             return None
 555
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        _WORKING should be set to False for broken IEs, so this returns
        whatever the class (or a superclass) has assigned to it.
        """
        return cls._WORKING
 560
    @classmethod
    def supports_login(cls):
        """Return True if the extractor defines a (truthy) _NETRC_MACHINE."""
        return bool(cls._NETRC_MACHINE)
 564
 565     def initialize(self):
 566         """Initializes an instance (authentication, etc)."""
 567         self._printed_messages = set()
 568         self._initialize_geo_bypass({
 569             'countries': self._GEO_COUNTRIES,
 570             'ip_blocks': self._GEO_IP_BLOCKS,
 571         })
 572         if not self._ready:
 573             self._initialize_pre_login()
 574             if self.supports_login():
 575                 username, password = self._get_login_info()
 576                 if username:
 577                     self._perform_login(username, password)
 578             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 579                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 580             self._real_initialize()
 581             self._ready = True
 582
 583     def _initialize_geo_bypass(self, geo_bypass_context):
 584         """
 585         Initialize geo restriction bypass mechanism.
 586
 587         This method is used to initialize geo bypass mechanism based on faking
 588         X-Forwarded-For HTTP header. A random country from provided country list
 589         is selected and a random IP belonging to this country is generated. This
 590         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 591         HTTP requests.
 592
 593         This method will be used for initial geo bypass mechanism initialization
 594         during the instance initialization with _GEO_COUNTRIES and
 595         _GEO_IP_BLOCKS.
 596
 597         You may also manually call it from extractor's code if geo bypass
 598         information is not available beforehand (e.g. obtained during
 599         extraction) or due to some other reason. In this case you should pass
 600         this information in geo bypass context passed as first argument. It may
 601         contain following fields:
 602
 603         countries:  List of geo unrestricted countries (similar
 604                     to _GEO_COUNTRIES)
 605         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 606                     (similar to _GEO_IP_BLOCKS)
 607
 608         """
 609         if not self._x_forwarded_for_ip:
 610
 611             # Geo bypass mechanism is explicitly disabled by user
 612             if not self.get_param('geo_bypass', True):
 613                 return
 614
 615             if not geo_bypass_context:
 616                 geo_bypass_context = {}
 617
 618             # Backward compatibility: previously _initialize_geo_bypass
 619             # expected a list of countries, some 3rd party code may still use
 620             # it this way
 621             if isinstance(geo_bypass_context, (list, tuple)):
 622                 geo_bypass_context = {
 623                     'countries': geo_bypass_context,
 624                 }
 625
 626             # The whole point of geo bypass mechanism is to fake IP
 627             # as X-Forwarded-For HTTP header based on some IP block or
 628             # country code.
 629
 630             # Path 1: bypassing based on IP block in CIDR notation
 631
 632             # Explicit IP block specified by user, use it right away
 633             # regardless of whether extractor is geo bypassable or not
 634             ip_block = self.get_param('geo_bypass_ip_block', None)
 635
 636             # Otherwise use random IP block from geo bypass context but only
 637             # if extractor is known as geo bypassable
 638             if not ip_block:
 639                 ip_blocks = geo_bypass_context.get('ip_blocks')
 640                 if self._GEO_BYPASS and ip_blocks:
 641                     ip_block = random.choice(ip_blocks)
 642
 643             if ip_block:
 644                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 645                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 646                 return
 647
 648             # Path 2: bypassing based on country code
 649
 650             # Explicit country code specified by user, use it right away
 651             # regardless of whether extractor is geo bypassable or not
 652             country = self.get_param('geo_bypass_country', None)
 653
 654             # Otherwise use random country code from geo bypass context but
 655             # only if extractor is known as geo bypassable
 656             if not country:
 657                 countries = geo_bypass_context.get('countries')
 658                 if self._GEO_BYPASS and countries:
 659                     country = random.choice(countries)
 660
 661             if country:
 662                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 663                 self._downloader.write_debug(
 664                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 665
    def extract(self, url):
        """Extracts URL information and returns the result of _real_extract().

        The instance is initialized first. A None result is passed through
        unchanged; otherwise the fake X-Forwarded-For IP (if any) is recorded
        in the result under '__x_forwarded_for_ip' before it is returned.

        Raises ExtractorError (or the same subclass that was caught) with
        video ID, IE name and traceback filled in; IncompleteRead, KeyError
        and StopIteration are converted to ExtractorError.
        """
        try:
            # At most two attempts: a second one is made only after a geo
            # restriction error for which a fake IP could be set up
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    # When the result has a non-empty 'subtitles' dict, the
                    # pops below modify it in place
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            # Passed through untouched (not re-wrapped by the handler below)
            raise
        except ExtractorError as e:
            # Re-raise as the same exception type, filling in whatever the
            # original error did not carry (video ID, traceback)
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except http.client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 704
 705     def __maybe_fake_ip_and_retry(self, countries):
 706         if (not self.get_param('geo_bypass_country', None)
 707                 and self._GEO_BYPASS
 708                 and self.get_param('geo_bypass', True)
 709                 and not self._x_forwarded_for_ip
 710                 and countries):
 711             country_code = random.choice(countries)
 712             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 713             if self._x_forwarded_for_ip:
 714                 self.report_warning(
 715                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 716                     % (self._x_forwarded_for_ip, country_code.upper()))
 717                 return True
 718         return False
 719
 720     def set_downloader(self, downloader):
 721         """Sets a YoutubeDL instance as the downloader for this IE."""
 722         self._downloader = downloader
 723
 724     @property
 725     def cache(self):
 726         return self._downloader.cache
 727
 728     @property
 729     def cookiejar(self):
 730         return self._downloader.cookiejar
 731
 732     def _initialize_pre_login(self):
 733         """ Initialization before login. Redefine in subclasses."""
 734         pass
 735
 736     def _perform_login(self, username, password):
 737         """ Login with username and password. Redefine in subclasses."""
 738         pass
 739
 740     def _real_initialize(self):
 741         """Real initialization process. Redefine in subclasses."""
 742         pass
 743
 744     def _real_extract(self, url):
 745         """Real extraction process. Redefine in subclasses."""
 746         raise NotImplementedError('This method must be implemented by subclasses')
 747
 748     @classmethod
 749     def ie_key(cls):
 750         """A string for getting the InfoExtractor with get_info_extractor"""
 751         return cls.__name__[:-2]
 752
 753     @classproperty
 754     def IE_NAME(cls):
 755         return cls.__name__[:-2]
 756
 757     @staticmethod
 758     def __can_accept_status_code(err, expected_status):
 759         assert isinstance(err, urllib.error.HTTPError)
 760         if expected_status is None:
 761             return False
 762         elif callable(expected_status):
 763             return expected_status(err.code) is True
 764         else:
 765             return err.code in variadic(expected_status)
 766
 767     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 768         if isinstance(url_or_request, urllib.request.Request):
 769             return update_Request(url_or_request, data=data, headers=headers, query=query)
 770         if query:
 771             url_or_request = update_url_query(url_or_request, query)
 772         return sanitized_Request(url_or_request, data, headers or {})
 773
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage_handle docstring for arguments specification.

        On a network error, False is returned when errnote is False or when
        fatal is false (a warning is reported in the latter case); otherwise
        ExtractorError is raised. HTTP errors whose status code is accepted by
        expected_status yield the error's response object instead.
        """
        # Sleep before every request except the first one made through this
        # downloader ('sleep_interval_requests' param)
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        # note=None prints the default message, note=False prints nothing
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Work on a copy so the caller's headers dict is not modified;
            # an explicitly passed X-Forwarded-For header takes precedence
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, urllib.error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            # errnote=False silences the error entirely, regardless of fatal
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False
 828
 829     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 830                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 831         """
 832         Return a tuple (page content as string, URL handle).
 833
 834         Arguments:
 835         url_or_request -- plain text URL as a string or
 836             a urllib.request.Request object
 837         video_id -- Video/playlist/item identifier (string)
 838
 839         Keyword arguments:
 840         note -- note printed before downloading (string)
 841         errnote -- note printed in case of an error (string)
 842         fatal -- flag denoting whether error should be considered fatal,
 843             i.e. whether it should cause ExtractionError to be raised,
 844             otherwise a warning will be reported and extraction continued
 845         encoding -- encoding for a page content decoding, guessed automatically
 846             when not explicitly specified
 847         data -- POST data (bytes)
 848         headers -- HTTP headers (dict)
 849         query -- URL query (dict)
 850         expected_status -- allows to accept failed HTTP requests (non 2xx
 851             status code) by explicitly specifying a set of accepted status
 852             codes. Can be any of the following entities:
 853                 - an integer type specifying an exact failed status code to
 854                   accept
 855                 - a list or a tuple of integer types specifying a list of
 856                   failed status codes to accept
 857                 - a callable accepting an actual failed status code and
 858                   returning True if it should be accepted
 859             Note that this argument does not affect success status codes (2xx)
 860             which are always accepted.
 861         """
 862
 863         # Strip hashes from the URL (#1038)
 864         if isinstance(url_or_request, str):
 865             url_or_request = url_or_request.partition('#')[0]
 866
 867         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 868         if urlh is False:
 869             assert not fatal
 870             return False
 871         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 872         return (content, urlh)
 873
 874     @staticmethod
 875     def _guess_encoding_from_content(content_type, webpage_bytes):
 876         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 877         if m:
 878             encoding = m.group(1)
 879         else:
 880             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 881                           webpage_bytes[:1024])
 882             if m:
 883                 encoding = m.group(1).decode('ascii')
 884             elif webpage_bytes.startswith(b'\xff\xfe'):
 885                 encoding = 'utf-16'
 886             else:
 887                 encoding = 'utf-8'
 888
 889         return encoding
 890
 891     def __check_blocked(self, content):
 892         first_block = content[:512]
 893         if ('<title>Access to this site is blocked</title>' in content
 894                 and 'Websense' in first_block):
 895             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 896             blocked_iframe = self._html_search_regex(
 897                 r'<iframe src="([^"]+)"', content,
 898                 'Websense information URL', default=None)
 899             if blocked_iframe:
 900                 msg += ' Visit %s for more details' % blocked_iframe
 901             raise ExtractorError(msg, expected=True)
 902         if '<title>The URL you requested has been blocked</title>' in first_block:
 903             msg = (
 904                 'Access to this webpage has been blocked by Indian censorship. '
 905                 'Use a VPN or proxy server (with --proxy) to route around it.')
 906             block_msg = self._html_search_regex(
 907                 r'</h1><p>(.*?)</p>',
 908                 content, 'block message', default=None)
 909             if block_msg:
 910                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 911             raise ExtractorError(msg, expected=True)
 912         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 913                 and 'blocklist.rkn.gov.ru' in content):
 914             raise ExtractorError(
 915                 'Access to this webpage has been blocked by decision of the Russian government. '
 916                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 917                 expected=True)
 918
 919     def _request_dump_filename(self, url, video_id):
 920         basen = f'{video_id}_{url}'
 921         trim_length = self.get_param('trim_file_name') or 240
 922         if len(basen) > trim_length:
 923             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 924             basen = basen[:trim_length - len(h)] + h
 925         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 926         # Working around MAX_PATH limitation on Windows (see
 927         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 928         if compat_os_name == 'nt':
 929             absfilepath = os.path.abspath(filename)
 930             if len(absfilepath) > 259:
 931                 filename = fR'\\?\{absfilepath}'
 932         return filename
 933
 934     def __decode_webpage(self, webpage_bytes, encoding, headers):
 935         if not encoding:
 936             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 937         try:
 938             return webpage_bytes.decode(encoding, 'replace')
 939         except LookupError:
 940             return webpage_bytes.decode('utf-8', 'replace')
 941
 942     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 943         webpage_bytes = urlh.read()
 944         if prefix is not None:
 945             webpage_bytes = prefix + webpage_bytes
 946         if self.get_param('dump_intermediate_pages', False):
 947             self.to_screen('Dumping request to ' + urlh.geturl())
 948             dump = base64.b64encode(webpage_bytes).decode('ascii')
 949             self._downloader.to_screen(dump)
 950         if self.get_param('write_pages'):
 951             filename = self._request_dump_filename(urlh.geturl(), video_id)
 952             self.to_screen(f'Saving request to {filename}')
 953             with open(filename, 'wb') as outf:
 954                 outf.write(webpage_bytes)
 955
 956         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 957         self.__check_blocked(content)
 958
 959         return content
 960
 961     def __print_error(self, errnote, fatal, video_id, err):
 962         if fatal:
 963             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 964         elif errnote:
 965             self.report_warning(f'{video_id}: {errnote}: {err}')
 966
 967     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 968         if transform_source:
 969             xml_string = transform_source(xml_string)
 970         try:
 971             return compat_etree_fromstring(xml_string.encode('utf-8'))
 972         except xml.etree.ElementTree.ParseError as ve:
 973             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 974
 975     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 976         try:
 977             return json.loads(
 978                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 979         except ValueError as ve:
 980             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 981
 982     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 983         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 984
    def __create_download_methods(name, parser, note, errnote, return_value):
        """Build a (_download_<name>_handle, _download_<name>) method pair.

        This is called while the class body is executed (hence no self/cls).

        @param name          Infix for the names of the generated methods
        @param parser        Name of the method used to parse the downloaded
                             content, or None to return the content unparsed
        @param note          Default `note` of the generated methods
        @param errnote       Default `errnote` of the generated methods
        @param return_value  Description of the return value, used in the
                             generated docstrings
        @returns             (download_handle, download_content) functions
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is only forwarded to the parser when it is False;
            # otherwise the parser uses its own default
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Downloads and parses; returns (parsed content, URL handle) or False
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Like download_handle but returns only the parsed content (or False)
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, try to read a previously dumped
            # response from disk first; fall back to downloading if that fails
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method being called is
            # _download_webpage_handle, which takes no transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Gives the generated function its final name and docstring
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1048
    # Download helpers generated by __create_download_methods. Each call
    # returns a (_download_<name>_handle, _download_<name>) pair; for 'webpage'
    # only the second (content-only) function is kept, as a private helper.
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1056
1057     def _download_webpage(
1058             self, url_or_request, video_id, note=None, errnote=None,
1059             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1060         """
1061         Return the data of the page as a string.
1062
1063         Keyword arguments:
1064         tries -- number of tries
1065         timeout -- sleep interval between tries
1066
1067         See _download_webpage_handle docstring for other arguments specification.
1068         """
1069
1070         R''' # NB: These are unused; should they be deprecated?
1071         if tries != 1:
1072             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1073         if timeout is NO_DEFAULT:
1074             timeout = 5
1075         else:
1076             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1077         '''
1078
1079         try_count = 0
1080         while True:
1081             try:
1082                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1083             except http.client.IncompleteRead as e:
1084                 try_count += 1
1085                 if try_count >= tries:
1086                     raise e
1087                 self._sleep(timeout, video_id)
1088
1089     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1090         idstr = format_field(video_id, None, '%s: ')
1091         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1092         if only_once:
1093             if f'WARNING: {msg}' in self._printed_messages:
1094                 return
1095             self._printed_messages.add(f'WARNING: {msg}')
1096         self._downloader.report_warning(msg, *args, **kwargs)
1097
1098     def to_screen(self, msg, *args, **kwargs):
1099         """Print msg to screen, prefixing it with '[ie_name]'"""
1100         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1101
1102     def write_debug(self, msg, *args, **kwargs):
1103         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1104
1105     def get_param(self, name, default=None, *args, **kwargs):
1106         if self._downloader:
1107             return self._downloader.params.get(name, default, *args, **kwargs)
1108         return default
1109
1110     def report_drm(self, video_id, partial=False):
1111         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1112
1113     def report_extraction(self, id_or_name):
1114         """Report information extraction."""
1115         self.to_screen('%s: Extracting information' % id_or_name)
1116
1117     def report_download_webpage(self, video_id):
1118         """Report webpage download."""
1119         self.to_screen('%s: Downloading webpage' % video_id)
1120
1121     def report_age_confirmation(self):
1122         """Report attempt to confirm age."""
1123         self.to_screen('Confirming age')
1124
1125     def report_login(self):
1126         """Report attempt to log in."""
1127         self.to_screen('Logging in')
1128
1129     def raise_login_required(
1130             self, msg='This video is only available for registered users',
1131             metadata_available=False, method=NO_DEFAULT):
1132         if metadata_available and (
1133                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1134             self.report_warning(msg)
1135             return
1136         msg += format_field(self._login_hint(method), None, '. %s')
1137         raise ExtractorError(msg, expected=True)
1138
1139     def raise_geo_restricted(
1140             self, msg='This video is not available from your location due to geo restriction',
1141             countries=None, metadata_available=False):
1142         if metadata_available and (
1143                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1144             self.report_warning(msg)
1145         else:
1146             raise GeoRestrictedError(msg, countries=countries)
1147
1148     def raise_no_formats(self, msg, expected=False, video_id=None):
1149         if expected and (
1150                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1151             self.report_warning(msg, video_id)
1152         elif isinstance(msg, ExtractorError):
1153             raise msg
1154         else:
1155             raise ExtractorError(msg, expected=expected, video_id=video_id)
1156
1157     # Methods for following #608
1158     @staticmethod
1159     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1160         """Returns a URL that points to a page that should be processed"""
1161         if ie is not None:
1162             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1163         if video_id is not None:
1164             kwargs['id'] = video_id
1165         if video_title is not None:
1166             kwargs['title'] = video_title
1167         return {
1168             **kwargs,
1169             '_type': 'url_transparent' if url_transparent else 'url',
1170             'url': url,
1171         }
1172
1173     @classmethod
1174     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1175                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1176         return cls.playlist_result(
1177             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1178             playlist_id, playlist_title, **kwargs)
1179
1180     @staticmethod
1181     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1182         """Returns a playlist"""
1183         if playlist_id:
1184             kwargs['id'] = playlist_id
1185         if playlist_title:
1186             kwargs['title'] = playlist_title
1187         if playlist_description is not None:
1188             kwargs['description'] = playlist_description
1189         return {
1190             **kwargs,
1191             '_type': 'multi_video' if multi_video else 'playlist',
1192             'entries': entries,
1193         }
1194
1195     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1196         """
1197         Perform a regex search on the given string, using a single or a list of
1198         patterns returning the first matching group.
1199         In case of failure return a default value or raise a WARNING or a
1200         RegexNotFoundError, depending on fatal, specifying the field name.
1201         """
1202         if string is None:
1203             mobj = None
1204         elif isinstance(pattern, (str, re.Pattern)):
1205             mobj = re.search(pattern, string, flags)
1206         else:
1207             for p in pattern:
1208                 mobj = re.search(p, string, flags)
1209                 if mobj:
1210                     break
1211
1212         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1213
1214         if mobj:
1215             if group is None:
1216                 # return the first matching group
1217                 return next(g for g in mobj.groups() if g is not None)
1218             elif isinstance(group, (list, tuple)):
1219                 return tuple(mobj.group(g) for g in group)
1220             else:
1221                 return mobj.group(group)
1222         elif default is not NO_DEFAULT:
1223             return default
1224         elif fatal:
1225             raise RegexNotFoundError('Unable to extract %s' % _name)
1226         else:
1227             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1228             return None
1229
1230     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1231                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1232         """Searches string for the JSON object specified by start_pattern"""
1233         # NB: end_pattern is only used to reduce the size of the initial match
1234         if default is NO_DEFAULT:
1235             default, has_default = {}, False
1236         else:
1237             fatal, has_default = False, True
1238
1239         json_string = self._search_regex(
1240             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1241             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1242         if not json_string:
1243             return default
1244
1245         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1246         try:
1247             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1248         except ExtractorError as e:
1249             if fatal:
1250                 raise ExtractorError(
1251                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1252             elif not has_default:
1253                 self.report_warning(
1254                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1255         return default
1256
1257     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1258         """
1259         Like _search_regex, but strips HTML tags and unescapes entities.
1260         """
1261         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1262         if res:
1263             return clean_html(res).strip()
1264         else:
1265             return res
1266
1267     def _get_netrc_login_info(self, netrc_machine=None):
1268         username = None
1269         password = None
1270         netrc_machine = netrc_machine or self._NETRC_MACHINE
1271
1272         if self.get_param('usenetrc', False):
1273             try:
1274                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1275                 if os.path.isdir(netrc_file):
1276                     netrc_file = os.path.join(netrc_file, '.netrc')
1277                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1278                 if info is not None:
1279                     username = info[0]
1280                     password = info[2]
1281                 else:
1282                     raise netrc.NetrcParseError(
1283                         'No authenticators for %s' % netrc_machine)
1284             except (OSError, netrc.NetrcParseError) as err:
1285                 self.report_warning(
1286                     'parsing .netrc: %s' % error_to_compat_str(err))
1287
1288         return username, password
1289
1290     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1291         """
1292         Get the login info as (username, password)
1293         First look for the manually specified credentials using username_option
1294         and password_option as keys in params dictionary. If no such credentials
1295         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1296         value.
1297         If there's no info available, return (None, None)
1298         """
1299
1300         # Attempt to use provided username and password or .netrc data
1301         username = self.get_param(username_option)
1302         if username is not None:
1303             password = self.get_param(password_option)
1304         else:
1305             username, password = self._get_netrc_login_info(netrc_machine)
1306
1307         return username, password
1308
1309     def _get_tfa_info(self, note='two-factor verification code'):
1310         """
1311         Get the two-factor authentication info
1312         TODO - asking the user will be required for sms/phone verify
1313         currently just uses the command line option
1314         If there's no info available, return None
1315         """
1316
1317         tfa = self.get_param('twofactor')
1318         if tfa is not None:
1319             return tfa
1320
1321         return getpass.getpass('Type %s and press [Return]: ' % note)
1322
1323     # Helper functions for extracting OpenGraph info
1324     @staticmethod
1325     def _og_regexes(prop):
1326         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1327         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1328                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1329         template = r'<meta[^>]+?%s[^>]+?%s'
1330         return [
1331             template % (property_re, content_re),
1332             template % (content_re, property_re),
1333         ]
1334
1335     @staticmethod
1336     def _meta_regex(prop):
1337         return r'''(?isx)<meta
1338                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1339                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1340
1341     def _og_search_property(self, prop, html, name=None, **kargs):
1342         prop = variadic(prop)
1343         if name is None:
1344             name = 'OpenGraph %s' % prop[0]
1345         og_regexes = []
1346         for p in prop:
1347             og_regexes.extend(self._og_regexes(p))
1348         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1349         if escaped is None:
1350             return None
1351         return unescapeHTML(escaped)
1352
1353     def _og_search_thumbnail(self, html, **kargs):
1354         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1355
1356     def _og_search_description(self, html, **kargs):
1357         return self._og_search_property('description', html, fatal=False, **kargs)
1358
1359     def _og_search_title(self, html, *, fatal=False, **kargs):
1360         return self._og_search_property('title', html, fatal=fatal, **kargs)
1361
1362     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1363         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1364         if secure:
1365             regexes = self._og_regexes('video:secure_url') + regexes
1366         return self._html_search_regex(regexes, html, name, **kargs)
1367
1368     def _og_search_url(self, html, **kargs):
1369         return self._og_search_property('url', html, **kargs)
1370
1371     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1372         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1373
1374     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1375         name = variadic(name)
1376         if display_name is None:
1377             display_name = name[0]
1378         return self._html_search_regex(
1379             [self._meta_regex(n) for n in name],
1380             html, display_name, fatal=fatal, group='content', **kwargs)
1381
1382     def _dc_search_uploader(self, html):
1383         return self._html_search_meta('dc.creator', html, 'uploader')
1384
1385     @staticmethod
1386     def _rta_search(html):
1387         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1388         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1389                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1390                      html):
1391             return 18
1392
1393         # And then there are the jokers who advertise that they use RTA, but actually don't.
1394         AGE_LIMIT_MARKERS = [
1395             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1396         ]
1397         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1398             return 18
1399         return 0
1400
1401     def _media_rating_search(self, html):
1402         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1403         rating = self._html_search_meta('rating', html)
1404
1405         if not rating:
1406             return None
1407
1408         RATING_TABLE = {
1409             'safe for kids': 0,
1410             'general': 8,
1411             '14 years': 14,
1412             'mature': 17,
1413             'restricted': 19,
1414         }
1415         return RATING_TABLE.get(rating.lower())
1416
1417     def _family_friendly_search(self, html):
1418         # See http://schema.org/VideoObject
1419         family_friendly = self._html_search_meta(
1420             'isFamilyFriendly', html, default=None)
1421
1422         if not family_friendly:
1423             return None
1424
1425         RATING_TABLE = {
1426             '1': 0,
1427             'true': 0,
1428             '0': 18,
1429             'false': 18,
1430         }
1431         return RATING_TABLE.get(family_friendly.lower())
1432
1433     def _twitter_search_player(self, html):
1434         return self._html_search_meta('twitter:player', html,
1435                                       'twitter card player')
1436
1437     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1438         """Yield all json ld objects in the html"""
1439         if default is not NO_DEFAULT:
1440             fatal = False
1441         for mobj in re.finditer(JSON_LD_RE, html):
1442             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1443             for json_ld in variadic(json_ld_item):
1444                 if isinstance(json_ld, dict):
1445                     yield json_ld
1446
1447     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1448         """Search for a video in any json ld in the html"""
1449         if default is not NO_DEFAULT:
1450             fatal = False
1451         info = self._json_ld(
1452             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1453             video_id, fatal=fatal, expected_type=expected_type)
1454         if info:
1455             return info
1456         if default is not NO_DEFAULT:
1457             return default
1458         elif fatal:
1459             raise RegexNotFoundError('Unable to extract JSON-LD')
1460         else:
1461             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1462             return {}
1463
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Build an info dict from JSON-LD data.

        json_ld may be a JSON string (parsed with _parse_json), a dict, or a
        list/tuple of entries. Falsy input and any other type give {}.
        When expected_type is given, entries whose '@type' does not contain
        it are skipped and traversal stops after the first entry that is
        processed. The collected info is passed through filter_dict before
        being returned.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        # Normalize a single object to a one-element list
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps the last path component of an interaction type to the prefix
        # of the '<prefix>_count' key stored in info
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # True if any of expected_types is contained in e's '@type'
            # ('@type' may be a single value or a collection of values)
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # 'interactionType' is either given directly or as an object with '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill info['<kind>_count'] from e's 'interactionStatistic'
            # (a single counter object or a list of them)
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up in the map
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # A count that is already set is not overwritten
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build info['chapters'] from the 'Clip' parts listed in e's 'hasPart'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # zip() stops at the shortest input (chapters[1:]), so this loop
            # visits every chapter except the last one. Falsy times are
            # filled in from the neighbouring chapters
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter ends at the video duration unless it has its own end
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Overwrite the video-level fields of info with the values of e,
            # then add interaction counts and chapters
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in json_ld:
                # Top-level entries without '@context' are ignored
                if at_top_level and '@context' not in e:
                    continue
                # An entry consisting only of '@context' and '@graph' is a
                # container: process its graph items instead and stop
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title if none is set yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Use the first embedded video of the article, if any
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type every entry is processed;
                    # with one, stop after the first processed entry
                    if expected_type is None:
                        continue
                    else:
                        break
                # A 'video' object nested in any other entry is extracted as well
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        # NOTE(review): filter_dict presumably drops the keys whose value is None - confirm
        return filter_dict(info)
1635
1636     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1637         return self._parse_json(
1638             self._search_regex(
1639                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1640                 webpage, 'next.js data', fatal=fatal, **kw),
1641             video_id, transform_source=transform_source, fatal=fatal)
1642
1643     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1644         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1645         rectx = re.escape(context_name)
1646         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1647         js, arg_keys, arg_vals = self._search_regex(
1648             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1649             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1650
1651         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1652
1653         for key, val in args.items():
1654             if val in ('undefined', 'void 0'):
1655                 args[key] = 'null'
1656
1657         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1658         return traverse_obj(ret, traverse) or {}
1659
1660     @staticmethod
1661     def _hidden_inputs(html):
1662         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1663         hidden_inputs = {}
1664         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1665             attrs = extract_attributes(input)
1666             if not input:
1667                 continue
1668             if attrs.get('type') not in ('hidden', 'submit'):
1669                 continue
1670             name = attrs.get('name') or attrs.get('id')
1671             value = attrs.get('value')
1672             if name and value is not None:
1673                 hidden_inputs[name] = value
1674         return hidden_inputs
1675
1676     def _form_hidden_inputs(self, form_id, html):
1677         form = self._search_regex(
1678             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1679             html, '%s form' % form_id, group='form')
1680         return self._hidden_inputs(form)
1681
1682     class FormatSort:
1683         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1684
1685         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1686                    'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
1687                    'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1688         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1689                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1690                         'fps', 'fs_approx', 'source', 'id')
1691
1692         settings = {
1693             'vcodec': {'type': 'ordered', 'regex': True,
1694                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1695             'acodec': {'type': 'ordered', 'regex': True,
1696                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1697             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1698                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1699             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1700                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1701             'vext': {'type': 'ordered', 'field': 'video_ext',
1702                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1703                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1704             'aext': {'type': 'ordered', 'field': 'audio_ext',
1705                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1706                      'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
1707             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1708             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1709                            'field': ('vcodec', 'acodec'),
1710                            'function': lambda it: int(any(v != 'none' for v in it))},
1711             'ie_pref': {'priority': True, 'type': 'extractor'},
1712             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1713             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1714             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1715             'quality': {'convert': 'float', 'default': -1},
1716             'filesize': {'convert': 'bytes'},
1717             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1718             'id': {'convert': 'string', 'field': 'format_id'},
1719             'height': {'convert': 'float_none'},
1720             'width': {'convert': 'float_none'},
1721             'fps': {'convert': 'float_none'},
1722             'channels': {'convert': 'float_none', 'field': 'audio_channels'},
1723             'tbr': {'convert': 'float_none'},
1724             'vbr': {'convert': 'float_none'},
1725             'abr': {'convert': 'float_none'},
1726             'asr': {'convert': 'float_none'},
1727             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1728
1729             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1730             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1731             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1732             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1733             'res': {'type': 'multiple', 'field': ('height', 'width'),
1734                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1735
1736             # Actual field names
1737             'format_id': {'type': 'alias', 'field': 'id'},
1738             'preference': {'type': 'alias', 'field': 'ie_pref'},
1739             'language_preference': {'type': 'alias', 'field': 'lang'},
1740             'source_preference': {'type': 'alias', 'field': 'source'},
1741             'protocol': {'type': 'alias', 'field': 'proto'},
1742             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1743             'audio_channels': {'type': 'alias', 'field': 'channels'},
1744
1745             # Deprecated
1746             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1747             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1748             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1749             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1750             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1751             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1752             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1753             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1754             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1755             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1756             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1757             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1758             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1759             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1760             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1761             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1762             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1763             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1764             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1765             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1766         }
1767
1768         def __init__(self, ie, field_preference):
1769             self._order = []
1770             self.ydl = ie._downloader
1771             self.evaluate_params(self.ydl.params, field_preference)
1772             if ie.get_param('verbose'):
1773                 self.print_verbose_info(self.ydl.write_debug)
1774
1775         def _get_field_setting(self, field, key):
1776             if field not in self.settings:
1777                 if key in ('forced', 'priority'):
1778                     return False
1779                 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
1780                                             'deprecated and may be removed in a future version')
1781                 self.settings[field] = {}
1782             propObj = self.settings[field]
1783             if key not in propObj:
1784                 type = propObj.get('type')
1785                 if key == 'field':
1786                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1787                 elif key == 'convert':
1788                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1789                 else:
1790                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1791                 propObj[key] = default
1792             return propObj[key]
1793
1794         def _resolve_field_value(self, field, value, convertNone=False):
1795             if value is None:
1796                 if not convertNone:
1797                     return None
1798             else:
1799                 value = value.lower()
1800             conversion = self._get_field_setting(field, 'convert')
1801             if conversion == 'ignore':
1802                 return None
1803             if conversion == 'string':
1804                 return value
1805             elif conversion == 'float_none':
1806                 return float_or_none(value)
1807             elif conversion == 'bytes':
1808                 return FileDownloader.parse_bytes(value)
1809             elif conversion == 'order':
1810                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1811                 use_regex = self._get_field_setting(field, 'regex')
1812                 list_length = len(order_list)
1813                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1814                 if use_regex and value is not None:
1815                     for i, regex in enumerate(order_list):
1816                         if regex and re.match(regex, value):
1817                             return list_length - i
1818                     return list_length - empty_pos  # not in list
1819                 else:  # not regex or  value = None
1820                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1821             else:
1822                 if value.isnumeric():
1823                     return float(value)
1824                 else:
1825                     self.settings[field]['convert'] = 'string'
1826                     return value
1827
1828         def evaluate_params(self, params, sort_extractor):
1829             self._use_free_order = params.get('prefer_free_formats', False)
1830             self._sort_user = params.get('format_sort', [])
1831             self._sort_extractor = sort_extractor
1832
1833             def add_item(field, reverse, closest, limit_text):
1834                 field = field.lower()
1835                 if field in self._order:
1836                     return
1837                 self._order.append(field)
1838                 limit = self._resolve_field_value(field, limit_text)
1839                 data = {
1840                     'reverse': reverse,
1841                     'closest': False if limit is None else closest,
1842                     'limit_text': limit_text,
1843                     'limit': limit}
1844                 if field in self.settings:
1845                     self.settings[field].update(data)
1846                 else:
1847                     self.settings[field] = data
1848
1849             sort_list = (
1850                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1851                 + (tuple() if params.get('format_sort_force', False)
1852                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1853                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1854
1855             for item in sort_list:
1856                 match = re.match(self.regex, item)
1857                 if match is None:
1858                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1859                 field = match.group('field')
1860                 if field is None:
1861                     continue
1862                 if self._get_field_setting(field, 'type') == 'alias':
1863                     alias, field = field, self._get_field_setting(field, 'field')
1864                     if self._get_field_setting(alias, 'deprecated'):
1865                         self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
1866                                                     f'be removed in a future version. Please use {field} instead')
1867                 reverse = match.group('reverse') is not None
1868                 closest = match.group('separator') == '~'
1869                 limit_text = match.group('limit')
1870
1871                 has_limit = limit_text is not None
1872                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1873                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1874
1875                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1876                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1877                 limit_count = len(limits)
1878                 for (i, f) in enumerate(fields):
1879                     add_item(f, reverse, closest,
1880                              limits[i] if i < limit_count
1881                              else limits[0] if has_limit and not has_multiple_limits
1882                              else None)
1883
1884         def print_verbose_info(self, write_debug):
1885             if self._sort_user:
1886                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1887             if self._sort_extractor:
1888                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1889             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1890                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1891                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1892                               self._get_field_setting(field, 'limit_text'),
1893                               self._get_field_setting(field, 'limit'))
1894                 if self._get_field_setting(field, 'limit_text') is not None else '')
1895                 for field in self._order if self._get_field_setting(field, 'visible')]))
1896
1897         def _calculate_field_preference_from_value(self, format, field, type, value):
1898             reverse = self._get_field_setting(field, 'reverse')
1899             closest = self._get_field_setting(field, 'closest')
1900             limit = self._get_field_setting(field, 'limit')
1901
1902             if type == 'extractor':
1903                 maximum = self._get_field_setting(field, 'max')
1904                 if value is None or (maximum is not None and value >= maximum):
1905                     value = -1
1906             elif type == 'boolean':
1907                 in_list = self._get_field_setting(field, 'in_list')
1908                 not_in_list = self._get_field_setting(field, 'not_in_list')
1909                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1910             elif type == 'ordered':
1911                 value = self._resolve_field_value(field, value, True)
1912
1913             # try to convert to number
1914             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1915             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1916             if is_num:
1917                 value = val_num
1918
1919             return ((-10, 0) if value is None
1920                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1921                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1922                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1923                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1924                     else (-1, value, 0))
1925
1926         def _calculate_field_preference(self, format, field):
1927             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1928             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1929             if type == 'multiple':
1930                 type = 'field'  # Only 'field' is allowed in multiple for now
1931                 actual_fields = self._get_field_setting(field, 'field')
1932
1933                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1934             else:
1935                 value = get_value(field)
1936             return self._calculate_field_preference_from_value(format, field, type, value)
1937
1938         def calculate_preference(self, format):
1939             # Determine missing protocol
1940             if not format.get('protocol'):
1941                 format['protocol'] = determine_protocol(format)
1942
1943             # Determine missing ext
1944             if not format.get('ext') and 'url' in format:
1945                 format['ext'] = determine_ext(format['url'])
1946             if format.get('vcodec') == 'none':
1947                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1948                 format['video_ext'] = 'none'
1949             else:
1950                 format['video_ext'] = format['ext']
1951                 format['audio_ext'] = 'none'
1952             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1953             #    format['preference'] = -1000
1954
1955             # Determine missing bitrates
1956             if format.get('tbr') is None:
1957                 if format.get('vbr') is not None and format.get('abr') is not None:
1958                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1959             else:
1960                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1961                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1962                 if format.get('acodec') != 'none' and format.get('abr') is None:
1963                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1964
1965             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1966
1967     def _sort_formats(self, formats, field_preference=[]):
1968         if not formats:
1969             return
1970         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1971
1972     def _check_formats(self, formats, video_id):
1973         if formats:
1974             formats[:] = filter(
1975                 lambda f: self._is_valid_url(
1976                     f['url'], video_id,
1977                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1978                 formats)
1979
1980     @staticmethod
1981     def _remove_duplicate_formats(formats):
1982         format_urls = set()
1983         unique_formats = []
1984         for f in formats:
1985             if f['url'] not in format_urls:
1986                 format_urls.add(f['url'])
1987                 unique_formats.append(f)
1988         formats[:] = unique_formats
1989
1990     def _is_valid_url(self, url, video_id, item='video', headers={}):
1991         url = self._proto_relative_url(url, scheme='http:')
1992         # For now assume non HTTP(S) URLs always valid
1993         if not (url.startswith('http://') or url.startswith('https://')):
1994             return True
1995         try:
1996             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1997             return True
1998         except ExtractorError as e:
1999             self.to_screen(
2000                 '%s: %s URL is invalid, skipping: %s'
2001                 % (video_id, item, error_to_compat_str(e.cause)))
2002             return False
2003
2004     def http_scheme(self):
2005         """ Either "http:" or "https:", depending on the user's preferences """
2006         return (
2007             'http:'
2008             if self.get_param('prefer_insecure', False)
2009             else 'https:')
2010
2011     def _proto_relative_url(self, url, scheme=None):
2012         scheme = scheme or self.http_scheme()
2013         assert scheme.endswith(':')
2014         return sanitize_url(url, scheme=scheme[:-1])
2015
2016     def _sleep(self, timeout, video_id, msg_template=None):
2017         if msg_template is None:
2018             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2019         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2020         self.to_screen(msg)
2021         time.sleep(timeout)
2022
2023     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2024                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2025                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2026         res = self._download_xml_handle(
2027             manifest_url, video_id, 'Downloading f4m manifest',
2028             'Unable to download f4m manifest',
2029             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2030             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2031             transform_source=transform_source,
2032             fatal=fatal, data=data, headers=headers, query=query)
2033         if res is False:
2034             return []
2035
2036         manifest, urlh = res
2037         manifest_url = urlh.geturl()
2038
2039         return self._parse_f4m_formats(
2040             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2041             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2042
2043     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2044                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2045                            fatal=True, m3u8_id=None):
2046         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2047             return []
2048
2049         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2050         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2051         if akamai_pv is not None and ';' in akamai_pv.text:
2052             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2053             if playerVerificationChallenge.strip() != '':
2054                 return []
2055
2056         formats = []
2057         manifest_version = '1.0'
2058         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2059         if not media_nodes:
2060             manifest_version = '2.0'
2061             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2062         # Remove unsupported DRM protected media from final formats
2063         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2064         media_nodes = remove_encrypted_media(media_nodes)
2065         if not media_nodes:
2066             return formats
2067
2068         manifest_base_url = get_base_url(manifest)
2069
2070         bootstrap_info = xpath_element(
2071             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2072             'bootstrap info', default=None)
2073
2074         vcodec = None
2075         mime_type = xpath_text(
2076             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2077             'base URL', default=None)
2078         if mime_type and mime_type.startswith('audio/'):
2079             vcodec = 'none'
2080
2081         for i, media_el in enumerate(media_nodes):
2082             tbr = int_or_none(media_el.attrib.get('bitrate'))
2083             width = int_or_none(media_el.attrib.get('width'))
2084             height = int_or_none(media_el.attrib.get('height'))
2085             format_id = join_nonempty(f4m_id, tbr or i)
2086             # If <bootstrapInfo> is present, the specified f4m is a
2087             # stream-level manifest, and only set-level manifests may refer to
2088             # external resources.  See section 11.4 and section 4 of F4M spec
2089             if bootstrap_info is None:
2090                 media_url = None
2091                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2092                 if manifest_version == '2.0':
2093                     media_url = media_el.attrib.get('href')
2094                 if media_url is None:
2095                     media_url = media_el.attrib.get('url')
2096                 if not media_url:
2097                     continue
2098                 manifest_url = (
2099                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2100                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2101                 # If media_url is itself a f4m manifest do the recursive extraction
2102                 # since bitrates in parent manifest (this one) and media_url manifest
2103                 # may differ leading to inability to resolve the format by requested
2104                 # bitrate in f4m downloader
2105                 ext = determine_ext(manifest_url)
2106                 if ext == 'f4m':
2107                     f4m_formats = self._extract_f4m_formats(
2108                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2109                         transform_source=transform_source, fatal=fatal)
2110                     # Sometimes stream-level manifest contains single media entry that
2111                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2112                     # At the same time parent's media entry in set-level manifest may
2113                     # contain it. We will copy it from parent in such cases.
2114                     if len(f4m_formats) == 1:
2115                         f = f4m_formats[0]
2116                         f.update({
2117                             'tbr': f.get('tbr') or tbr,
2118                             'width': f.get('width') or width,
2119                             'height': f.get('height') or height,
2120                             'format_id': f.get('format_id') if not tbr else format_id,
2121                             'vcodec': vcodec,
2122                         })
2123                     formats.extend(f4m_formats)
2124                     continue
2125                 elif ext == 'm3u8':
2126                     formats.extend(self._extract_m3u8_formats(
2127                         manifest_url, video_id, 'mp4', preference=preference,
2128                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2129                     continue
2130             formats.append({
2131                 'format_id': format_id,
2132                 'url': manifest_url,
2133                 'manifest_url': manifest_url,
2134                 'ext': 'flv' if bootstrap_info is not None else None,
2135                 'protocol': 'f4m',
2136                 'tbr': tbr,
2137                 'width': width,
2138                 'height': height,
2139                 'vcodec': vcodec,
2140                 'preference': preference,
2141                 'quality': quality,
2142             })
2143         return formats
2144
2145     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2146         return {
2147             'format_id': join_nonempty(m3u8_id, 'meta'),
2148             'url': m3u8_url,
2149             'ext': ext,
2150             'protocol': 'm3u8',
2151             'preference': preference - 100 if preference else -100,
2152             'quality': quality,
2153             'resolution': 'multiple',
2154             'format_note': 'Quality selection URL',
2155         }
2156
2157     def _report_ignoring_subs(self, name):
2158         self.report_warning(bug_reports_message(
2159             f'Ignoring subtitle tracks found in the {name} manifest; '
2160             'if any subtitle tracks are missing,'
2161         ), only_once=True)
2162
2163     def _extract_m3u8_formats(self, *args, **kwargs):
2164         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2165         if subs:
2166             self._report_ignoring_subs('HLS')
2167         return fmts
2168
2169     def _extract_m3u8_formats_and_subtitles(
2170             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2171             preference=None, quality=None, m3u8_id=None, note=None,
2172             errnote=None, fatal=True, live=False, data=None, headers={},
2173             query={}):
2174
2175         res = self._download_webpage_handle(
2176             m3u8_url, video_id,
2177             note='Downloading m3u8 information' if note is None else note,
2178             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2179             fatal=fatal, data=data, headers=headers, query=query)
2180
2181         if res is False:
2182             return [], {}
2183
2184         m3u8_doc, urlh = res
2185         m3u8_url = urlh.geturl()
2186
2187         return self._parse_m3u8_formats_and_subtitles(
2188             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2189             preference=preference, quality=quality, m3u8_id=m3u8_id,
2190             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2191             headers=headers, query=query, video_id=video_id)
2192
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse the text of an m3u8 playlist into formats and subtitles.

        A document containing '#EXT-X-TARGETDURATION' is treated as a media
        playlist and returned as is (one format per playlist index, no
        subtitles). Anything else is treated as a master playlist:
        EXT-X-MEDIA tags are collected first, then one format is built for
        every URI line using the attributes of the preceding
        EXT-X-STREAM-INF tag.

        @param m3u8_doc        The playlist text
        @param m3u8_url        URL of the playlist; relative URIs are joined
                               onto it. Without it, a media playlist is
                               exposed as a data URI
        @param ext             'ext' of the formats; when falsy, master
                               playlist entries get 'm4a' or 'mp4'
        @param entry_protocol  'protocol' of the formats
        @param preference      'preference' of the formats
        @param quality         'quality' of the formats
        @param m3u8_id         Prefix for the generated format ids
        @param live            When true, neither the stream name nor the
                               bitrate is made part of the format id
        @param fatal, data, headers, video_id
                               Used only for the extra playlist downloads done
                               when the 'hls_split_discontinuity' param is set
        @param note, errnote, query
                               Accepted but not used in this method's body
        @returns               A `(formats, subtitles)` pair
        """
        formats, subtitles = [], {}

        # NB: this is the re.Match object (or None), not a bool
        has_drm = re.search('|'.join([
            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
        ]), m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are kept, everything else is joined onto m3u8_url
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        # _extract_m3u8_playlist_indices yields the 'format_index' values to emit
        # for a playlist: one per discontinuity-delimited section when splitting
        # is enabled, a single None otherwise
        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; shared with
        # build_stream_name() and reset after each URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Registers one EXT-X-MEDIA tag in `groups` and, depending on its
            # TYPE, adds a subtitle entry or audio/video rendition formats
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-blank line is taken as the URI of a variant stream
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Also expose the progressive URI as a plain http copy of the format
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                # The stream attributes apply to this URI line only
                last_stream_inf = {}
        return formats, subtitles
2413
2414     def _extract_m3u8_vod_duration(
2415             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2416
2417         m3u8_vod = self._download_webpage(
2418             m3u8_vod_url, video_id,
2419             note='Downloading m3u8 VOD manifest' if note is None else note,
2420             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2421             fatal=False, data=data, headers=headers, query=query)
2422
2423         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2424
2425     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2426         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2427             return None
2428
2429         return int(sum(
2430             float(line[len('#EXTINF:'):].split(',')[0])
2431             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2432
2433     @staticmethod
2434     def _xpath_ns(path, namespace=None):
2435         if not namespace:
2436             return path
2437         out = []
2438         for c in path.split('/'):
2439             if not c or c == '.':
2440                 out.append(c)
2441             else:
2442                 out.append('{%s}%s' % (namespace, c))
2443         return '/'.join(out)
2444
2445     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2446         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2447         if res is False:
2448             assert not fatal
2449             return [], {}
2450
2451         smil, urlh = res
2452         smil_url = urlh.geturl()
2453
2454         namespace = self._parse_smil_namespace(smil)
2455
2456         fmts = self._parse_smil_formats(
2457             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2458         subs = self._parse_smil_subtitles(
2459             smil, namespace=namespace)
2460
2461         return fmts, subs
2462
2463     def _extract_smil_formats(self, *args, **kwargs):
2464         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2465         if subs:
2466             self._report_ignoring_subs('SMIL')
2467         return fmts
2468
2469     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2470         res = self._download_smil(smil_url, video_id, fatal=fatal)
2471         if res is False:
2472             return {}
2473
2474         smil, urlh = res
2475         smil_url = urlh.geturl()
2476
2477         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2478
2479     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2480         return self._download_xml_handle(
2481             smil_url, video_id, 'Downloading SMIL file',
2482             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2483
2484     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2485         namespace = self._parse_smil_namespace(smil)
2486
2487         formats = self._parse_smil_formats(
2488             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2489         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2490
2491         video_id = os.path.splitext(url_basename(smil_url))[0]
2492         title = None
2493         description = None
2494         upload_date = None
2495         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2496             name = meta.attrib.get('name')
2497             content = meta.attrib.get('content')
2498             if not name or not content:
2499                 continue
2500             if not title and name == 'title':
2501                 title = content
2502             elif not description and name in ('description', 'abstract'):
2503                 description = content
2504             elif not upload_date and name == 'date':
2505                 upload_date = unified_strdate(content)
2506
2507         thumbnails = [{
2508             'id': image.get('type'),
2509             'url': image.get('src'),
2510             'width': int_or_none(image.get('width')),
2511             'height': int_or_none(image.get('height')),
2512         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2513
2514         return {
2515             'id': video_id,
2516             'title': title or video_id,
2517             'description': description,
2518             'upload_date': upload_date,
2519             'thumbnails': thumbnails,
2520             'formats': formats,
2521             'subtitles': subtitles,
2522         }
2523
2524     def _parse_smil_namespace(self, smil):
2525         return self._search_regex(
2526             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2527
2528     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2529         base = smil_url
2530         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2531             b = meta.get('base') or meta.get('httpBase')
2532             if b:
2533                 base = b
2534                 break
2535
2536         formats = []
2537         rtmp_count = 0
2538         http_count = 0
2539         m3u8_count = 0
2540         imgs_count = 0
2541
2542         srcs = set()
2543         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2544         for medium in media:
2545             src = medium.get('src')
2546             if not src or src in srcs:
2547                 continue
2548             srcs.add(src)
2549
2550             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2551             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2552             width = int_or_none(medium.get('width'))
2553             height = int_or_none(medium.get('height'))
2554             proto = medium.get('proto')
2555             ext = medium.get('ext')
2556             src_ext = determine_ext(src)
2557             streamer = medium.get('streamer') or base
2558
2559             if proto == 'rtmp' or streamer.startswith('rtmp'):
2560                 rtmp_count += 1
2561                 formats.append({
2562                     'url': streamer,
2563                     'play_path': src,
2564                     'ext': 'flv',
2565                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2566                     'tbr': bitrate,
2567                     'filesize': filesize,
2568                     'width': width,
2569                     'height': height,
2570                 })
2571                 if transform_rtmp_url:
2572                     streamer, src = transform_rtmp_url(streamer, src)
2573                     formats[-1].update({
2574                         'url': streamer,
2575                         'play_path': src,
2576                     })
2577                 continue
2578
2579             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2580             src_url = src_url.strip()
2581
2582             if proto == 'm3u8' or src_ext == 'm3u8':
2583                 m3u8_formats = self._extract_m3u8_formats(
2584                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2585                 if len(m3u8_formats) == 1:
2586                     m3u8_count += 1
2587                     m3u8_formats[0].update({
2588                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2589                         'tbr': bitrate,
2590                         'width': width,
2591                         'height': height,
2592                     })
2593                 formats.extend(m3u8_formats)
2594             elif src_ext == 'f4m':
2595                 f4m_url = src_url
2596                 if not f4m_params:
2597                     f4m_params = {
2598                         'hdcore': '3.2.0',
2599                         'plugin': 'flowplayer-3.2.0.1',
2600                     }
2601                 f4m_url += '&' if '?' in f4m_url else '?'
2602                 f4m_url += urllib.parse.urlencode(f4m_params)
2603                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2604             elif src_ext == 'mpd':
2605                 formats.extend(self._extract_mpd_formats(
2606                     src_url, video_id, mpd_id='dash', fatal=False))
2607             elif re.search(r'\.ism/[Mm]anifest', src_url):
2608                 formats.extend(self._extract_ism_formats(
2609                     src_url, video_id, ism_id='mss', fatal=False))
2610             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2611                 http_count += 1
2612                 formats.append({
2613                     'url': src_url,
2614                     'ext': ext or src_ext or 'flv',
2615                     'format_id': 'http-%d' % (bitrate or http_count),
2616                     'tbr': bitrate,
2617                     'filesize': filesize,
2618                     'width': width,
2619                     'height': height,
2620                 })
2621
2622         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2623             src = medium.get('src')
2624             if not src or src in srcs:
2625                 continue
2626             srcs.add(src)
2627
2628             imgs_count += 1
2629             formats.append({
2630                 'format_id': 'imagestream-%d' % (imgs_count),
2631                 'url': src,
2632                 'ext': mimetype2ext(medium.get('type')),
2633                 'acodec': 'none',
2634                 'vcodec': 'none',
2635                 'width': int_or_none(medium.get('width')),
2636                 'height': int_or_none(medium.get('height')),
2637                 'format_note': 'SMIL storyboards',
2638             })
2639
2640         return formats
2641
2642     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2643         urls = []
2644         subtitles = {}
2645         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2646             src = textstream.get('src')
2647             if not src or src in urls:
2648                 continue
2649             urls.append(src)
2650             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2651             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2652             subtitles.setdefault(lang, []).append({
2653                 'url': src,
2654                 'ext': ext,
2655             })
2656         return subtitles
2657
2658     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2659         res = self._download_xml_handle(
2660             xspf_url, playlist_id, 'Downloading xpsf playlist',
2661             'Unable to download xspf manifest', fatal=fatal)
2662         if res is False:
2663             return []
2664
2665         xspf, urlh = res
2666         xspf_url = urlh.geturl()
2667
2668         return self._parse_xspf(
2669             xspf, playlist_id, xspf_url=xspf_url,
2670             xspf_base_url=base_url(xspf_url))
2671
2672     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2673         NS_MAP = {
2674             'xspf': 'http://xspf.org/ns/0/',
2675             's1': 'http://static.streamone.nl/player/ns/0',
2676         }
2677
2678         entries = []
2679         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2680             title = xpath_text(
2681                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2682             description = xpath_text(
2683                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2684             thumbnail = xpath_text(
2685                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2686             duration = float_or_none(
2687                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2688
2689             formats = []
2690             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2691                 format_url = urljoin(xspf_base_url, location.text)
2692                 if not format_url:
2693                     continue
2694                 formats.append({
2695                     'url': format_url,
2696                     'manifest_url': xspf_url,
2697                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2698                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2699                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2700                 })
2701             self._sort_formats(formats)
2702
2703             entries.append({
2704                 'id': playlist_id,
2705                 'title': title,
2706                 'description': description,
2707                 'thumbnail': thumbnail,
2708                 'duration': duration,
2709                 'formats': formats,
2710             })
2711         return entries
2712
2713     def _extract_mpd_formats(self, *args, **kwargs):
2714         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2715         if subs:
2716             self._report_ignoring_subs('DASH')
2717         return fmts
2718
2719     def _extract_mpd_formats_and_subtitles(
2720             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2721             fatal=True, data=None, headers={}, query={}):
2722         res = self._download_xml_handle(
2723             mpd_url, video_id,
2724             note='Downloading MPD manifest' if note is None else note,
2725             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2726             fatal=fatal, data=data, headers=headers, query=query)
2727         if res is False:
2728             return [], {}
2729         mpd_doc, urlh = res
2730         if mpd_doc is None:
2731             return [], {}
2732
2733         # We could have been redirected to a new url when we retrieved our mpd file.
2734         mpd_url = urlh.geturl()
2735         mpd_base_url = base_url(mpd_url)
2736
2737         return self._parse_mpd_formats_and_subtitles(
2738             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2739
2740     def _parse_mpd_formats(self, *args, **kwargs):
2741         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2742         if subs:
2743             self._report_ignoring_subs('DASH')
2744         return fmts
2745
2746     def _parse_mpd_formats_and_subtitles(
2747             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2748         """
2749         Parse formats from MPD manifest.
2750         References:
2751          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2752             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2753          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2754         """
2755         if not self.get_param('dynamic_mpd', True):
2756             if mpd_doc.get('type') == 'dynamic':
2757                 return [], {}
2758
2759         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2760
2761         def _add_ns(path):
2762             return self._xpath_ns(path, namespace)
2763
2764         def is_drm_protected(element):
2765             return element.find(_add_ns('ContentProtection')) is not None
2766
2767         def extract_multisegment_info(element, ms_parent_info):
2768             ms_info = ms_parent_info.copy()
2769
2770             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2771             # common attributes and elements.  We will only extract relevant
2772             # for us.
2773             def extract_common(source):
2774                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2775                 if segment_timeline is not None:
2776                     s_e = segment_timeline.findall(_add_ns('S'))
2777                     if s_e:
2778                         ms_info['total_number'] = 0
2779                         ms_info['s'] = []
2780                         for s in s_e:
2781                             r = int(s.get('r', 0))
2782                             ms_info['total_number'] += 1 + r
2783                             ms_info['s'].append({
2784                                 't': int(s.get('t', 0)),
2785                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2786                                 'd': int(s.attrib['d']),
2787                                 'r': r,
2788                             })
2789                 start_number = source.get('startNumber')
2790                 if start_number:
2791                     ms_info['start_number'] = int(start_number)
2792                 timescale = source.get('timescale')
2793                 if timescale:
2794                     ms_info['timescale'] = int(timescale)
2795                 segment_duration = source.get('duration')
2796                 if segment_duration:
2797                     ms_info['segment_duration'] = float(segment_duration)
2798
2799             def extract_Initialization(source):
2800                 initialization = source.find(_add_ns('Initialization'))
2801                 if initialization is not None:
2802                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2803
2804             segment_list = element.find(_add_ns('SegmentList'))
2805             if segment_list is not None:
2806                 extract_common(segment_list)
2807                 extract_Initialization(segment_list)
2808                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2809                 if segment_urls_e:
2810                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2811             else:
2812                 segment_template = element.find(_add_ns('SegmentTemplate'))
2813                 if segment_template is not None:
2814                     extract_common(segment_template)
2815                     media = segment_template.get('media')
2816                     if media:
2817                         ms_info['media'] = media
2818                     initialization = segment_template.get('initialization')
2819                     if initialization:
2820                         ms_info['initialization'] = initialization
2821                     else:
2822                         extract_Initialization(segment_template)
2823             return ms_info
2824
2825         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2826         formats, subtitles = [], {}
2827         stream_numbers = collections.defaultdict(int)
2828         for period in mpd_doc.findall(_add_ns('Period')):
2829             period_duration = parse_duration(period.get('duration')) or mpd_duration
2830             period_ms_info = extract_multisegment_info(period, {
2831                 'start_number': 1,
2832                 'timescale': 1,
2833             })
2834             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2835                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2836                 for representation in adaptation_set.findall(_add_ns('Representation')):
2837                     representation_attrib = adaptation_set.attrib.copy()
2838                     representation_attrib.update(representation.attrib)
2839                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2840                     mime_type = representation_attrib['mimeType']
2841                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2842
2843                     codec_str = representation_attrib.get('codecs', '')
2844                     # Some kind of binary subtitle found in some youtube livestreams
2845                     if mime_type == 'application/x-rawcc':
2846                         codecs = {'scodec': codec_str}
2847                     else:
2848                         codecs = parse_codecs(codec_str)
2849                     if content_type not in ('video', 'audio', 'text'):
2850                         if mime_type == 'image/jpeg':
2851                             content_type = mime_type
2852                         elif codecs.get('vcodec', 'none') != 'none':
2853                             content_type = 'video'
2854                         elif codecs.get('acodec', 'none') != 'none':
2855                             content_type = 'audio'
2856                         elif codecs.get('scodec', 'none') != 'none':
2857                             content_type = 'text'
2858                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2859                             content_type = 'text'
2860                         else:
2861                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2862                             continue
2863
2864                     base_url = ''
2865                     for element in (representation, adaptation_set, period, mpd_doc):
2866                         base_url_e = element.find(_add_ns('BaseURL'))
2867                         if try_call(lambda: base_url_e.text) is not None:
2868                             base_url = base_url_e.text + base_url
2869                             if re.match(r'^https?://', base_url):
2870                                 break
2871                     if mpd_base_url and base_url.startswith('/'):
2872                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2873                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2874                         if not mpd_base_url.endswith('/'):
2875                             mpd_base_url += '/'
2876                         base_url = mpd_base_url + base_url
2877                     representation_id = representation_attrib.get('id')
2878                     lang = representation_attrib.get('lang')
2879                     url_el = representation.find(_add_ns('BaseURL'))
2880                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2881                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2882                     if representation_id is not None:
2883                         format_id = representation_id
2884                     else:
2885                         format_id = content_type
2886                     if mpd_id:
2887                         format_id = mpd_id + '-' + format_id
2888                     if content_type in ('video', 'audio'):
2889                         f = {
2890                             'format_id': format_id,
2891                             'manifest_url': mpd_url,
2892                             'ext': mimetype2ext(mime_type),
2893                             'width': int_or_none(representation_attrib.get('width')),
2894                             'height': int_or_none(representation_attrib.get('height')),
2895                             'tbr': float_or_none(bandwidth, 1000),
2896                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2897                             'fps': int_or_none(representation_attrib.get('frameRate')),
2898                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2899                             'format_note': 'DASH %s' % content_type,
2900                             'filesize': filesize,
2901                             'container': mimetype2ext(mime_type) + '_dash',
2902                             **codecs
2903                         }
2904                     elif content_type == 'text':
2905                         f = {
2906                             'ext': mimetype2ext(mime_type),
2907                             'manifest_url': mpd_url,
2908                             'filesize': filesize,
2909                         }
2910                     elif content_type == 'image/jpeg':
2911                         # See test case in VikiIE
2912                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2913                         f = {
2914                             'format_id': format_id,
2915                             'ext': 'mhtml',
2916                             'manifest_url': mpd_url,
2917                             'format_note': 'DASH storyboards (jpeg)',
2918                             'acodec': 'none',
2919                             'vcodec': 'none',
2920                         }
2921                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2922                         f['has_drm'] = True
2923                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2924
2925                     def prepare_template(template_name, identifiers):
2926                         tmpl = representation_ms_info[template_name]
2927                         if representation_id is not None:
2928                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2929                         # First of, % characters outside $...$ templates
2930                         # must be escaped by doubling for proper processing
2931                         # by % operator string formatting used further (see
2932                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2933                         t = ''
2934                         in_template = False
2935                         for c in tmpl:
2936                             t += c
2937                             if c == '$':
2938                                 in_template = not in_template
2939                             elif c == '%' and not in_template:
2940                                 t += c
2941                         # Next, $...$ templates are translated to their
2942                         # %(...) counterparts to be used with % operator
2943                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2944                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2945                         t.replace('$$', '$')
2946                         return t
2947
2948                     # @initialization is a regular template like @media one
2949                     # so it should be handled just the same way (see
2950                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2951                     if 'initialization' in representation_ms_info:
2952                         initialization_template = prepare_template(
2953                             'initialization',
2954                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2955                             # $Time$ shall not be included for @initialization thus
2956                             # only $Bandwidth$ remains
2957                             ('Bandwidth', ))
2958                         representation_ms_info['initialization_url'] = initialization_template % {
2959                             'Bandwidth': bandwidth,
2960                         }
2961
2962                     def location_key(location):
2963                         return 'url' if re.match(r'^https?://', location) else 'path'
2964
2965                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2966
2967                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2968                         media_location_key = location_key(media_template)
2969
2970                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2971                         # can't be used at the same time
2972                         if '%(Number' in media_template and 's' not in representation_ms_info:
2973                             segment_duration = None
2974                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2975                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2976                                 representation_ms_info['total_number'] = int(math.ceil(
2977                                     float_or_none(period_duration, segment_duration, default=0)))
2978                             representation_ms_info['fragments'] = [{
2979                                 media_location_key: media_template % {
2980                                     'Number': segment_number,
2981                                     'Bandwidth': bandwidth,
2982                                 },
2983                                 'duration': segment_duration,
2984                             } for segment_number in range(
2985                                 representation_ms_info['start_number'],
2986                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2987                         else:
2988                             # $Number*$ or $Time$ in media template with S list available
2989                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2990                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2991                             representation_ms_info['fragments'] = []
2992                             segment_time = 0
2993                             segment_d = None
2994                             segment_number = representation_ms_info['start_number']
2995
2996                             def add_segment_url():
2997                                 segment_url = media_template % {
2998                                     'Time': segment_time,
2999                                     'Bandwidth': bandwidth,
3000                                     'Number': segment_number,
3001                                 }
3002                                 representation_ms_info['fragments'].append({
3003                                     media_location_key: segment_url,
3004                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
3005                                 })
3006
3007                             for num, s in enumerate(representation_ms_info['s']):
3008                                 segment_time = s.get('t') or segment_time
3009                                 segment_d = s['d']
3010                                 add_segment_url()
3011                                 segment_number += 1
3012                                 for r in range(s.get('r', 0)):
3013                                     segment_time += segment_d
3014                                     add_segment_url()
3015                                     segment_number += 1
3016                                 segment_time += segment_d
3017                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3018                         # No media template,
3019                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3020                         # or any YouTube dashsegments video
3021                         fragments = []
3022                         segment_index = 0
3023                         timescale = representation_ms_info['timescale']
3024                         for s in representation_ms_info['s']:
3025                             duration = float_or_none(s['d'], timescale)
3026                             for r in range(s.get('r', 0) + 1):
3027                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3028                                 fragments.append({
3029                                     location_key(segment_uri): segment_uri,
3030                                     'duration': duration,
3031                                 })
3032                                 segment_index += 1
3033                         representation_ms_info['fragments'] = fragments
3034                     elif 'segment_urls' in representation_ms_info:
3035                         # Segment URLs with no SegmentTimeline
3036                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3037                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3038                         fragments = []
3039                         segment_duration = float_or_none(
3040                             representation_ms_info['segment_duration'],
3041                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3042                         for segment_url in representation_ms_info['segment_urls']:
3043                             fragment = {
3044                                 location_key(segment_url): segment_url,
3045                             }
3046                             if segment_duration:
3047                                 fragment['duration'] = segment_duration
3048                             fragments.append(fragment)
3049                         representation_ms_info['fragments'] = fragments
3050                     # If there is a fragments key available then we correctly recognized fragmented media.
3051                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3052                     # assumption is not necessarily correct since we may simply have no support for
3053                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3054                     if 'fragments' in representation_ms_info:
3055                         f.update({
3056                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3057                             'url': mpd_url or base_url,
3058                             'fragment_base_url': base_url,
3059                             'fragments': [],
3060                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3061                         })
3062                         if 'initialization_url' in representation_ms_info:
3063                             initialization_url = representation_ms_info['initialization_url']
3064                             if not f.get('url'):
3065                                 f['url'] = initialization_url
3066                             f['fragments'].append({location_key(initialization_url): initialization_url})
3067                         f['fragments'].extend(representation_ms_info['fragments'])
3068                         if not period_duration:
3069                             period_duration = try_get(
3070                                 representation_ms_info,
3071                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3072                     else:
3073                         # Assuming direct URL to unfragmented media.
3074                         f['url'] = base_url
3075                     if content_type in ('video', 'audio', 'image/jpeg'):
3076                         f['manifest_stream_number'] = stream_numbers[f['url']]
3077                         stream_numbers[f['url']] += 1
3078                         formats.append(f)
3079                     elif content_type == 'text':
3080                         subtitles.setdefault(lang or 'und', []).append(f)
3081
3082         return formats, subtitles
3083
3084     def _extract_ism_formats(self, *args, **kwargs):
3085         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3086         if subs:
3087             self._report_ignoring_subs('ISM')
3088         return fmts
3089
3090     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3091         res = self._download_xml_handle(
3092             ism_url, video_id,
3093             note='Downloading ISM manifest' if note is None else note,
3094             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3095             fatal=fatal, data=data, headers=headers, query=query)
3096         if res is False:
3097             return [], {}
3098         ism_doc, urlh = res
3099         if ism_doc is None:
3100             return [], {}
3101
3102         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3103
3104     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3105         """
3106         Parse formats from ISM manifest.
3107         References:
3108          1. [MS-SSTR]: Smooth Streaming Protocol,
3109             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3110         """
3111         if ism_doc.get('IsLive') == 'TRUE':
3112             return [], {}
3113
3114         duration = int(ism_doc.attrib['Duration'])
3115         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3116
3117         formats = []
3118         subtitles = {}
3119         for stream in ism_doc.findall('StreamIndex'):
3120             stream_type = stream.get('Type')
3121             if stream_type not in ('video', 'audio', 'text'):
3122                 continue
3123             url_pattern = stream.attrib['Url']
3124             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3125             stream_name = stream.get('Name')
3126             stream_language = stream.get('Language', 'und')
3127             for track in stream.findall('QualityLevel'):
3128                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3129                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3130                 # TODO: add support for WVC1 and WMAP
3131                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3132                     self.report_warning('%s is not a supported codec' % fourcc)
3133                     continue
3134                 tbr = int(track.attrib['Bitrate']) // 1000
3135                 # [1] does not mention Width and Height attributes. However,
3136                 # they're often present while MaxWidth and MaxHeight are
3137                 # missing, so should be used as fallbacks
3138                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3139                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3140                 sampling_rate = int_or_none(track.get('SamplingRate'))
3141
3142                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3143                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3144
3145                 fragments = []
3146                 fragment_ctx = {
3147                     'time': 0,
3148                 }
3149                 stream_fragments = stream.findall('c')
3150                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3151                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3152                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3153                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3154                     if not fragment_ctx['duration']:
3155                         try:
3156                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3157                         except IndexError:
3158                             next_fragment_time = duration
3159                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3160                     for _ in range(fragment_repeat):
3161                         fragments.append({
3162                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3163                             'duration': fragment_ctx['duration'] / stream_timescale,
3164                         })
3165                         fragment_ctx['time'] += fragment_ctx['duration']
3166
3167                 if stream_type == 'text':
3168                     subtitles.setdefault(stream_language, []).append({
3169                         'ext': 'ismt',
3170                         'protocol': 'ism',
3171                         'url': ism_url,
3172                         'manifest_url': ism_url,
3173                         'fragments': fragments,
3174                         '_download_params': {
3175                             'stream_type': stream_type,
3176                             'duration': duration,
3177                             'timescale': stream_timescale,
3178                             'fourcc': fourcc,
3179                             'language': stream_language,
3180                             'codec_private_data': track.get('CodecPrivateData'),
3181                         }
3182                     })
3183                 elif stream_type in ('video', 'audio'):
3184                     formats.append({
3185                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3186                         'url': ism_url,
3187                         'manifest_url': ism_url,
3188                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3189                         'width': width,
3190                         'height': height,
3191                         'tbr': tbr,
3192                         'asr': sampling_rate,
3193                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3194                         'acodec': 'none' if stream_type == 'video' else fourcc,
3195                         'protocol': 'ism',
3196                         'fragments': fragments,
3197                         'has_drm': ism_doc.find('Protection') is not None,
3198                         '_download_params': {
3199                             'stream_type': stream_type,
3200                             'duration': duration,
3201                             'timescale': stream_timescale,
3202                             'width': width or 0,
3203                             'height': height or 0,
3204                             'fourcc': fourcc,
3205                             'language': stream_language,
3206                             'codec_private_data': track.get('CodecPrivateData'),
3207                             'sampling_rate': sampling_rate,
3208                             'channels': int_or_none(track.get('Channels', 2)),
3209                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3210                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3211                         },
3212                     })
3213         return formats, subtitles
3214
3215     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3216         def absolute_url(item_url):
3217             return urljoin(base_url, item_url)
3218
3219         def parse_content_type(content_type):
3220             if not content_type:
3221                 return {}
3222             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3223             if ctr:
3224                 mimetype, codecs = ctr.groups()
3225                 f = parse_codecs(codecs)
3226                 f['ext'] = mimetype2ext(mimetype)
3227                 return f
3228             return {}
3229
3230         def _media_formats(src, cur_media_type, type_info=None):
3231             type_info = type_info or {}
3232             full_url = absolute_url(src)
3233             ext = type_info.get('ext') or determine_ext(full_url)
3234             if ext == 'm3u8':
3235                 is_plain_url = False
3236                 formats = self._extract_m3u8_formats(
3237                     full_url, video_id, ext='mp4',
3238                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3239                     preference=preference, quality=quality, fatal=False)
3240             elif ext == 'mpd':
3241                 is_plain_url = False
3242                 formats = self._extract_mpd_formats(
3243                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3244             else:
3245                 is_plain_url = True
3246                 formats = [{
3247                     'url': full_url,
3248                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3249                     'ext': ext,
3250                 }]
3251             return is_plain_url, formats
3252
3253         entries = []
3254         # amp-video and amp-audio are very similar to their HTML5 counterparts
3255         # so we will include them right here (see
3256         # https://www.ampproject.org/docs/reference/components/amp-video)
3257         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3258         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3259         media_tags = [(media_tag, media_tag_name, media_type, '')
3260                       for media_tag, media_tag_name, media_type
3261                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3262         media_tags.extend(re.findall(
3263             # We only allow video|audio followed by a whitespace or '>'.
3264             # Allowing more characters may end up in significant slow down (see
3265             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3266             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3267             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3268         for media_tag, _, media_type, media_content in media_tags:
3269             media_info = {
3270                 'formats': [],
3271                 'subtitles': {},
3272             }
3273             media_attributes = extract_attributes(media_tag)
3274             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3275             if src:
3276                 f = parse_content_type(media_attributes.get('type'))
3277                 _, formats = _media_formats(src, media_type, f)
3278                 media_info['formats'].extend(formats)
3279             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3280             if media_content:
3281                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3282                     s_attr = extract_attributes(source_tag)
3283                     # data-video-src and data-src are non standard but seen
3284                     # several times in the wild
3285                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3286                     if not src:
3287                         continue
3288                     f = parse_content_type(s_attr.get('type'))
3289                     is_plain_url, formats = _media_formats(src, media_type, f)
3290                     if is_plain_url:
3291                         # width, height, res, label and title attributes are
3292                         # all not standard but seen several times in the wild
3293                         labels = [
3294                             s_attr.get(lbl)
3295                             for lbl in ('label', 'title')
3296                             if str_or_none(s_attr.get(lbl))
3297                         ]
3298                         width = int_or_none(s_attr.get('width'))
3299                         height = (int_or_none(s_attr.get('height'))
3300                                   or int_or_none(s_attr.get('res')))
3301                         if not width or not height:
3302                             for lbl in labels:
3303                                 resolution = parse_resolution(lbl)
3304                                 if not resolution:
3305                                     continue
3306                                 width = width or resolution.get('width')
3307                                 height = height or resolution.get('height')
3308                         for lbl in labels:
3309                             tbr = parse_bitrate(lbl)
3310                             if tbr:
3311                                 break
3312                         else:
3313                             tbr = None
3314                         f.update({
3315                             'width': width,
3316                             'height': height,
3317                             'tbr': tbr,
3318                             'format_id': s_attr.get('label') or s_attr.get('title'),
3319                         })
3320                         f.update(formats[0])
3321                         media_info['formats'].append(f)
3322                     else:
3323                         media_info['formats'].extend(formats)
3324                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3325                     track_attributes = extract_attributes(track_tag)
3326                     kind = track_attributes.get('kind')
3327                     if not kind or kind in ('subtitles', 'captions'):
3328                         src = strip_or_none(track_attributes.get('src'))
3329                         if not src:
3330                             continue
3331                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3332                         media_info['subtitles'].setdefault(lang, []).append({
3333                             'url': absolute_url(src),
3334                         })
3335             for f in media_info['formats']:
3336                 f.setdefault('http_headers', {})['Referer'] = base_url
3337             if media_info['formats'] or media_info['subtitles']:
3338                 entries.append(media_info)
3339         return entries
3340
3341     def _extract_akamai_formats(self, *args, **kwargs):
3342         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3343         if subs:
3344             self._report_ignoring_subs('akamai')
3345         return fmts
3346
3347     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3348         signed = 'hdnea=' in manifest_url
3349         if not signed:
3350             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3351             manifest_url = re.sub(
3352                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3353                 '', manifest_url).strip('?')
3354
3355         formats = []
3356         subtitles = {}
3357
3358         hdcore_sign = 'hdcore=3.7.0'
3359         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3360         hds_host = hosts.get('hds')
3361         if hds_host:
3362             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3363         if 'hdcore=' not in f4m_url:
3364             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3365         f4m_formats = self._extract_f4m_formats(
3366             f4m_url, video_id, f4m_id='hds', fatal=False)
3367         for entry in f4m_formats:
3368             entry.update({'extra_param_to_segment_url': hdcore_sign})
3369         formats.extend(f4m_formats)
3370
3371         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3372         hls_host = hosts.get('hls')
3373         if hls_host:
3374             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3375         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3376             m3u8_url, video_id, 'mp4', 'm3u8_native',
3377             m3u8_id='hls', fatal=False)
3378         formats.extend(m3u8_formats)
3379         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3380
3381         http_host = hosts.get('http')
3382         if http_host and m3u8_formats and not signed:
3383             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3384             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3385             qualities_length = len(qualities)
3386             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3387                 i = 0
3388                 for f in m3u8_formats:
3389                     if f['vcodec'] != 'none':
3390                         for protocol in ('http', 'https'):
3391                             http_f = f.copy()
3392                             del http_f['manifest_url']
3393                             http_url = re.sub(
3394                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3395                             http_f.update({
3396                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3397                                 'url': http_url,
3398                                 'protocol': protocol,
3399                             })
3400                             formats.append(http_f)
3401                         i += 1
3402
3403         return formats, subtitles
3404
3405     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3406         query = urllib.parse.urlparse(url).query
3407         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3408         mobj = re.search(
3409             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3410         url_base = mobj.group('url')
3411         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3412         formats = []
3413
3414         def manifest_url(manifest):
3415             m_url = f'{http_base_url}/{manifest}'
3416             if query:
3417                 m_url += '?%s' % query
3418             return m_url
3419
3420         if 'm3u8' not in skip_protocols:
3421             formats.extend(self._extract_m3u8_formats(
3422                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3423                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3424         if 'f4m' not in skip_protocols:
3425             formats.extend(self._extract_f4m_formats(
3426                 manifest_url('manifest.f4m'),
3427                 video_id, f4m_id='hds', fatal=False))
3428         if 'dash' not in skip_protocols:
3429             formats.extend(self._extract_mpd_formats(
3430                 manifest_url('manifest.mpd'),
3431                 video_id, mpd_id='dash', fatal=False))
3432         if re.search(r'(?:/smil:|\.smil)', url_base):
3433             if 'smil' not in skip_protocols:
3434                 rtmp_formats = self._extract_smil_formats(
3435                     manifest_url('jwplayer.smil'),
3436                     video_id, fatal=False)
3437                 for rtmp_format in rtmp_formats:
3438                     rtsp_format = rtmp_format.copy()
3439                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3440                     del rtsp_format['play_path']
3441                     del rtsp_format['ext']
3442                     rtsp_format.update({
3443                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3444                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3445                         'protocol': 'rtsp',
3446                     })
3447                     formats.extend([rtmp_format, rtsp_format])
3448         else:
3449             for protocol in ('rtmp', 'rtsp'):
3450                 if protocol not in skip_protocols:
3451                     formats.append({
3452                         'url': f'{protocol}:{url_base}',
3453                         'format_id': protocol,
3454                         'protocol': protocol,
3455                     })
3456         return formats
3457
3458     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3459         mobj = re.search(
3460             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3461             webpage)
3462         if mobj:
3463             try:
3464                 jwplayer_data = self._parse_json(mobj.group('options'),
3465                                                  video_id=video_id,
3466                                                  transform_source=transform_source)
3467             except ExtractorError:
3468                 pass
3469             else:
3470                 if isinstance(jwplayer_data, dict):
3471                     return jwplayer_data
3472
3473     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3474         jwplayer_data = self._find_jwplayer_data(
3475             webpage, video_id, transform_source=js_to_json)
3476         return self._parse_jwplayer_data(
3477             jwplayer_data, video_id, *args, **kwargs)
3478
3479     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3480                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3481         # JWPlayer backward compatibility: flattened playlists
3482         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3483         if 'playlist' not in jwplayer_data:
3484             jwplayer_data = {'playlist': [jwplayer_data]}
3485
3486         entries = []
3487
3488         # JWPlayer backward compatibility: single playlist item
3489         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3490         if not isinstance(jwplayer_data['playlist'], list):
3491             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3492
3493         for video_data in jwplayer_data['playlist']:
3494             # JWPlayer backward compatibility: flattened sources
3495             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3496             if 'sources' not in video_data:
3497                 video_data['sources'] = [video_data]
3498
3499             this_video_id = video_id or video_data['mediaid']
3500
3501             formats = self._parse_jwplayer_formats(
3502                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3503                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3504
3505             subtitles = {}
3506             tracks = video_data.get('tracks')
3507             if tracks and isinstance(tracks, list):
3508                 for track in tracks:
3509                     if not isinstance(track, dict):
3510                         continue
3511                     track_kind = track.get('kind')
3512                     if not track_kind or not isinstance(track_kind, str):
3513                         continue
3514                     if track_kind.lower() not in ('captions', 'subtitles'):
3515                         continue
3516                     track_url = urljoin(base_url, track.get('file'))
3517                     if not track_url:
3518                         continue
3519                     subtitles.setdefault(track.get('label') or 'en', []).append({
3520                         'url': self._proto_relative_url(track_url)
3521                     })
3522
3523             entry = {
3524                 'id': this_video_id,
3525                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3526                 'description': clean_html(video_data.get('description')),
3527                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3528                 'timestamp': int_or_none(video_data.get('pubdate')),
3529                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3530                 'subtitles': subtitles,
3531             }
3532             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3533             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3534                 entry.update({
3535                     '_type': 'url_transparent',
3536                     'url': formats[0]['url'],
3537                 })
3538             else:
3539                 self._sort_formats(formats)
3540                 entry['formats'] = formats
3541             entries.append(entry)
3542         if len(entries) == 1:
3543             return entries[0]
3544         else:
3545             return self.playlist_result(entries)
3546
3547     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3548                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3549         urls = []
3550         formats = []
3551         for source in jwplayer_sources_data:
3552             if not isinstance(source, dict):
3553                 continue
3554             source_url = urljoin(
3555                 base_url, self._proto_relative_url(source.get('file')))
3556             if not source_url or source_url in urls:
3557                 continue
3558             urls.append(source_url)
3559             source_type = source.get('type') or ''
3560             ext = mimetype2ext(source_type) or determine_ext(source_url)
3561             if source_type == 'hls' or ext == 'm3u8':
3562                 formats.extend(self._extract_m3u8_formats(
3563                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3564                     m3u8_id=m3u8_id, fatal=False))
3565             elif source_type == 'dash' or ext == 'mpd':
3566                 formats.extend(self._extract_mpd_formats(
3567                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3568             elif ext == 'smil':
3569                 formats.extend(self._extract_smil_formats(
3570                     source_url, video_id, fatal=False))
3571             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3572             elif source_type.startswith('audio') or ext in (
3573                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3574                 formats.append({
3575                     'url': source_url,
3576                     'vcodec': 'none',
3577                     'ext': ext,
3578                 })
3579             else:
3580                 height = int_or_none(source.get('height'))
3581                 if height is None:
3582                     # Often no height is provided but there is a label in
3583                     # format like "1080p", "720p SD", or 1080.
3584                     height = int_or_none(self._search_regex(
3585                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3586                         'height', default=None))
3587                 a_format = {
3588                     'url': source_url,
3589                     'width': int_or_none(source.get('width')),
3590                     'height': height,
3591                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3592                     'filesize': int_or_none(source.get('filesize')),
3593                     'ext': ext,
3594                 }
3595                 if source_url.startswith('rtmp'):
3596                     a_format['ext'] = 'flv'
3597                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3598                     # of jwplayer.flash.swf
3599                     rtmp_url_parts = re.split(
3600                         r'((?:mp4|mp3|flv):)', source_url, 1)
3601                     if len(rtmp_url_parts) == 3:
3602                         rtmp_url, prefix, play_path = rtmp_url_parts
3603                         a_format.update({
3604                             'url': rtmp_url,
3605                             'play_path': prefix + play_path,
3606                         })
3607                     if rtmp_params:
3608                         a_format.update(rtmp_params)
3609                 formats.append(a_format)
3610         return formats
3611
3612     def _live_title(self, name):
3613         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3614         return name
3615
3616     def _int(self, v, name, fatal=False, **kwargs):
3617         res = int_or_none(v, **kwargs)
3618         if res is None:
3619             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3620             if fatal:
3621                 raise ExtractorError(msg)
3622             else:
3623                 self.report_warning(msg)
3624         return res
3625
3626     def _float(self, v, name, fatal=False, **kwargs):
3627         res = float_or_none(v, **kwargs)
3628         if res is None:
3629             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3630             if fatal:
3631                 raise ExtractorError(msg)
3632             else:
3633                 self.report_warning(msg)
3634         return res
3635
3636     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3637                     path='/', secure=False, discard=False, rest={}, **kwargs):
3638         cookie = http.cookiejar.Cookie(
3639             0, name, value, port, port is not None, domain, True,
3640             domain.startswith('.'), path, True, secure, expire_time,
3641             discard, None, None, rest)
3642         self.cookiejar.set_cookie(cookie)
3643
3644     def _get_cookies(self, url):
3645         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3646         return LenientSimpleCookie(self._downloader._calc_cookies(url))
3647
3648     def _apply_first_set_cookie_header(self, url_handle, cookie):
3649         """
3650         Apply first Set-Cookie header instead of the last. Experimental.
3651
3652         Some sites (e.g. [1-3]) may serve two cookies under the same name
3653         in Set-Cookie header and expect the first (old) one to be set rather
3654         than second (new). However, as of RFC6265 the newer one cookie
3655         should be set into cookie store what actually happens.
3656         We will workaround this issue by resetting the cookie to
3657         the first one manually.
3658         1. https://new.vk.com/
3659         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3660         3. https://learning.oreilly.com/
3661         """
3662         for header, cookies in url_handle.headers.items():
3663             if header.lower() != 'set-cookie':
3664                 continue
3665             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3666             cookie_value = re.search(
3667                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3668             if cookie_value:
3669                 value, domain = cookie_value.groups()
3670                 self._set_cookie(domain, cookie, value)
3671                 break
3672
3673     @classmethod
3674     def get_testcases(cls, include_onlymatching=False):
3675         t = getattr(cls, '_TEST', None)
3676         if t:
3677             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3678             tests = [t]
3679         else:
3680             tests = getattr(cls, '_TESTS', [])
3681         for t in tests:
3682             if not include_onlymatching and t.get('only_matching', False):
3683                 continue
3684             t['name'] = cls.ie_key()
3685             yield t
3686
3687     @classmethod
3688     def get_webpage_testcases(cls):
3689         tests = getattr(cls, '_WEBPAGE_TESTS', [])
3690         for t in tests:
3691             t['name'] = cls.ie_key()
3692         return tests
3693
3694     @classproperty
3695     def age_limit(cls):
3696         """Get age limit from the testcases"""
3697         return max(traverse_obj(
3698             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3699             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3700
3701     @classmethod
3702     def is_suitable(cls, age_limit):
3703         """Test whether the extractor is generally suitable for the given age limit"""
3704         return not age_restricted(cls.age_limit, age_limit)
3705
3706     @classmethod
3707     def description(cls, *, markdown=True, search_examples=None):
3708         """Description of the extractor"""
3709         desc = ''
3710         if cls._NETRC_MACHINE:
3711             if markdown:
3712                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3713             else:
3714                 desc += f' [{cls._NETRC_MACHINE}]'
3715         if cls.IE_DESC is False:
3716             desc += ' [HIDDEN]'
3717         elif cls.IE_DESC:
3718             desc += f' {cls.IE_DESC}'
3719         if cls.SEARCH_KEY:
3720             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3721             if search_examples:
3722                 _COUNTS = ('', '5', '10', 'all')
3723                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3724         if not cls.working():
3725             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3726
3727         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3728         return f'{name}:{desc}' if desc else name
3729
3730     def extract_subtitles(self, *args, **kwargs):
3731         if (self.get_param('writesubtitles', False)
3732                 or self.get_param('listsubtitles')):
3733             return self._get_subtitles(*args, **kwargs)
3734         return {}
3735
3736     def _get_subtitles(self, *args, **kwargs):
3737         raise NotImplementedError('This method must be implemented by subclasses')
3738
3739     def extract_comments(self, *args, **kwargs):
3740         if not self.get_param('getcomments'):
3741             return None
3742         generator = self._get_comments(*args, **kwargs)
3743
3744         def extractor():
3745             comments = []
3746             interrupted = True
3747             try:
3748                 while True:
3749                     comments.append(next(generator))
3750             except StopIteration:
3751                 interrupted = False
3752             except KeyboardInterrupt:
3753                 self.to_screen('Interrupted by user')
3754             except Exception as e:
3755                 if self.get_param('ignoreerrors') is not True:
3756                     raise
3757                 self._downloader.report_error(e)
3758             comment_count = len(comments)
3759             self.to_screen(f'Extracted {comment_count} comments')
3760             return {
3761                 'comments': comments,
3762                 'comment_count': None if interrupted else comment_count
3763             }
3764         return extractor
3765
3766     def _get_comments(self, *args, **kwargs):
3767         raise NotImplementedError('This method must be implemented by subclasses')
3768
3769     @staticmethod
3770     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3771         """ Merge subtitle items for one language. Items with duplicated URLs/data
3772         will be dropped. """
3773         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3774         ret = list(subtitle_list1)
3775         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3776         return ret
3777
3778     @classmethod
3779     def _merge_subtitles(cls, *dicts, target=None):
3780         """ Merge subtitle dictionaries, language by language. """
3781         if target is None:
3782             target = {}
3783         for d in dicts:
3784             for lang, subs in d.items():
3785                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3786         return target
3787
3788     def extract_automatic_captions(self, *args, **kwargs):
3789         if (self.get_param('writeautomaticsub', False)
3790                 or self.get_param('listsubtitles')):
3791             return self._get_automatic_captions(*args, **kwargs)
3792         return {}
3793
3794     def _get_automatic_captions(self, *args, **kwargs):
3795         raise NotImplementedError('This method must be implemented by subclasses')
3796
3797     @functools.cached_property
3798     def _cookies_passed(self):
3799         """Whether cookies have been passed to YoutubeDL"""
3800         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3801
3802     def mark_watched(self, *args, **kwargs):
3803         if not self.get_param('mark_watched', False):
3804             return
3805         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3806             self._mark_watched(*args, **kwargs)
3807
3808     def _mark_watched(self, *args, **kwargs):
3809         raise NotImplementedError('This method must be implemented by subclasses')
3810
3811     def geo_verification_headers(self):
3812         headers = {}
3813         geo_verification_proxy = self.get_param('geo_verification_proxy')
3814         if geo_verification_proxy:
3815             headers['Ytdl-request-proxy'] = geo_verification_proxy
3816         return headers
3817
3818     @staticmethod
3819     def _generic_id(url):
3820         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3821
3822     @staticmethod
3823     def _generic_title(url):
3824         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3825
3826     @staticmethod
3827     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3828         all_known = all(map(
3829             lambda x: x is not None,
3830             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3831         return (
3832             'private' if is_private
3833             else 'premium_only' if needs_premium
3834             else 'subscriber_only' if needs_subscription
3835             else 'needs_auth' if needs_auth
3836             else 'unlisted' if is_unlisted
3837             else 'public' if all_known
3838             else None)
3839
3840     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3841         '''
3842         @returns            A list of values for the extractor argument given by "key"
3843                             or "default" if no such key is present
3844         @param default      The default value to return when the key is not present (default: [])
3845         @param casesense    When false, the values are converted to lower case
3846         '''
3847         val = traverse_obj(
3848             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3849         if val is None:
3850             return [] if default is NO_DEFAULT else default
3851         return list(val) if casesense else [x.lower() for x in val]
3852
3853     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3854         if not playlist_id or not video_id:
3855             return not video_id
3856
3857         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3858         if no_playlist is not None:
3859             return not no_playlist
3860
3861         video_id = '' if video_id is True else f' {video_id}'
3862         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3863         if self.get_param('noplaylist'):
3864             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3865             return False
3866         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3867         return True
3868
3869     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3870         RetryManager.report_retry(
3871             err, _count or int(fatal), _retries,
3872             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3873             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3874
3875     def RetryManager(self, **kwargs):
3876         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3877
3878     @classmethod
3879     def extract_from_webpage(cls, ydl, url, webpage):
3880         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3881               else ydl.get_info_extractor(cls.ie_key()))
3882         for info in ie._extract_from_webpage(url, webpage) or []:
3883             # url = None since we do not want to set (webpage/original)_url
3884             ydl.add_default_extra_info(info, ie, None)
3885             yield info
3886
3887     @classmethod
3888     def _extract_from_webpage(cls, url, webpage):
3889         for embed_url in orderedSet(
3890                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3891             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3892
3893     @classmethod
3894     def _extract_embed_urls(cls, url, webpage):
3895         """@returns all the embed urls on the webpage"""
3896         if '_EMBED_URL_RE' not in cls.__dict__:
3897             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3898             for idx, regex in enumerate(cls._EMBED_REGEX):
3899                 assert regex.count('(?P<url>') == 1, \
3900                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3901             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3902
3903         for regex in cls._EMBED_URL_RE:
3904             for mobj in regex.finditer(webpage):
3905                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3906                 if cls._VALID_URL is False or cls.suitable(embed_url):
3907                     yield embed_url
3908
    # Plain Exception subclass exposed as an attribute of the extractor class.
    # It adds no behaviour of its own; where it is raised and caught is not
    # visible in this part of the file.
    class StopExtraction(Exception):
        pass
3911
3912     @classmethod
3913     def _extract_url(cls, webpage):  # TODO: Remove
3914         """Only for compatibility with some older extractors"""
3915         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3916
    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """
        Subclass hook; lets a subclass declared with `plugin_name` replace the class it extends

        When `plugin_name` is given:
          - the class following `cls` in its MRO is stored as `cls.__wrapped__`
          - `cls.IE_NAME` becomes "<wrapped IE_NAME>+<plugin_name>" and
            `cls.ie_key` is taken from the wrapped class
          - the innermost class of the `__wrapped__` chain is looked up, and its
            name in the module that defines it is rebound to `cls`

        Remaining keyword arguments are forwarded to the parent hook.
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class right after `cls` in the MRO is the one being wrapped
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
            # Follow the `__wrapped__` links down to the innermost wrapped class
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind that class's name in its own module so lookups there get `cls`
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)

        return super().__init_subclass__(**kwargs)
3928
3929
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|<positive number>):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classproperty
    def _VALID_URL(cls):
        search_key = cls._SEARCH_KEY
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % search_key

    def _real_extract(self, query):
        """Dispatch on the prefix: empty -> 1 result, "all" -> _MAX_RESULTS, number -> that many"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Cap the request at what the extractor can return
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # An infinite limit means "do not cut off the results"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for _SEARCH_KEY"""
        return cls._SEARCH_KEY
3972
3973
# Extractor whose _VALID_URL pattern matches any URL and whose extraction
# always fails with UnsupportedError.
class UnsupportedURLIE(InfoExtractor):
    _VALID_URL = '.*'  # matches every URL
    _ENABLED = False
    IE_DESC = False  # InfoExtractor.description() renders a False IE_DESC as "[HIDDEN]"

    def _real_extract(self, url):
        # Never extracts anything: every URL is reported as unsupported
        raise UnsupportedError(url)