yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import itertools
   9 import json
  10 import math
  11 import netrc
  12 import os
  13 import random
  14 import re
  15 import sys
  16 import time
  17 import types
  18 import urllib.parse
  19 import urllib.request
  20 import xml.etree.ElementTree
  21
  22 from ..compat import functools  # isort: split
  23 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  24 from ..downloader import FileDownloader
  25 from ..downloader.f4m import get_base_url, remove_encrypted_media
  26 from ..utils import (
  27     IDENTITY,
  28     JSON_LD_RE,
  29     NO_DEFAULT,
  30     ExtractorError,
  31     GeoRestrictedError,
  32     GeoUtils,
  33     LenientJSONDecoder,
  34     RegexNotFoundError,
  35     RetryManager,
  36     UnsupportedError,
  37     age_restricted,
  38     base_url,
  39     bug_reports_message,
  40     classproperty,
  41     clean_html,
  42     determine_ext,
  43     determine_protocol,
  44     dict_get,
  45     encode_data_uri,
  46     error_to_compat_str,
  47     extract_attributes,
  48     filter_dict,
  49     fix_xml_ampersands,
  50     float_or_none,
  51     format_field,
  52     int_or_none,
  53     join_nonempty,
  54     js_to_json,
  55     mimetype2ext,
  56     network_exceptions,
  57     orderedSet,
  58     parse_bitrate,
  59     parse_codecs,
  60     parse_duration,
  61     parse_iso8601,
  62     parse_m3u8_attributes,
  63     parse_resolution,
  64     sanitize_filename,
  65     sanitize_url,
  66     sanitized_Request,
  67     str_or_none,
  68     str_to_int,
  69     strip_or_none,
  70     traverse_obj,
  71     try_call,
  72     try_get,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     url_basename,
  79     url_or_none,
  80     urljoin,
  81     variadic,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor:
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped. Set to an empty string if video has
 107                     no title as opposed to "None" which signifies that the
 108                     extractor failed to obtain a title
 109
 110     Additionally, it must contain either a formats entry or a url one:
 111
 112     formats:        A list of dictionaries for each format available, ordered
 113                     from worst to best quality.
 114
 115                     Potential fields:
 116                     * url        The mandatory URL representing the media:
 117                                    for plain file media - HTTP URL of this file,
 118                                    for RTMP - RTMP URL,
 119                                    for HLS - URL of the M3U8 media playlist,
 120                                    for HDS - URL of the F4M manifest,
 121                                    for DASH
 122                                      - HTTP URL to plain file media (in case of
 123                                        unfragmented media)
 124                                      - URL of the MPD manifest or base URL
 125                                        representing the media if MPD manifest
 126                                        is parsed from a string (in case of
 127                                        fragmented media)
 128                                    for MSS - URL of the ISM manifest.
 129                     * manifest_url
 130                                  The URL of the manifest file in case of
 131                                  fragmented media:
 132                                    for HLS - URL of the M3U8 master playlist,
 133                                    for HDS - URL of the F4M manifest,
 134                                    for DASH - URL of the MPD manifest,
 135                                    for MSS - URL of the ISM manifest.
 136                     * manifest_stream_number  (For internal use only)
 137                                  The index of the stream in the manifest file
 138                     * ext        Will be calculated from URL if missing
 139                     * format     A human-readable description of the format
 140                                  ("mp4 container with h264/opus").
  141                                  Calculated from the format_id, width, height,
 142                                  and format_note fields if missing.
 143                     * format_id  A short description of the format
 144                                  ("mp4_h264_opus" or "19").
 145                                 Technically optional, but strongly recommended.
 146                     * format_note Additional info about the format
 147                                  ("3D" or "DASH video")
 148                     * width      Width of the video, if known
 149                     * height     Height of the video, if known
 150                     * resolution Textual description of width and height
 151                     * dynamic_range The dynamic range of the video. One of:
  152                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 153                     * tbr        Average bitrate of audio and video in KBit/s
 154                     * abr        Average audio bitrate in KBit/s
 155                     * acodec     Name of the audio codec in use
 156                     * asr        Audio sampling rate in Hertz
 157                     * audio_channels  Number of audio channels
 158                     * vbr        Average video bitrate in KBit/s
 159                     * fps        Frame rate
 160                     * vcodec     Name of the video codec in use
 161                     * container  Name of the container format
 162                     * filesize   The number of bytes, if known in advance
 163                     * filesize_approx  An estimate for the number of bytes
 164                     * player_url SWF Player URL (used for rtmpdump).
 165                     * protocol   The protocol that will be used for the actual
 166                                  download, lower-case. One of "http", "https" or
 167                                  one of the protocols defined in downloader.PROTOCOL_MAP
 168                     * fragment_base_url
 169                                  Base URL for fragments. Each fragment's path
 170                                  value (if present) will be relative to
 171                                  this URL.
 172                     * fragments  A list of fragments of a fragmented media.
 173                                  Each fragment entry must contain either an url
 174                                  or a path. If an url is present it should be
 175                                  considered by a client. Otherwise both path and
 176                                  fragment_base_url must be present. Here is
 177                                  the list of all potential fields:
 178                                  * "url" - fragment's URL
 179                                  * "path" - fragment's path relative to
 180                                             fragment_base_url
 181                                  * "duration" (optional, int or float)
 182                                  * "filesize" (optional, int)
 183                     * is_from_start  Is a live format that can be downloaded
 184                                 from the start. Boolean
 185                     * preference Order number of this format. If this field is
 186                                  present and not None, the formats get sorted
 187                                  by this field, regardless of all other values.
 188                                  -1 for default (order by other properties),
 189                                  -2 or smaller for less than default.
 190                                  < -1000 to hide the format (if there is
 191                                     another one which is strictly better)
 192                     * language   Language code, e.g. "de" or "en-US".
 193                     * language_preference  Is this in the language mentioned in
 194                                  the URL?
 195                                  10 if it's what the URL is about,
 196                                  -1 for default (don't know),
 197                                  -10 otherwise, other values reserved for now.
 198                     * quality    Order number of the video quality of this
 199                                  format, irrespective of the file format.
 200                                  -1 for default (order by other properties),
 201                                  -2 or smaller for less than default.
 202                     * source_preference  Order number for this video source
 203                                   (quality takes higher priority)
 204                                  -1 for default (order by other properties),
 205                                  -2 or smaller for less than default.
 206                     * http_headers  A dictionary of additional HTTP headers
 207                                  to add to the request.
 208                     * stretched_ratio  If given and not 1, indicates that the
 209                                  video's pixels are not square.
 210                                  width : height ratio as float.
 211                     * no_resume  The server does not support resuming the
 212                                  (HTTP or RTMP) download. Boolean.
 213                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 214                     * downloader_options  A dictionary of downloader options
 215                                  (For internal use only)
 216                                  * http_chunk_size Chunk size for HTTP downloads
 217                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 218                     RTMP formats can also have the additional fields: page_url,
 219                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 220                     rtmp_protocol, rtmp_real_time
 221
 222     url:            Final video URL.
 223     ext:            Video filename extension.
 224     format:         The video format, defaults to ext (used for --get-format)
 225     player_url:     SWF Player URL (used for rtmpdump).
 226
 227     The following fields are optional:
 228
 229     direct:         True if a direct video file was given (must only be set by GenericIE)
 230     alt_title:      A secondary title of the video.
  231     display_id:     An alternative identifier for the video, not necessarily
 232                     unique, but available before title. Typically, id is
 233                     something like "4234987", title "Dancing naked mole rats",
 234                     and display_id "dancing-naked-mole-rats"
 235     thumbnails:     A list of dictionaries, with the following entries:
 236                         * "id" (optional, string) - Thumbnail format ID
 237                         * "url"
 238                         * "preference" (optional, int) - quality of the image
 239                         * "width" (optional, int)
 240                         * "height" (optional, int)
 241                         * "resolution" (optional, string "{width}x{height}",
 242                                         deprecated)
 243                         * "filesize" (optional, int)
 244                         * "http_headers" (dict) - HTTP headers for the request
 245     thumbnail:      Full URL to a video thumbnail image.
 246     description:    Full video description.
 247     uploader:       Full name of the video uploader.
 248     license:        License name the video is licensed under.
 249     creator:        The creator of the video.
 250     timestamp:      UNIX timestamp of the moment the video was uploaded
 251     upload_date:    Video upload date in UTC (YYYYMMDD).
 252                     If not explicitly set, calculated from timestamp
 253     release_timestamp: UNIX timestamp of the moment the video was released.
 254                     If it is not clear whether to use timestamp or this, use the former
 255     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 256                     If not explicitly set, calculated from release_timestamp
 257     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 258     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 259                     If not explicitly set, calculated from modified_timestamp
 260     uploader_id:    Nickname or id of the video uploader.
 261     uploader_url:   Full URL to a personal webpage of the video uploader.
 262     channel:        Full name of the channel the video is uploaded on.
 263                     Note that channel fields may or may not repeat uploader
 264                     fields. This depends on a particular extractor.
 265     channel_id:     Id of the channel.
 266     channel_url:    Full URL to a channel webpage.
 267     channel_follower_count: Number of followers of the channel.
 268     location:       Physical location where the video was filmed.
 269     subtitles:      The available subtitles as a dictionary in the format
 270                     {tag: subformats}. "tag" is usually a language code, and
 271                     "subformats" is a list sorted from lower to higher
 272                     preference, each element is a dictionary with the "ext"
 273                     entry and one of:
 274                         * "data": The subtitles file contents
 275                         * "url": A URL pointing to the subtitles file
 276                     It can optionally also have:
 277                         * "name": Name or description of the subtitles
 278                         * "http_headers": A dictionary of additional HTTP headers
 279                                   to add to the request.
 280                     "ext" will be calculated from URL if missing
 281     automatic_captions: Like 'subtitles'; contains automatically generated
 282                     captions instead of normal subtitles
 283     duration:       Length of the video in seconds, as an integer or float.
 284     view_count:     How many users have watched the video on the platform.
 285     like_count:     Number of positive ratings of the video
 286     dislike_count:  Number of negative ratings of the video
 287     repost_count:   Number of reposts of the video
  288     average_rating: Average rating given by users, the scale used depends on the webpage
 289     comment_count:  Number of comments on the video
 290     comments:       A list of comments, each with one or more of the following
 291                     properties (all but one of text or html optional):
 292                         * "author" - human-readable name of the comment author
 293                         * "author_id" - user ID of the comment author
 294                         * "author_thumbnail" - The thumbnail of the comment author
 295                         * "id" - Comment ID
 296                         * "html" - Comment as HTML
 297                         * "text" - Plain text of the comment
 298                         * "timestamp" - UNIX timestamp of comment
 299                         * "parent" - ID of the comment this one is replying to.
 300                                      Set to "root" to indicate that this is a
 301                                      comment to the original video.
 302                         * "like_count" - Number of positive ratings of the comment
 303                         * "dislike_count" - Number of negative ratings of the comment
 304                         * "is_favorited" - Whether the comment is marked as
 305                                            favorite by the video uploader
 306                         * "author_is_uploader" - Whether the comment is made by
 307                                                  the video uploader
 308     age_limit:      Age restriction for the video, as an integer (years)
 309     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 310                     should allow to get the same result again. (It will be set
 311                     by YoutubeDL if it's missing)
 312     categories:     A list of categories that the video falls in, for example
 313                     ["Sports", "Berlin"]
 314     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 315     cast:           A list of the video cast
 316     is_live:        True, False, or None (=unknown). Whether this video is a
 317                     live stream that goes on instead of a fixed-length video.
 318     was_live:       True, False, or None (=unknown). Whether this video was
 319                     originally a live stream.
 320     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 321                     or 'post_live' (was live, but VOD is not yet processed)
 322                     If absent, automatically set from is_live, was_live
 323     start_time:     Time in seconds where the reproduction should start, as
 324                     specified in the URL.
 325     end_time:       Time in seconds where the reproduction should end, as
 326                     specified in the URL.
 327     chapters:       A list of dictionaries, with the following entries:
 328                         * "start_time" - The start time of the chapter in seconds
 329                         * "end_time" - The end time of the chapter in seconds
 330                         * "title" (optional, string)
 331     playable_in_embed: Whether this video is allowed to play in embedded
 332                     players on other sites. Can be True (=always allowed),
 333                     False (=never allowed), None (=unknown), or a string
 334                     specifying the criteria for embedability; e.g. 'whitelist'
 335     availability:   Under what condition the video is available. One of
 336                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 337                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 338                     to set it
 339     _old_archive_ids: A list of old archive ids needed for backward compatibility
 340     __post_extractor: A function to be called just before the metadata is
 341                     written to either disk, logger or console. The function
 342                     must return a dict which will be added to the info_dict.
  343                     This is useful for additional information that is
 344                     time-consuming to extract. Note that the fields thus
 345                     extracted will not be available to output template and
 346                     match_filter. So, only "comments" and "comment_count" are
 347                     currently allowed to be extracted via this method.
 348
 349     The following fields should only be used when the video belongs to some logical
 350     chapter or section:
 351
 352     chapter:        Name or title of the chapter the video belongs to.
 353     chapter_number: Number of the chapter the video belongs to, as an integer.
 354     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 355
 356     The following fields should only be used when the video is an episode of some
 357     series, programme or podcast:
 358
 359     series:         Title of the series or programme the video episode belongs to.
 360     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 361     season:         Title of the season the video episode belongs to.
 362     season_number:  Number of the season the video episode belongs to, as an integer.
 363     season_id:      Id of the season the video episode belongs to, as a unicode string.
 364     episode:        Title of the video episode. Unlike mandatory video title field,
 365                     this field should denote the exact title of the video episode
 366                     without any kind of decoration.
 367     episode_number: Number of the video episode within a season, as an integer.
 368     episode_id:     Id of the video episode, as a unicode string.
 369
 370     The following fields should only be used when the media is a track or a part of
 371     a music album:
 372
 373     track:          Title of the track.
 374     track_number:   Number of the track within an album or a disc, as an integer.
 375     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 376                     as a unicode string.
 377     artist:         Artist(s) of the track.
 378     genre:          Genre(s) of the track.
 379     album:          Title of the album the track belongs to.
 380     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 381     album_artist:   List of all artists appeared on the album (e.g.
 382                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 383                     and compilations).
 384     disc_number:    Number of the disc or other physical medium the track belongs to,
 385                     as an integer.
 386     release_year:   Year (YYYY) when the album was released.
 387     composer:       Composer of the piece
 388
 389     The following fields should only be set for clips that should be cut from the original video:
 390
 391     section_start:  Start time of the section in seconds
 392     section_end:    End time of the section in seconds
 393
 394     The following fields should only be set for storyboards:
 395     rows:           Number of rows in each storyboard fragment, as an integer
 396     columns:        Number of columns in each storyboard fragment, as an integer
 397
 398     Unless mentioned otherwise, the fields should be Unicode strings.
 399
 400     Unless mentioned otherwise, None is equivalent to absence of information.
 401
 402
 403     _type "playlist" indicates multiple videos.
 404     There must be a key "entries", which is a list, an iterable, or a PagedList
 405     object, each element of which is a valid dictionary by this specification.
 406
 407     Additionally, playlists can have "id", "title", and any other relevant
 408     attributes with the same semantics as videos (see above).
 409
 410     It can also have the following optional fields:
 411
 412     playlist_count: The total number of videos in a playlist. If not given,
 413                     YoutubeDL tries to calculate it from "entries"
 414
 415
 416     _type "multi_video" indicates that there are multiple videos that
  417     form a single show, for example multiple acts of an opera or TV episode.
 418     It must have an entries key like a playlist and contain all the keys
 419     required for a video at the same time.
 420
 421
 422     _type "url" indicates that the video must be extracted from another
 423     location, possibly by a different extractor. Its only required key is:
 424     "url" - the next URL to extract.
 425     The key "ie_key" can be set to the class name (minus the trailing "IE",
 426     e.g. "Youtube") if the extractor class is known in advance.
 427     Additionally, the dictionary may have any properties of the resolved entity
 428     known in advance, for example "title" if the title of the referred video is
 429     known ahead of time.
 430
 431
 432     _type "url_transparent" entities have the same specification as "url", but
 433     indicate that the given additional information is more precise than the one
 434     associated with the resolved URL.
 435     This is useful when a site employs a video service that hosts the video and
 436     its technical metadata, but that video service does not embed a useful
 437     title, description etc.
 438
 439
 440     Subclasses of this should also be added to the list of extractors and
 441     should define a _VALID_URL regexp and, re-define the _real_extract() and
 442     (optionally) _real_initialize() methods.
 443
 444     Subclasses may also override suitable() if necessary, but ensure the function
 445     signature is preserved and that this function imports everything it needs
 446     (except other extractors), so that lazy_extractors works correctly.
 447
 448     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 449     the HTML of Generic webpages. It may also override _extract_embed_urls
 450     or _extract_from_webpage as necessary. While these are normally classmethods,
 451     _extract_from_webpage is allowed to be an instance method.
 452
 453     _extract_from_webpage may raise self.StopExtraction() to stop further
 454     processing of the webpage and obtain exclusive rights to it. This is useful
 455     when the extractor cannot reliably be matched using just the URL,
 456     e.g. invidious/peertube instances
 457
 458     Embed-only extractors can be defined by setting _VALID_URL = False.
 459
 460     To support username + password (or netrc) login, the extractor must define a
 461     _NETRC_MACHINE and re-define _perform_login(username, password) and
 462     (optionally) _initialize_pre_login() methods. The _perform_login method will
 463     be called between _initialize_pre_login and _real_initialize if credentials
 464     are passed by the user. In cases where it is necessary to have the login
 465     process as part of the extraction rather than initialization, _perform_login
 466     can be left undefined.
 467
 468     _GEO_BYPASS attribute may be set to False in order to disable
 469     geo restriction bypass mechanisms for a particular extractor.
 470     Though it won't disable explicit geo restriction bypass based on
 471     country code provided with geo_bypass_country.
 472
 473     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 474     countries for this extractor. One of these countries will be used by
 475     geo restriction bypass mechanism right away in order to bypass
 476     geo restriction, of course, if the mechanism is not disabled.
 477
 478     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 479     IP blocks in CIDR notation for this extractor. One of these IP blocks
 480     will be used by geo restriction bypass mechanism similarly
 481     to _GEO_COUNTRIES.
 482
 483     The _WORKING attribute should be set to False for broken IEs
 484     in order to warn the users and skip the tests.
 485     """
 486
 487     _ready = False
 488     _downloader = None
 489     _x_forwarded_for_ip = None
 490     _GEO_BYPASS = True
 491     _GEO_COUNTRIES = None
 492     _GEO_IP_BLOCKS = None
 493     _WORKING = True
 494     _NETRC_MACHINE = None
 495     IE_DESC = None
 496     SEARCH_KEY = None
 497     _VALID_URL = None
 498     _EMBED_REGEX = []
 499
 500     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 501         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 502         return {
 503             None: '',
 504             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 505             'password': f'Use {password_hint}',
 506             'cookies': (
 507                 'Use --cookies-from-browser or --cookies for the authentication. '
 508                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 509         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 510
 511     def __init__(self, downloader=None):
 512         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 513         If a downloader is not passed during initialization,
 514         it must be set using "set_downloader()" before "extract()" is called"""
 515         self._ready = False
 516         self._x_forwarded_for_ip = None
 517         self._printed_messages = set()
 518         self.set_downloader(downloader)
 519
 520     @classmethod
 521     def _match_valid_url(cls, url):
 522         if cls._VALID_URL is False:
 523             return None
 524         # This does not use has/getattr intentionally - we want to know whether
 525         # we have cached the regexp for *this* class, whereas getattr would also
 526         # match the superclass
 527         if '_VALID_URL_RE' not in cls.__dict__:
 528             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 529         return cls._VALID_URL_RE.match(url)
 530
 531     @classmethod
 532     def suitable(cls, url):
 533         """Receives a URL and returns True if suitable for this IE."""
 534         # This function must import everything it needs (except other extractors),
 535         # so that lazy_extractors works correctly
 536         return cls._match_valid_url(url) is not None
 537
 538     @classmethod
 539     def _match_id(cls, url):
 540         return cls._match_valid_url(url).group('id')
 541
 542     @classmethod
 543     def get_temp_id(cls, url):
 544         try:
 545             return cls._match_id(url)
 546         except (IndexError, AttributeError):
 547             return None
 548
 549     @classmethod
 550     def working(cls):
 551         """Getter method for _WORKING."""
 552         return cls._WORKING
 553
 554     @classmethod
 555     def supports_login(cls):
 556         return bool(cls._NETRC_MACHINE)
 557
 558     def initialize(self):
 559         """Initializes an instance (authentication, etc)."""
 560         self._printed_messages = set()
 561         self._initialize_geo_bypass({
 562             'countries': self._GEO_COUNTRIES,
 563             'ip_blocks': self._GEO_IP_BLOCKS,
 564         })
 565         if not self._ready:
 566             self._initialize_pre_login()
 567             if self.supports_login():
 568                 username, password = self._get_login_info()
 569                 if username:
 570                     self._perform_login(username, password)
 571             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 572                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 573             self._real_initialize()
 574             self._ready = True
 575
 576     def _initialize_geo_bypass(self, geo_bypass_context):
 577         """
 578         Initialize geo restriction bypass mechanism.
 579
 580         This method is used to initialize geo bypass mechanism based on faking
 581         X-Forwarded-For HTTP header. A random country from provided country list
 582         is selected and a random IP belonging to this country is generated. This
 583         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 584         HTTP requests.
 585
 586         This method will be used for initial geo bypass mechanism initialization
 587         during the instance initialization with _GEO_COUNTRIES and
 588         _GEO_IP_BLOCKS.
 589
 590         You may also manually call it from extractor's code if geo bypass
 591         information is not available beforehand (e.g. obtained during
 592         extraction) or due to some other reason. In this case you should pass
 593         this information in geo bypass context passed as first argument. It may
 594         contain following fields:
 595
 596         countries:  List of geo unrestricted countries (similar
 597                     to _GEO_COUNTRIES)
 598         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 599                     (similar to _GEO_IP_BLOCKS)
 600
 601         """
 602         if not self._x_forwarded_for_ip:
 603
 604             # Geo bypass mechanism is explicitly disabled by user
 605             if not self.get_param('geo_bypass', True):
 606                 return
 607
 608             if not geo_bypass_context:
 609                 geo_bypass_context = {}
 610
 611             # Backward compatibility: previously _initialize_geo_bypass
 612             # expected a list of countries, some 3rd party code may still use
 613             # it this way
 614             if isinstance(geo_bypass_context, (list, tuple)):
 615                 geo_bypass_context = {
 616                     'countries': geo_bypass_context,
 617                 }
 618
 619             # The whole point of geo bypass mechanism is to fake IP
 620             # as X-Forwarded-For HTTP header based on some IP block or
 621             # country code.
 622
 623             # Path 1: bypassing based on IP block in CIDR notation
 624
 625             # Explicit IP block specified by user, use it right away
 626             # regardless of whether extractor is geo bypassable or not
 627             ip_block = self.get_param('geo_bypass_ip_block', None)
 628
 629             # Otherwise use random IP block from geo bypass context but only
 630             # if extractor is known as geo bypassable
 631             if not ip_block:
 632                 ip_blocks = geo_bypass_context.get('ip_blocks')
 633                 if self._GEO_BYPASS and ip_blocks:
 634                     ip_block = random.choice(ip_blocks)
 635
 636             if ip_block:
 637                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 638                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 639                 return
 640
 641             # Path 2: bypassing based on country code
 642
 643             # Explicit country code specified by user, use it right away
 644             # regardless of whether extractor is geo bypassable or not
 645             country = self.get_param('geo_bypass_country', None)
 646
 647             # Otherwise use random country code from geo bypass context but
 648             # only if extractor is known as geo bypassable
 649             if not country:
 650                 countries = geo_bypass_context.get('countries')
 651                 if self._GEO_BYPASS and countries:
 652                     country = random.choice(countries)
 653
 654             if country:
 655                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 656                 self._downloader.write_debug(
 657                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 658
 659     def extract(self, url):
 660         """Extracts URL information and returns it in list of dicts."""
 661         try:
 662             for _ in range(2):
 663                 try:
 664                     self.initialize()
 665                     self.write_debug('Extracting URL: %s' % url)
 666                     ie_result = self._real_extract(url)
 667                     if ie_result is None:
 668                         return None
 669                     if self._x_forwarded_for_ip:
 670                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 671                     subtitles = ie_result.get('subtitles') or {}
 672                     if 'no-live-chat' in self.get_param('compat_opts'):
 673                         for lang in ('live_chat', 'comments', 'danmaku'):
 674                             subtitles.pop(lang, None)
 675                     return ie_result
 676                 except GeoRestrictedError as e:
 677                     if self.__maybe_fake_ip_and_retry(e.countries):
 678                         continue
 679                     raise
 680         except UnsupportedError:
 681             raise
 682         except ExtractorError as e:
 683             kwargs = {
 684                 'video_id': e.video_id or self.get_temp_id(url),
 685                 'ie': self.IE_NAME,
 686                 'tb': e.traceback or sys.exc_info()[2],
 687                 'expected': e.expected,
 688                 'cause': e.cause
 689             }
 690             if hasattr(e, 'countries'):
 691                 kwargs['countries'] = e.countries
 692             raise type(e)(e.orig_msg, **kwargs)
 693         except http.client.IncompleteRead as e:
 694             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 695         except (KeyError, StopIteration) as e:
 696             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 697
 698     def __maybe_fake_ip_and_retry(self, countries):
 699         if (not self.get_param('geo_bypass_country', None)
 700                 and self._GEO_BYPASS
 701                 and self.get_param('geo_bypass', True)
 702                 and not self._x_forwarded_for_ip
 703                 and countries):
 704             country_code = random.choice(countries)
 705             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 706             if self._x_forwarded_for_ip:
 707                 self.report_warning(
 708                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 709                     % (self._x_forwarded_for_ip, country_code.upper()))
 710                 return True
 711         return False
 712
 713     def set_downloader(self, downloader):
 714         """Sets a YoutubeDL instance as the downloader for this IE."""
 715         self._downloader = downloader
 716
 717     @property
 718     def cache(self):
 719         return self._downloader.cache
 720
 721     @property
 722     def cookiejar(self):
 723         return self._downloader.cookiejar
 724
 725     def _initialize_pre_login(self):
 726         """ Initialization before login. Redefine in subclasses."""
 727         pass
 728
 729     def _perform_login(self, username, password):
 730         """ Login with username and password. Redefine in subclasses."""
 731         pass
 732
 733     def _real_initialize(self):
 734         """Real initialization process. Redefine in subclasses."""
 735         pass
 736
 737     def _real_extract(self, url):
 738         """Real extraction process. Redefine in subclasses."""
 739         raise NotImplementedError('This method must be implemented by subclasses')
 740
 741     @classmethod
 742     def ie_key(cls):
 743         """A string for getting the InfoExtractor with get_info_extractor"""
 744         return cls.__name__[:-2]
 745
 746     @classproperty
 747     def IE_NAME(cls):
 748         return cls.__name__[:-2]
 749
 750     @staticmethod
 751     def __can_accept_status_code(err, expected_status):
 752         assert isinstance(err, urllib.error.HTTPError)
 753         if expected_status is None:
 754             return False
 755         elif callable(expected_status):
 756             return expected_status(err.code) is True
 757         else:
 758             return err.code in variadic(expected_status)
 759
 760     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 761         if isinstance(url_or_request, urllib.request.Request):
 762             return update_Request(url_or_request, data=data, headers=headers, query=query)
 763         if query:
 764             url_or_request = update_url_query(url_or_request, query)
 765         return sanitized_Request(url_or_request, data, headers or {})
 766
 767     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 768         """
 769         Return the response handle.
 770
 771         See _download_webpage docstring for arguments specification.
 772         """
 773         if not self._downloader._first_webpage_request:
 774             sleep_interval = self.get_param('sleep_interval_requests') or 0
 775             if sleep_interval > 0:
 776                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 777                 time.sleep(sleep_interval)
 778         else:
 779             self._downloader._first_webpage_request = False
 780
 781         if note is None:
 782             self.report_download_webpage(video_id)
 783         elif note is not False:
 784             if video_id is None:
 785                 self.to_screen(str(note))
 786             else:
 787                 self.to_screen(f'{video_id}: {note}')
 788
 789         # Some sites check X-Forwarded-For HTTP header in order to figure out
 790         # the origin of the client behind proxy. This allows bypassing geo
 791         # restriction by faking this header's value to IP that belongs to some
 792         # geo unrestricted country. We will do so once we encounter any
 793         # geo restriction error.
 794         if self._x_forwarded_for_ip:
 795             headers = (headers or {}).copy()
 796             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 797
 798         try:
 799             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 800         except network_exceptions as err:
 801             if isinstance(err, urllib.error.HTTPError):
 802                 if self.__can_accept_status_code(err, expected_status):
 803                     # Retain reference to error to prevent file object from
 804                     # being closed before it can be read. Works around the
 805                     # effects of <https://bugs.python.org/issue15002>
 806                     # introduced in Python 3.4.1.
 807                     err.fp._error = err
 808                     return err.fp
 809
 810             if errnote is False:
 811                 return False
 812             if errnote is None:
 813                 errnote = 'Unable to download webpage'
 814
 815             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 816             if fatal:
 817                 raise ExtractorError(errmsg, cause=err)
 818             else:
 819                 self.report_warning(errmsg)
 820                 return False
 821
 822     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 823                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 824         """
 825         Return a tuple (page content as string, URL handle).
 826
 827         Arguments:
 828         url_or_request -- plain text URL as a string or
 829             a urllib.request.Request object
 830         video_id -- Video/playlist/item identifier (string)
 831
 832         Keyword arguments:
 833         note -- note printed before downloading (string)
 834         errnote -- note printed in case of an error (string)
 835         fatal -- flag denoting whether error should be considered fatal,
 836             i.e. whether it should cause ExtractionError to be raised,
 837             otherwise a warning will be reported and extraction continued
 838         encoding -- encoding for a page content decoding, guessed automatically
 839             when not explicitly specified
 840         data -- POST data (bytes)
 841         headers -- HTTP headers (dict)
 842         query -- URL query (dict)
 843         expected_status -- allows to accept failed HTTP requests (non 2xx
 844             status code) by explicitly specifying a set of accepted status
 845             codes. Can be any of the following entities:
 846                 - an integer type specifying an exact failed status code to
 847                   accept
 848                 - a list or a tuple of integer types specifying a list of
 849                   failed status codes to accept
 850                 - a callable accepting an actual failed status code and
 851                   returning True if it should be accepted
 852             Note that this argument does not affect success status codes (2xx)
 853             which are always accepted.
 854         """
 855
 856         # Strip hashes from the URL (#1038)
 857         if isinstance(url_or_request, str):
 858             url_or_request = url_or_request.partition('#')[0]
 859
 860         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 861         if urlh is False:
 862             assert not fatal
 863             return False
 864         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 865         return (content, urlh)
 866
 867     @staticmethod
 868     def _guess_encoding_from_content(content_type, webpage_bytes):
 869         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 870         if m:
 871             encoding = m.group(1)
 872         else:
 873             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 874                           webpage_bytes[:1024])
 875             if m:
 876                 encoding = m.group(1).decode('ascii')
 877             elif webpage_bytes.startswith(b'\xff\xfe'):
 878                 encoding = 'utf-16'
 879             else:
 880                 encoding = 'utf-8'
 881
 882         return encoding
 883
 884     def __check_blocked(self, content):
 885         first_block = content[:512]
 886         if ('<title>Access to this site is blocked</title>' in content
 887                 and 'Websense' in first_block):
 888             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 889             blocked_iframe = self._html_search_regex(
 890                 r'<iframe src="([^"]+)"', content,
 891                 'Websense information URL', default=None)
 892             if blocked_iframe:
 893                 msg += ' Visit %s for more details' % blocked_iframe
 894             raise ExtractorError(msg, expected=True)
 895         if '<title>The URL you requested has been blocked</title>' in first_block:
 896             msg = (
 897                 'Access to this webpage has been blocked by Indian censorship. '
 898                 'Use a VPN or proxy server (with --proxy) to route around it.')
 899             block_msg = self._html_search_regex(
 900                 r'</h1><p>(.*?)</p>',
 901                 content, 'block message', default=None)
 902             if block_msg:
 903                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 904             raise ExtractorError(msg, expected=True)
 905         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 906                 and 'blocklist.rkn.gov.ru' in content):
 907             raise ExtractorError(
 908                 'Access to this webpage has been blocked by decision of the Russian government. '
 909                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 910                 expected=True)
 911
 912     def _request_dump_filename(self, url, video_id):
 913         basen = f'{video_id}_{url}'
 914         trim_length = self.get_param('trim_file_name') or 240
 915         if len(basen) > trim_length:
 916             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 917             basen = basen[:trim_length - len(h)] + h
 918         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 919         # Working around MAX_PATH limitation on Windows (see
 920         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 921         if compat_os_name == 'nt':
 922             absfilepath = os.path.abspath(filename)
 923             if len(absfilepath) > 259:
 924                 filename = fR'\\?\{absfilepath}'
 925         return filename
 926
 927     def __decode_webpage(self, webpage_bytes, encoding, headers):
 928         if not encoding:
 929             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 930         try:
 931             return webpage_bytes.decode(encoding, 'replace')
 932         except LookupError:
 933             return webpage_bytes.decode('utf-8', 'replace')
 934
 935     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 936         webpage_bytes = urlh.read()
 937         if prefix is not None:
 938             webpage_bytes = prefix + webpage_bytes
 939         if self.get_param('dump_intermediate_pages', False):
 940             self.to_screen('Dumping request to ' + urlh.geturl())
 941             dump = base64.b64encode(webpage_bytes).decode('ascii')
 942             self._downloader.to_screen(dump)
 943         if self.get_param('write_pages'):
 944             filename = self._request_dump_filename(urlh.geturl(), video_id)
 945             self.to_screen(f'Saving request to {filename}')
 946             with open(filename, 'wb') as outf:
 947                 outf.write(webpage_bytes)
 948
 949         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 950         self.__check_blocked(content)
 951
 952         return content
 953
 954     def __print_error(self, errnote, fatal, video_id, err):
 955         if fatal:
 956             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 957         elif errnote:
 958             self.report_warning(f'{video_id}: {errnote}: {err}')
 959
 960     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 961         if transform_source:
 962             xml_string = transform_source(xml_string)
 963         try:
 964             return compat_etree_fromstring(xml_string.encode('utf-8'))
 965         except xml.etree.ElementTree.ParseError as ve:
 966             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 967
 968     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 969         try:
 970             return json.loads(
 971                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 972         except ValueError as ve:
 973             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 974
 975     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 976         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 977
    def __create_download_methods(name, parser, note, errnote, return_value):
        # Factory, called in the class body, for a pair of download methods:
        #   _download_<name>_handle  -> (parsed content, URL handle)
        #   _download_<name>         -> parsed content only
        #
        # name          Infix for the names of the generated methods
        # parser        Name of the method parsing the downloaded content,
        #               or None to return the content unparsed
        # note, errnote Default values of the respective arguments
        #               of the generated methods
        # return_value  Description of the parsed value, for the docstrings
        #
        # Returns the tuple (handle method, content method)

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            # errnote is keyword-only, so it is passed on to the parser
            # only in the one case below, when it is False
            if parser is None:
                return content
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            # A failed download (False) is passed through as-is
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, the content is taken from the dump file
            # of the request if it can be read; a read failure is only warned
            # about and the request is then made normally
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method called below is not given transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            # Drop the URL handle; a failed download (False) is passed through
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            # Give the generated function the name, qualified name and docstring
            # of a method of InfoExtractor
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        # NB: download_content looks up download_handle.__name__ at call time,
        # i.e. it sees the name assigned here
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1041
    # Generated download methods: a (handle variant, content variant) pair for
    # each of XML, JSON and socket JSON. For plain webpages only the content
    # variant is kept, under a private name; no parser is applied to it.
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1049
1050     def _download_webpage(
1051             self, url_or_request, video_id, note=None, errnote=None,
1052             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1053         """
1054         Return the data of the page as a string.
1055
1056         Keyword arguments:
1057         tries -- number of tries
1058         timeout -- sleep interval between tries
1059
1060         See _download_webpage_handle docstring for other arguments specification.
1061         """
1062
1063         R''' # NB: These are unused; should they be deprecated?
1064         if tries != 1:
1065             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1066         if timeout is NO_DEFAULT:
1067             timeout = 5
1068         else:
1069             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1070         '''
1071
1072         try_count = 0
1073         while True:
1074             try:
1075                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1076             except http.client.IncompleteRead as e:
1077                 try_count += 1
1078                 if try_count >= tries:
1079                     raise e
1080                 self._sleep(timeout, video_id)
1081
1082     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1083         idstr = format_field(video_id, None, '%s: ')
1084         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1085         if only_once:
1086             if f'WARNING: {msg}' in self._printed_messages:
1087                 return
1088             self._printed_messages.add(f'WARNING: {msg}')
1089         self._downloader.report_warning(msg, *args, **kwargs)
1090
1091     def to_screen(self, msg, *args, **kwargs):
1092         """Print msg to screen, prefixing it with '[ie_name]'"""
1093         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1094
1095     def write_debug(self, msg, *args, **kwargs):
1096         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1097
1098     def get_param(self, name, default=None, *args, **kwargs):
1099         if self._downloader:
1100             return self._downloader.params.get(name, default, *args, **kwargs)
1101         return default
1102
1103     def report_drm(self, video_id, partial=False):
1104         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1105
1106     def report_extraction(self, id_or_name):
1107         """Report information extraction."""
1108         self.to_screen('%s: Extracting information' % id_or_name)
1109
1110     def report_download_webpage(self, video_id):
1111         """Report webpage download."""
1112         self.to_screen('%s: Downloading webpage' % video_id)
1113
1114     def report_age_confirmation(self):
1115         """Report attempt to confirm age."""
1116         self.to_screen('Confirming age')
1117
1118     def report_login(self):
1119         """Report attempt to log in."""
1120         self.to_screen('Logging in')
1121
1122     def raise_login_required(
1123             self, msg='This video is only available for registered users',
1124             metadata_available=False, method=NO_DEFAULT):
1125         if metadata_available and (
1126                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1127             self.report_warning(msg)
1128             return
1129         msg += format_field(self._login_hint(method), None, '. %s')
1130         raise ExtractorError(msg, expected=True)
1131
1132     def raise_geo_restricted(
1133             self, msg='This video is not available from your location due to geo restriction',
1134             countries=None, metadata_available=False):
1135         if metadata_available and (
1136                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1137             self.report_warning(msg)
1138         else:
1139             raise GeoRestrictedError(msg, countries=countries)
1140
1141     def raise_no_formats(self, msg, expected=False, video_id=None):
1142         if expected and (
1143                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1144             self.report_warning(msg, video_id)
1145         elif isinstance(msg, ExtractorError):
1146             raise msg
1147         else:
1148             raise ExtractorError(msg, expected=expected, video_id=video_id)
1149
1150     # Methods for following #608
1151     @staticmethod
1152     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1153         """Returns a URL that points to a page that should be processed"""
1154         if ie is not None:
1155             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1156         if video_id is not None:
1157             kwargs['id'] = video_id
1158         if video_title is not None:
1159             kwargs['title'] = video_title
1160         return {
1161             **kwargs,
1162             '_type': 'url_transparent' if url_transparent else 'url',
1163             'url': url,
1164         }
1165
1166     @classmethod
1167     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1168                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1169         return cls.playlist_result(
1170             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1171             playlist_id, playlist_title, **kwargs)
1172
1173     @staticmethod
1174     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1175         """Returns a playlist"""
1176         if playlist_id:
1177             kwargs['id'] = playlist_id
1178         if playlist_title:
1179             kwargs['title'] = playlist_title
1180         if playlist_description is not None:
1181             kwargs['description'] = playlist_description
1182         return {
1183             **kwargs,
1184             '_type': 'multi_video' if multi_video else 'playlist',
1185             'entries': entries,
1186         }
1187
1188     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1189         """
1190         Perform a regex search on the given string, using a single or a list of
1191         patterns returning the first matching group.
1192         In case of failure return a default value or raise a WARNING or a
1193         RegexNotFoundError, depending on fatal, specifying the field name.
1194         """
1195         if string is None:
1196             mobj = None
1197         elif isinstance(pattern, (str, re.Pattern)):
1198             mobj = re.search(pattern, string, flags)
1199         else:
1200             for p in pattern:
1201                 mobj = re.search(p, string, flags)
1202                 if mobj:
1203                     break
1204
1205         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1206
1207         if mobj:
1208             if group is None:
1209                 # return the first matching group
1210                 return next(g for g in mobj.groups() if g is not None)
1211             elif isinstance(group, (list, tuple)):
1212                 return tuple(mobj.group(g) for g in group)
1213             else:
1214                 return mobj.group(group)
1215         elif default is not NO_DEFAULT:
1216             return default
1217         elif fatal:
1218             raise RegexNotFoundError('Unable to extract %s' % _name)
1219         else:
1220             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1221             return None
1222
1223     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1224                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1225         """Searches string for the JSON object specified by start_pattern"""
1226         # NB: end_pattern is only used to reduce the size of the initial match
1227         if default is NO_DEFAULT:
1228             default, has_default = {}, False
1229         else:
1230             fatal, has_default = False, True
1231
1232         json_string = self._search_regex(
1233             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1234             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1235         if not json_string:
1236             return default
1237
1238         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1239         try:
1240             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1241         except ExtractorError as e:
1242             if fatal:
1243                 raise ExtractorError(
1244                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1245             elif not has_default:
1246                 self.report_warning(
1247                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1248         return default
1249
1250     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1251         """
1252         Like _search_regex, but strips HTML tags and unescapes entities.
1253         """
1254         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1255         if res:
1256             return clean_html(res).strip()
1257         else:
1258             return res
1259
1260     def _get_netrc_login_info(self, netrc_machine=None):
1261         username = None
1262         password = None
1263         netrc_machine = netrc_machine or self._NETRC_MACHINE
1264
1265         if self.get_param('usenetrc', False):
1266             try:
1267                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1268                 if os.path.isdir(netrc_file):
1269                     netrc_file = os.path.join(netrc_file, '.netrc')
1270                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1271                 if info is not None:
1272                     username = info[0]
1273                     password = info[2]
1274                 else:
1275                     raise netrc.NetrcParseError(
1276                         'No authenticators for %s' % netrc_machine)
1277             except (OSError, netrc.NetrcParseError) as err:
1278                 self.report_warning(
1279                     'parsing .netrc: %s' % error_to_compat_str(err))
1280
1281         return username, password
1282
1283     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1284         """
1285         Get the login info as (username, password)
1286         First look for the manually specified credentials using username_option
1287         and password_option as keys in params dictionary. If no such credentials
1288         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1289         value.
1290         If there's no info available, return (None, None)
1291         """
1292
1293         # Attempt to use provided username and password or .netrc data
1294         username = self.get_param(username_option)
1295         if username is not None:
1296             password = self.get_param(password_option)
1297         else:
1298             username, password = self._get_netrc_login_info(netrc_machine)
1299
1300         return username, password
1301
1302     def _get_tfa_info(self, note='two-factor verification code'):
1303         """
1304         Get the two-factor authentication info
1305         TODO - asking the user will be required for sms/phone verify
1306         currently just uses the command line option
1307         If there's no info available, return None
1308         """
1309
1310         tfa = self.get_param('twofactor')
1311         if tfa is not None:
1312             return tfa
1313
1314         return getpass.getpass('Type %s and press [Return]: ' % note)
1315
1316     # Helper functions for extracting OpenGraph info
1317     @staticmethod
1318     def _og_regexes(prop):
1319         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1320         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1321                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1322         template = r'<meta[^>]+?%s[^>]+?%s'
1323         return [
1324             template % (property_re, content_re),
1325             template % (content_re, property_re),
1326         ]
1327
1328     @staticmethod
1329     def _meta_regex(prop):
1330         return r'''(?isx)<meta
1331                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1332                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1333
1334     def _og_search_property(self, prop, html, name=None, **kargs):
1335         prop = variadic(prop)
1336         if name is None:
1337             name = 'OpenGraph %s' % prop[0]
1338         og_regexes = []
1339         for p in prop:
1340             og_regexes.extend(self._og_regexes(p))
1341         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1342         if escaped is None:
1343             return None
1344         return unescapeHTML(escaped)
1345
1346     def _og_search_thumbnail(self, html, **kargs):
1347         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1348
1349     def _og_search_description(self, html, **kargs):
1350         return self._og_search_property('description', html, fatal=False, **kargs)
1351
1352     def _og_search_title(self, html, *, fatal=False, **kargs):
1353         return self._og_search_property('title', html, fatal=fatal, **kargs)
1354
1355     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1356         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1357         if secure:
1358             regexes = self._og_regexes('video:secure_url') + regexes
1359         return self._html_search_regex(regexes, html, name, **kargs)
1360
1361     def _og_search_url(self, html, **kargs):
1362         return self._og_search_property('url', html, **kargs)
1363
1364     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1365         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1366
1367     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1368         name = variadic(name)
1369         if display_name is None:
1370             display_name = name[0]
1371         return self._html_search_regex(
1372             [self._meta_regex(n) for n in name],
1373             html, display_name, fatal=fatal, group='content', **kwargs)
1374
1375     def _dc_search_uploader(self, html):
1376         return self._html_search_meta('dc.creator', html, 'uploader')
1377
1378     @staticmethod
1379     def _rta_search(html):
1380         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1381         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1382                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1383                      html):
1384             return 18
1385
1386         # And then there are the jokers who advertise that they use RTA, but actually don't.
1387         AGE_LIMIT_MARKERS = [
1388             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1389         ]
1390         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1391             return 18
1392         return 0
1393
1394     def _media_rating_search(self, html):
1395         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1396         rating = self._html_search_meta('rating', html)
1397
1398         if not rating:
1399             return None
1400
1401         RATING_TABLE = {
1402             'safe for kids': 0,
1403             'general': 8,
1404             '14 years': 14,
1405             'mature': 17,
1406             'restricted': 19,
1407         }
1408         return RATING_TABLE.get(rating.lower())
1409
1410     def _family_friendly_search(self, html):
1411         # See http://schema.org/VideoObject
1412         family_friendly = self._html_search_meta(
1413             'isFamilyFriendly', html, default=None)
1414
1415         if not family_friendly:
1416             return None
1417
1418         RATING_TABLE = {
1419             '1': 0,
1420             'true': 0,
1421             '0': 18,
1422             'false': 18,
1423         }
1424         return RATING_TABLE.get(family_friendly.lower())
1425
1426     def _twitter_search_player(self, html):
1427         return self._html_search_meta('twitter:player', html,
1428                                       'twitter card player')
1429
1430     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1431         """Yield all json ld objects in the html"""
1432         if default is not NO_DEFAULT:
1433             fatal = False
1434         for mobj in re.finditer(JSON_LD_RE, html):
1435             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1436             for json_ld in variadic(json_ld_item):
1437                 if isinstance(json_ld, dict):
1438                     yield json_ld
1439
1440     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1441         """Search for a video in any json ld in the html"""
1442         if default is not NO_DEFAULT:
1443             fatal = False
1444         info = self._json_ld(
1445             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1446             video_id, fatal=fatal, expected_type=expected_type)
1447         if info:
1448             return info
1449         if default is not NO_DEFAULT:
1450             return default
1451         elif fatal:
1452             raise RegexNotFoundError('Unable to extract JSON-LD')
1453         else:
1454             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1455             return {}
1456
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Extract an info dict from JSON-LD data.

        json_ld may be a JSON string (parsed with _parse_json), a single
        JSON-LD object (dict) or a list/tuple of such objects. Empty input and
        any other type yield an empty dict.
        If expected_type is given, only entries whose '@type' contains it are
        considered and processing stops after the first such entry.
        The collected fields are passed through filter_dict before being
        returned.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator; the nested helpers below fill it in place
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps the interaction type (last '/'-separated component) to the
        # prefix of the '<kind>_count' field it is stored in
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            """Whether the '@type' of e (normalised with variadic) contains any of expected_types"""
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            """Return the 'interactionType' of e; a dict value is reduced to its '@type'"""
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            """Store the InteractionCounter entries of e as '<kind>_count' fields of info"""
            interaction_statistic = e.get('interactionStatistic')
            # A single counter may be given directly instead of a list
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the last path component is looked up, so the type may
                # also be given in URL form
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # The first counter found for a kind wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            """Store the 'Clip' parts of e as info['chapters'], or nothing if their data is incomplete"""
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk every chapter except the last one (the zip is bounded by
            # chapters[1:]) together with its neighbours; the first chapter is
            # paired with a dummy predecessor that ends at 0
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                # Fill missing boundaries from the adjacent chapters
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                # Any remaining None (including a missing title) aborts chapter extraction
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # info['duration'] has been set by extract_video_object before this is called
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            """Update info with the fields of the VideoObject e, including its counts and chapters"""
            assert is_type(e, 'VideoObject')
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            """Process the entries of json_ld in order, descending once into a top-level '@graph'"""
            for e in json_ld:
                # Top-level entries must carry an '@context'
                if at_top_level and '@context' not in e:
                    continue
                # An entry consisting only of '@context' and '@graph' is a
                # container: its graph is processed instead and the remaining
                # top-level entries are not looked at
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title if none was found yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Use the first embedded video, looking at 'video' before 'subjectOf'
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject'):
                    extract_video_object(e)
                    # Without an expected type all entries are merged into info;
                    # with one, the first matching entry ends the traversal
                    if expected_type is None:
                        continue
                    else:
                        break
                # Entries that are not a VideoObject themselves may embed one under 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        return filter_dict(info)
1621
1622     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1623         return self._parse_json(
1624             self._search_regex(
1625                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1626                 webpage, 'next.js data', fatal=fatal, **kw),
1627             video_id, transform_source=transform_source, fatal=fatal)
1628
1629     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1630         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1631         rectx = re.escape(context_name)
1632         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1633         js, arg_keys, arg_vals = self._search_regex(
1634             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1635             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1636
1637         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1638
1639         for key, val in args.items():
1640             if val in ('undefined', 'void 0'):
1641                 args[key] = 'null'
1642
1643         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1644         return traverse_obj(ret, traverse) or {}
1645
1646     @staticmethod
1647     def _hidden_inputs(html):
1648         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1649         hidden_inputs = {}
1650         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1651             attrs = extract_attributes(input)
1652             if not input:
1653                 continue
1654             if attrs.get('type') not in ('hidden', 'submit'):
1655                 continue
1656             name = attrs.get('name') or attrs.get('id')
1657             value = attrs.get('value')
1658             if name and value is not None:
1659                 hidden_inputs[name] = value
1660         return hidden_inputs
1661
1662     def _form_hidden_inputs(self, form_id, html):
1663         form = self._search_regex(
1664             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1665             html, '%s form' % form_id, group='form')
1666         return self._hidden_inputs(form)
1667
1668     class FormatSort:
1669         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1670
1671         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1672                    'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
1673                    'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1674         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1675                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1676                         'fps', 'fs_approx', 'source', 'id')
1677
1678         settings = {
1679             'vcodec': {'type': 'ordered', 'regex': True,
1680                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1681             'acodec': {'type': 'ordered', 'regex': True,
1682                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1683             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1684                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1685             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1686                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1687             'vext': {'type': 'ordered', 'field': 'video_ext',
1688                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1689                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1690             'aext': {'type': 'ordered', 'field': 'audio_ext',
1691                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1692                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1693             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1694             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1695                            'field': ('vcodec', 'acodec'),
1696                            'function': lambda it: int(any(v != 'none' for v in it))},
1697             'ie_pref': {'priority': True, 'type': 'extractor'},
1698             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1699             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1700             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1701             'quality': {'convert': 'float', 'default': -1},
1702             'filesize': {'convert': 'bytes'},
1703             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1704             'id': {'convert': 'string', 'field': 'format_id'},
1705             'height': {'convert': 'float_none'},
1706             'width': {'convert': 'float_none'},
1707             'fps': {'convert': 'float_none'},
1708             'channels': {'convert': 'float_none', 'field': 'audio_channels'},
1709             'tbr': {'convert': 'float_none'},
1710             'vbr': {'convert': 'float_none'},
1711             'abr': {'convert': 'float_none'},
1712             'asr': {'convert': 'float_none'},
1713             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1714
1715             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1716             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1717             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1718             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1719             'res': {'type': 'multiple', 'field': ('height', 'width'),
1720                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1721
1722             # Actual field names
1723             'format_id': {'type': 'alias', 'field': 'id'},
1724             'preference': {'type': 'alias', 'field': 'ie_pref'},
1725             'language_preference': {'type': 'alias', 'field': 'lang'},
1726             'source_preference': {'type': 'alias', 'field': 'source'},
1727             'protocol': {'type': 'alias', 'field': 'proto'},
1728             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1729             'audio_channels': {'type': 'alias', 'field': 'channels'},
1730
1731             # Deprecated
1732             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1733             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1734             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1735             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1736             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1737             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1738             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1739             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1740             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1741             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1742             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1743             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1744             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1745             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1746             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1747             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1748             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1749             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1750             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1751             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1752         }
1753
1754         def __init__(self, ie, field_preference):
1755             self._order = []
1756             self.ydl = ie._downloader
1757             self.evaluate_params(self.ydl.params, field_preference)
1758             if ie.get_param('verbose'):
1759                 self.print_verbose_info(self.ydl.write_debug)
1760
1761         def _get_field_setting(self, field, key):
1762             if field not in self.settings:
1763                 if key in ('forced', 'priority'):
1764                     return False
1765                 self.ydl.deprecation_warning(
1766                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1767                     'and may be removed in a future version')
1768                 self.settings[field] = {}
1769             propObj = self.settings[field]
1770             if key not in propObj:
1771                 type = propObj.get('type')
1772                 if key == 'field':
1773                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1774                 elif key == 'convert':
1775                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1776                 else:
1777                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1778                 propObj[key] = default
1779             return propObj[key]
1780
1781         def _resolve_field_value(self, field, value, convertNone=False):
1782             if value is None:
1783                 if not convertNone:
1784                     return None
1785             else:
1786                 value = value.lower()
1787             conversion = self._get_field_setting(field, 'convert')
1788             if conversion == 'ignore':
1789                 return None
1790             if conversion == 'string':
1791                 return value
1792             elif conversion == 'float_none':
1793                 return float_or_none(value)
1794             elif conversion == 'bytes':
1795                 return FileDownloader.parse_bytes(value)
1796             elif conversion == 'order':
1797                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1798                 use_regex = self._get_field_setting(field, 'regex')
1799                 list_length = len(order_list)
1800                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1801                 if use_regex and value is not None:
1802                     for i, regex in enumerate(order_list):
1803                         if regex and re.match(regex, value):
1804                             return list_length - i
1805                     return list_length - empty_pos  # not in list
1806                 else:  # not regex or  value = None
1807                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1808             else:
1809                 if value.isnumeric():
1810                     return float(value)
1811                 else:
1812                     self.settings[field]['convert'] = 'string'
1813                     return value
1814
1815         def evaluate_params(self, params, sort_extractor):
1816             self._use_free_order = params.get('prefer_free_formats', False)
1817             self._sort_user = params.get('format_sort', [])
1818             self._sort_extractor = sort_extractor
1819
1820             def add_item(field, reverse, closest, limit_text):
1821                 field = field.lower()
1822                 if field in self._order:
1823                     return
1824                 self._order.append(field)
1825                 limit = self._resolve_field_value(field, limit_text)
1826                 data = {
1827                     'reverse': reverse,
1828                     'closest': False if limit is None else closest,
1829                     'limit_text': limit_text,
1830                     'limit': limit}
1831                 if field in self.settings:
1832                     self.settings[field].update(data)
1833                 else:
1834                     self.settings[field] = data
1835
1836             sort_list = (
1837                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1838                 + (tuple() if params.get('format_sort_force', False)
1839                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1840                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1841
1842             for item in sort_list:
1843                 match = re.match(self.regex, item)
1844                 if match is None:
1845                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1846                 field = match.group('field')
1847                 if field is None:
1848                     continue
1849                 if self._get_field_setting(field, 'type') == 'alias':
1850                     alias, field = field, self._get_field_setting(field, 'field')
1851                     if self._get_field_setting(alias, 'deprecated'):
1852                         self.ydl.deprecation_warning(
1853                             f'Format sorting alias {alias} is deprecated '
1854                             f'and may be removed in a future version. Please use {field} instead')
1855                 reverse = match.group('reverse') is not None
1856                 closest = match.group('separator') == '~'
1857                 limit_text = match.group('limit')
1858
1859                 has_limit = limit_text is not None
1860                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1861                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1862
1863                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1864                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1865                 limit_count = len(limits)
1866                 for (i, f) in enumerate(fields):
1867                     add_item(f, reverse, closest,
1868                              limits[i] if i < limit_count
1869                              else limits[0] if has_limit and not has_multiple_limits
1870                              else None)
1871
1872         def print_verbose_info(self, write_debug):
1873             if self._sort_user:
1874                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1875             if self._sort_extractor:
1876                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1877             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1878                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1879                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1880                               self._get_field_setting(field, 'limit_text'),
1881                               self._get_field_setting(field, 'limit'))
1882                 if self._get_field_setting(field, 'limit_text') is not None else '')
1883                 for field in self._order if self._get_field_setting(field, 'visible')]))
1884
1885         def _calculate_field_preference_from_value(self, format, field, type, value):
1886             reverse = self._get_field_setting(field, 'reverse')
1887             closest = self._get_field_setting(field, 'closest')
1888             limit = self._get_field_setting(field, 'limit')
1889
1890             if type == 'extractor':
1891                 maximum = self._get_field_setting(field, 'max')
1892                 if value is None or (maximum is not None and value >= maximum):
1893                     value = -1
1894             elif type == 'boolean':
1895                 in_list = self._get_field_setting(field, 'in_list')
1896                 not_in_list = self._get_field_setting(field, 'not_in_list')
1897                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1898             elif type == 'ordered':
1899                 value = self._resolve_field_value(field, value, True)
1900
1901             # try to convert to number
1902             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1903             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1904             if is_num:
1905                 value = val_num
1906
1907             return ((-10, 0) if value is None
1908                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1909                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1910                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1911                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1912                     else (-1, value, 0))
1913
1914         def _calculate_field_preference(self, format, field):
1915             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1916             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1917             if type == 'multiple':
1918                 type = 'field'  # Only 'field' is allowed in multiple for now
1919                 actual_fields = self._get_field_setting(field, 'field')
1920
1921                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1922             else:
1923                 value = get_value(field)
1924             return self._calculate_field_preference_from_value(format, field, type, value)
1925
1926         def calculate_preference(self, format):
1927             # Determine missing protocol
1928             if not format.get('protocol'):
1929                 format['protocol'] = determine_protocol(format)
1930
1931             # Determine missing ext
1932             if not format.get('ext') and 'url' in format:
1933                 format['ext'] = determine_ext(format['url'])
1934             if format.get('vcodec') == 'none':
1935                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1936                 format['video_ext'] = 'none'
1937             else:
1938                 format['video_ext'] = format['ext']
1939                 format['audio_ext'] = 'none'
1940             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1941             #    format['preference'] = -1000
1942
1943             # Determine missing bitrates
1944             if format.get('tbr') is None:
1945                 if format.get('vbr') is not None and format.get('abr') is not None:
1946                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1947             else:
1948                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1949                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1950                 if format.get('acodec') != 'none' and format.get('abr') is None:
1951                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1952
1953             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1954
1955     def _sort_formats(self, formats, field_preference=[]):
1956         if not formats:
1957             return
1958         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1959
1960     def _check_formats(self, formats, video_id):
1961         if formats:
1962             formats[:] = filter(
1963                 lambda f: self._is_valid_url(
1964                     f['url'], video_id,
1965                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1966                 formats)
1967
1968     @staticmethod
1969     def _remove_duplicate_formats(formats):
1970         format_urls = set()
1971         unique_formats = []
1972         for f in formats:
1973             if f['url'] not in format_urls:
1974                 format_urls.add(f['url'])
1975                 unique_formats.append(f)
1976         formats[:] = unique_formats
1977
1978     def _is_valid_url(self, url, video_id, item='video', headers={}):
1979         url = self._proto_relative_url(url, scheme='http:')
1980         # For now assume non HTTP(S) URLs always valid
1981         if not (url.startswith('http://') or url.startswith('https://')):
1982             return True
1983         try:
1984             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1985             return True
1986         except ExtractorError as e:
1987             self.to_screen(
1988                 '%s: %s URL is invalid, skipping: %s'
1989                 % (video_id, item, error_to_compat_str(e.cause)))
1990             return False
1991
1992     def http_scheme(self):
1993         """ Either "http:" or "https:", depending on the user's preferences """
1994         return (
1995             'http:'
1996             if self.get_param('prefer_insecure', False)
1997             else 'https:')
1998
1999     def _proto_relative_url(self, url, scheme=None):
2000         scheme = scheme or self.http_scheme()
2001         assert scheme.endswith(':')
2002         return sanitize_url(url, scheme=scheme[:-1])
2003
2004     def _sleep(self, timeout, video_id, msg_template=None):
2005         if msg_template is None:
2006             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2007         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2008         self.to_screen(msg)
2009         time.sleep(timeout)
2010
2011     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2012                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2013                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2014         res = self._download_xml_handle(
2015             manifest_url, video_id, 'Downloading f4m manifest',
2016             'Unable to download f4m manifest',
2017             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2018             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2019             transform_source=transform_source,
2020             fatal=fatal, data=data, headers=headers, query=query)
2021         if res is False:
2022             return []
2023
2024         manifest, urlh = res
2025         manifest_url = urlh.geturl()
2026
2027         return self._parse_f4m_formats(
2028             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2029             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2030
2031     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2032                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2033                            fatal=True, m3u8_id=None):
2034         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2035             return []
2036
2037         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2038         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2039         if akamai_pv is not None and ';' in akamai_pv.text:
2040             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2041             if playerVerificationChallenge.strip() != '':
2042                 return []
2043
2044         formats = []
2045         manifest_version = '1.0'
2046         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2047         if not media_nodes:
2048             manifest_version = '2.0'
2049             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2050         # Remove unsupported DRM protected media from final formats
2051         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2052         media_nodes = remove_encrypted_media(media_nodes)
2053         if not media_nodes:
2054             return formats
2055
2056         manifest_base_url = get_base_url(manifest)
2057
2058         bootstrap_info = xpath_element(
2059             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2060             'bootstrap info', default=None)
2061
2062         vcodec = None
2063         mime_type = xpath_text(
2064             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2065             'base URL', default=None)
2066         if mime_type and mime_type.startswith('audio/'):
2067             vcodec = 'none'
2068
2069         for i, media_el in enumerate(media_nodes):
2070             tbr = int_or_none(media_el.attrib.get('bitrate'))
2071             width = int_or_none(media_el.attrib.get('width'))
2072             height = int_or_none(media_el.attrib.get('height'))
2073             format_id = join_nonempty(f4m_id, tbr or i)
2074             # If <bootstrapInfo> is present, the specified f4m is a
2075             # stream-level manifest, and only set-level manifests may refer to
2076             # external resources.  See section 11.4 and section 4 of F4M spec
2077             if bootstrap_info is None:
2078                 media_url = None
2079                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2080                 if manifest_version == '2.0':
2081                     media_url = media_el.attrib.get('href')
2082                 if media_url is None:
2083                     media_url = media_el.attrib.get('url')
2084                 if not media_url:
2085                     continue
2086                 manifest_url = (
2087                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2088                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2089                 # If media_url is itself a f4m manifest do the recursive extraction
2090                 # since bitrates in parent manifest (this one) and media_url manifest
2091                 # may differ leading to inability to resolve the format by requested
2092                 # bitrate in f4m downloader
2093                 ext = determine_ext(manifest_url)
2094                 if ext == 'f4m':
2095                     f4m_formats = self._extract_f4m_formats(
2096                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2097                         transform_source=transform_source, fatal=fatal)
2098                     # Sometimes stream-level manifest contains single media entry that
2099                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2100                     # At the same time parent's media entry in set-level manifest may
2101                     # contain it. We will copy it from parent in such cases.
2102                     if len(f4m_formats) == 1:
2103                         f = f4m_formats[0]
2104                         f.update({
2105                             'tbr': f.get('tbr') or tbr,
2106                             'width': f.get('width') or width,
2107                             'height': f.get('height') or height,
2108                             'format_id': f.get('format_id') if not tbr else format_id,
2109                             'vcodec': vcodec,
2110                         })
2111                     formats.extend(f4m_formats)
2112                     continue
2113                 elif ext == 'm3u8':
2114                     formats.extend(self._extract_m3u8_formats(
2115                         manifest_url, video_id, 'mp4', preference=preference,
2116                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2117                     continue
2118             formats.append({
2119                 'format_id': format_id,
2120                 'url': manifest_url,
2121                 'manifest_url': manifest_url,
2122                 'ext': 'flv' if bootstrap_info is not None else None,
2123                 'protocol': 'f4m',
2124                 'tbr': tbr,
2125                 'width': width,
2126                 'height': height,
2127                 'vcodec': vcodec,
2128                 'preference': preference,
2129                 'quality': quality,
2130             })
2131         return formats
2132
2133     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2134         return {
2135             'format_id': join_nonempty(m3u8_id, 'meta'),
2136             'url': m3u8_url,
2137             'ext': ext,
2138             'protocol': 'm3u8',
2139             'preference': preference - 100 if preference else -100,
2140             'quality': quality,
2141             'resolution': 'multiple',
2142             'format_note': 'Quality selection URL',
2143         }
2144
2145     def _report_ignoring_subs(self, name):
2146         self.report_warning(bug_reports_message(
2147             f'Ignoring subtitle tracks found in the {name} manifest; '
2148             'if any subtitle tracks are missing,'
2149         ), only_once=True)
2150
2151     def _extract_m3u8_formats(self, *args, **kwargs):
2152         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2153         if subs:
2154             self._report_ignoring_subs('HLS')
2155         return fmts
2156
2157     def _extract_m3u8_formats_and_subtitles(
2158             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2159             preference=None, quality=None, m3u8_id=None, note=None,
2160             errnote=None, fatal=True, live=False, data=None, headers={},
2161             query={}):
2162
2163         res = self._download_webpage_handle(
2164             m3u8_url, video_id,
2165             note='Downloading m3u8 information' if note is None else note,
2166             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2167             fatal=fatal, data=data, headers=headers, query=query)
2168
2169         if res is False:
2170             return [], {}
2171
2172         m3u8_doc, urlh = res
2173         m3u8_url = urlh.geturl()
2174
2175         return self._parse_m3u8_formats_and_subtitles(
2176             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2177             preference=preference, quality=quality, m3u8_id=m3u8_id,
2178             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2179             headers=headers, query=query, video_id=video_id)
2180
2181     def _parse_m3u8_formats_and_subtitles(
2182             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2183             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2184             errnote=None, fatal=True, data=None, headers={}, query={},
2185             video_id=None):
2186         formats, subtitles = [], {}
2187
2188         has_drm = re.search('|'.join([
2189             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2190             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2191         ]), m3u8_doc)
2192
2193         def format_url(url):
2194             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2195
2196         if self.get_param('hls_split_discontinuity', False):
2197             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2198                 if not m3u8_doc:
2199                     if not manifest_url:
2200                         return []
2201                     m3u8_doc = self._download_webpage(
2202                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2203                         note=False, errnote='Failed to download m3u8 playlist information')
2204                     if m3u8_doc is False:
2205                         return []
2206                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2207
2208         else:
2209             def _extract_m3u8_playlist_indices(*args, **kwargs):
2210                 return [None]
2211
2212         # References:
2213         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2214         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2215         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2216
2217         # We should try extracting formats only from master playlists [1, 4.3.4],
2218         # i.e. playlists that describe available qualities. On the other hand
2219         # media playlists [1, 4.3.3] should be returned as is since they contain
2220         # just the media without qualities renditions.
2221         # Fortunately, master playlist can be easily distinguished from media
2222         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2223         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2224         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2225         # media playlist and MUST NOT appear in master playlist thus we can
2226         # clearly detect media playlist with this criterion.
2227
2228         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2229             formats = [{
2230                 'format_id': join_nonempty(m3u8_id, idx),
2231                 'format_index': idx,
2232                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2233                 'ext': ext,
2234                 'protocol': entry_protocol,
2235                 'preference': preference,
2236                 'quality': quality,
2237                 'has_drm': has_drm,
2238             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2239
2240             return formats, subtitles
2241
2242         groups = {}
2243         last_stream_inf = {}
2244
2245         def extract_media(x_media_line):
2246             media = parse_m3u8_attributes(x_media_line)
2247             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2248             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2249             if not (media_type and group_id and name):
2250                 return
2251             groups.setdefault(group_id, []).append(media)
2252             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2253             if media_type == 'SUBTITLES':
2254                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2255                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2256                 # However, lack of URI has been spotted in the wild.
2257                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2258                 if not media.get('URI'):
2259                     return
2260                 url = format_url(media['URI'])
2261                 sub_info = {
2262                     'url': url,
2263                     'ext': determine_ext(url),
2264                 }
2265                 if sub_info['ext'] == 'm3u8':
2266                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2267                     # files may contain is WebVTT:
2268                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2269                     sub_info['ext'] = 'vtt'
2270                     sub_info['protocol'] = 'm3u8_native'
2271                 lang = media.get('LANGUAGE') or 'und'
2272                 subtitles.setdefault(lang, []).append(sub_info)
2273             if media_type not in ('VIDEO', 'AUDIO'):
2274                 return
2275             media_url = media.get('URI')
2276             if media_url:
2277                 manifest_url = format_url(media_url)
2278                 formats.extend({
2279                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2280                     'format_note': name,
2281                     'format_index': idx,
2282                     'url': manifest_url,
2283                     'manifest_url': m3u8_url,
2284                     'language': media.get('LANGUAGE'),
2285                     'ext': ext,
2286                     'protocol': entry_protocol,
2287                     'preference': preference,
2288                     'quality': quality,
2289                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2290                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2291
2292         def build_stream_name():
2293             # Despite specification does not mention NAME attribute for
2294             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2295             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2296             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2297             stream_name = last_stream_inf.get('NAME')
2298             if stream_name:
2299                 return stream_name
2300             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2301             # from corresponding rendition group
2302             stream_group_id = last_stream_inf.get('VIDEO')
2303             if not stream_group_id:
2304                 return
2305             stream_group = groups.get(stream_group_id)
2306             if not stream_group:
2307                 return stream_group_id
2308             rendition = stream_group[0]
2309             return rendition.get('NAME') or stream_group_id
2310
2311         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2312         # chance to detect video only formats when EXT-X-STREAM-INF tags
2313         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2314         for line in m3u8_doc.splitlines():
2315             if line.startswith('#EXT-X-MEDIA:'):
2316                 extract_media(line)
2317
2318         for line in m3u8_doc.splitlines():
2319             if line.startswith('#EXT-X-STREAM-INF:'):
2320                 last_stream_inf = parse_m3u8_attributes(line)
2321             elif line.startswith('#') or not line.strip():
2322                 continue
2323             else:
2324                 tbr = float_or_none(
2325                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2326                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2327                 manifest_url = format_url(line.strip())
2328
2329                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2330                     format_id = [m3u8_id, None, idx]
2331                     # Bandwidth of live streams may differ over time thus making
2332                     # format_id unpredictable. So it's better to keep provided
2333                     # format_id intact.
2334                     if not live:
2335                         stream_name = build_stream_name()
2336                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2337                     f = {
2338                         'format_id': join_nonempty(*format_id),
2339                         'format_index': idx,
2340                         'url': manifest_url,
2341                         'manifest_url': m3u8_url,
2342                         'tbr': tbr,
2343                         'ext': ext,
2344                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2345                         'protocol': entry_protocol,
2346                         'preference': preference,
2347                         'quality': quality,
2348                     }
2349                     resolution = last_stream_inf.get('RESOLUTION')
2350                     if resolution:
2351                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2352                         if mobj:
2353                             f['width'] = int(mobj.group('width'))
2354                             f['height'] = int(mobj.group('height'))
2355                     # Unified Streaming Platform
2356                     mobj = re.search(
2357                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2358                     if mobj:
2359                         abr, vbr = mobj.groups()
2360                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2361                         f.update({
2362                             'vbr': vbr,
2363                             'abr': abr,
2364                         })
2365                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2366                     f.update(codecs)
2367                     audio_group_id = last_stream_inf.get('AUDIO')
2368                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2369                     # references a rendition group MUST have a CODECS attribute.
2370                     # However, this is not always respected. E.g. [2]
2371                     # contains EXT-X-STREAM-INF tag which references AUDIO
2372                     # rendition group but does not have CODECS and despite
2373                     # referencing an audio group it represents a complete
2374                     # (with audio and video) format. So, for such cases we will
2375                     # ignore references to rendition groups and treat them
2376                     # as complete formats.
2377                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2378                         audio_group = groups.get(audio_group_id)
2379                         if audio_group and audio_group[0].get('URI'):
2380                             # TODO: update acodec for audio only formats with
2381                             # the same GROUP-ID
2382                             f['acodec'] = 'none'
2383                     if not f.get('ext'):
2384                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2385                     formats.append(f)
2386
2387                     # for DailyMotion
2388                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2389                     if progressive_uri:
2390                         http_f = f.copy()
2391                         del http_f['manifest_url']
2392                         http_f.update({
2393                             'format_id': f['format_id'].replace('hls-', 'http-'),
2394                             'protocol': 'http',
2395                             'url': progressive_uri,
2396                         })
2397                         formats.append(http_f)
2398
2399                 last_stream_inf = {}
2400         return formats, subtitles
2401
2402     def _extract_m3u8_vod_duration(
2403             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2404
2405         m3u8_vod = self._download_webpage(
2406             m3u8_vod_url, video_id,
2407             note='Downloading m3u8 VOD manifest' if note is None else note,
2408             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2409             fatal=False, data=data, headers=headers, query=query)
2410
2411         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2412
2413     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2414         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2415             return None
2416
2417         return int(sum(
2418             float(line[len('#EXTINF:'):].split(',')[0])
2419             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2420
2421     @staticmethod
2422     def _xpath_ns(path, namespace=None):
2423         if not namespace:
2424             return path
2425         out = []
2426         for c in path.split('/'):
2427             if not c or c == '.':
2428                 out.append(c)
2429             else:
2430                 out.append('{%s}%s' % (namespace, c))
2431         return '/'.join(out)
2432
2433     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2434         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2435         if res is False:
2436             assert not fatal
2437             return [], {}
2438
2439         smil, urlh = res
2440         smil_url = urlh.geturl()
2441
2442         namespace = self._parse_smil_namespace(smil)
2443
2444         fmts = self._parse_smil_formats(
2445             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2446         subs = self._parse_smil_subtitles(
2447             smil, namespace=namespace)
2448
2449         return fmts, subs
2450
2451     def _extract_smil_formats(self, *args, **kwargs):
2452         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2453         if subs:
2454             self._report_ignoring_subs('SMIL')
2455         return fmts
2456
2457     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2458         res = self._download_smil(smil_url, video_id, fatal=fatal)
2459         if res is False:
2460             return {}
2461
2462         smil, urlh = res
2463         smil_url = urlh.geturl()
2464
2465         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2466
    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        """
        Download a SMIL file through _download_xml_handle and return its result unchanged.

        Callers in this file unpack the result as (document, url_handle) and
        treat a result of False as a failed download.
        """
        return self._download_xml_handle(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2471
2472     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2473         namespace = self._parse_smil_namespace(smil)
2474
2475         formats = self._parse_smil_formats(
2476             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2477         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2478
2479         video_id = os.path.splitext(url_basename(smil_url))[0]
2480         title = None
2481         description = None
2482         upload_date = None
2483         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2484             name = meta.attrib.get('name')
2485             content = meta.attrib.get('content')
2486             if not name or not content:
2487                 continue
2488             if not title and name == 'title':
2489                 title = content
2490             elif not description and name in ('description', 'abstract'):
2491                 description = content
2492             elif not upload_date and name == 'date':
2493                 upload_date = unified_strdate(content)
2494
2495         thumbnails = [{
2496             'id': image.get('type'),
2497             'url': image.get('src'),
2498             'width': int_or_none(image.get('width')),
2499             'height': int_or_none(image.get('height')),
2500         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2501
2502         return {
2503             'id': video_id,
2504             'title': title or video_id,
2505             'description': description,
2506             'upload_date': upload_date,
2507             'thumbnails': thumbnails,
2508             'formats': formats,
2509             'subtitles': subtitles,
2510         }
2511
    def _parse_smil_namespace(self, smil):
        """Extract the XML namespace from the tag of the root <smil> element (default: None)"""
        # A namespaced tag is expected in the form "{namespace}smil";
        # the element name is matched case-insensitively
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2515
2516     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2517         base = smil_url
2518         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2519             b = meta.get('base') or meta.get('httpBase')
2520             if b:
2521                 base = b
2522                 break
2523
2524         formats = []
2525         rtmp_count = 0
2526         http_count = 0
2527         m3u8_count = 0
2528         imgs_count = 0
2529
2530         srcs = set()
2531         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2532         for medium in media:
2533             src = medium.get('src')
2534             if not src or src in srcs:
2535                 continue
2536             srcs.add(src)
2537
2538             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2539             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2540             width = int_or_none(medium.get('width'))
2541             height = int_or_none(medium.get('height'))
2542             proto = medium.get('proto')
2543             ext = medium.get('ext')
2544             src_ext = determine_ext(src)
2545             streamer = medium.get('streamer') or base
2546
2547             if proto == 'rtmp' or streamer.startswith('rtmp'):
2548                 rtmp_count += 1
2549                 formats.append({
2550                     'url': streamer,
2551                     'play_path': src,
2552                     'ext': 'flv',
2553                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2554                     'tbr': bitrate,
2555                     'filesize': filesize,
2556                     'width': width,
2557                     'height': height,
2558                 })
2559                 if transform_rtmp_url:
2560                     streamer, src = transform_rtmp_url(streamer, src)
2561                     formats[-1].update({
2562                         'url': streamer,
2563                         'play_path': src,
2564                     })
2565                 continue
2566
2567             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2568             src_url = src_url.strip()
2569
2570             if proto == 'm3u8' or src_ext == 'm3u8':
2571                 m3u8_formats = self._extract_m3u8_formats(
2572                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2573                 if len(m3u8_formats) == 1:
2574                     m3u8_count += 1
2575                     m3u8_formats[0].update({
2576                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2577                         'tbr': bitrate,
2578                         'width': width,
2579                         'height': height,
2580                     })
2581                 formats.extend(m3u8_formats)
2582             elif src_ext == 'f4m':
2583                 f4m_url = src_url
2584                 if not f4m_params:
2585                     f4m_params = {
2586                         'hdcore': '3.2.0',
2587                         'plugin': 'flowplayer-3.2.0.1',
2588                     }
2589                 f4m_url += '&' if '?' in f4m_url else '?'
2590                 f4m_url += urllib.parse.urlencode(f4m_params)
2591                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2592             elif src_ext == 'mpd':
2593                 formats.extend(self._extract_mpd_formats(
2594                     src_url, video_id, mpd_id='dash', fatal=False))
2595             elif re.search(r'\.ism/[Mm]anifest', src_url):
2596                 formats.extend(self._extract_ism_formats(
2597                     src_url, video_id, ism_id='mss', fatal=False))
2598             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2599                 http_count += 1
2600                 formats.append({
2601                     'url': src_url,
2602                     'ext': ext or src_ext or 'flv',
2603                     'format_id': 'http-%d' % (bitrate or http_count),
2604                     'tbr': bitrate,
2605                     'filesize': filesize,
2606                     'width': width,
2607                     'height': height,
2608                 })
2609
2610         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2611             src = medium.get('src')
2612             if not src or src in srcs:
2613                 continue
2614             srcs.add(src)
2615
2616             imgs_count += 1
2617             formats.append({
2618                 'format_id': 'imagestream-%d' % (imgs_count),
2619                 'url': src,
2620                 'ext': mimetype2ext(medium.get('type')),
2621                 'acodec': 'none',
2622                 'vcodec': 'none',
2623                 'width': int_or_none(medium.get('width')),
2624                 'height': int_or_none(medium.get('height')),
2625                 'format_note': 'SMIL storyboards',
2626             })
2627
2628         return formats
2629
2630     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2631         urls = []
2632         subtitles = {}
2633         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2634             src = textstream.get('src')
2635             if not src or src in urls:
2636                 continue
2637             urls.append(src)
2638             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2639             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2640             subtitles.setdefault(lang, []).append({
2641                 'url': src,
2642                 'ext': ext,
2643             })
2644         return subtitles
2645
2646     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2647         res = self._download_xml_handle(
2648             xspf_url, playlist_id, 'Downloading xpsf playlist',
2649             'Unable to download xspf manifest', fatal=fatal)
2650         if res is False:
2651             return []
2652
2653         xspf, urlh = res
2654         xspf_url = urlh.geturl()
2655
2656         return self._parse_xspf(
2657             xspf, playlist_id, xspf_url=xspf_url,
2658             xspf_base_url=base_url(xspf_url))
2659
2660     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2661         NS_MAP = {
2662             'xspf': 'http://xspf.org/ns/0/',
2663             's1': 'http://static.streamone.nl/player/ns/0',
2664         }
2665
2666         entries = []
2667         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2668             title = xpath_text(
2669                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2670             description = xpath_text(
2671                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2672             thumbnail = xpath_text(
2673                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2674             duration = float_or_none(
2675                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2676
2677             formats = []
2678             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2679                 format_url = urljoin(xspf_base_url, location.text)
2680                 if not format_url:
2681                     continue
2682                 formats.append({
2683                     'url': format_url,
2684                     'manifest_url': xspf_url,
2685                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2686                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2687                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2688                 })
2689             self._sort_formats(formats)
2690
2691             entries.append({
2692                 'id': playlist_id,
2693                 'title': title,
2694                 'description': description,
2695                 'thumbnail': thumbnail,
2696                 'duration': duration,
2697                 'formats': formats,
2698             })
2699         return entries
2700
2701     def _extract_mpd_formats(self, *args, **kwargs):
2702         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2703         if subs:
2704             self._report_ignoring_subs('DASH')
2705         return fmts
2706
2707     def _extract_mpd_formats_and_subtitles(
2708             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2709             fatal=True, data=None, headers={}, query={}):
2710         res = self._download_xml_handle(
2711             mpd_url, video_id,
2712             note='Downloading MPD manifest' if note is None else note,
2713             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2714             fatal=fatal, data=data, headers=headers, query=query)
2715         if res is False:
2716             return [], {}
2717         mpd_doc, urlh = res
2718         if mpd_doc is None:
2719             return [], {}
2720
2721         # We could have been redirected to a new url when we retrieved our mpd file.
2722         mpd_url = urlh.geturl()
2723         mpd_base_url = base_url(mpd_url)
2724
2725         return self._parse_mpd_formats_and_subtitles(
2726             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2727
2728     def _parse_mpd_formats(self, *args, **kwargs):
2729         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2730         if subs:
2731             self._report_ignoring_subs('DASH')
2732         return fmts
2733
2734     def _parse_mpd_formats_and_subtitles(
2735             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2736         """
2737         Parse formats from MPD manifest.
2738         References:
2739          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2740             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2741          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2742         """
2743         if not self.get_param('dynamic_mpd', True):
2744             if mpd_doc.get('type') == 'dynamic':
2745                 return [], {}
2746
2747         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2748
2749         def _add_ns(path):
2750             return self._xpath_ns(path, namespace)
2751
2752         def is_drm_protected(element):
2753             return element.find(_add_ns('ContentProtection')) is not None
2754
2755         def extract_multisegment_info(element, ms_parent_info):
2756             ms_info = ms_parent_info.copy()
2757
2758             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2759             # common attributes and elements.  We will only extract relevant
2760             # for us.
2761             def extract_common(source):
2762                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2763                 if segment_timeline is not None:
2764                     s_e = segment_timeline.findall(_add_ns('S'))
2765                     if s_e:
2766                         ms_info['total_number'] = 0
2767                         ms_info['s'] = []
2768                         for s in s_e:
2769                             r = int(s.get('r', 0))
2770                             ms_info['total_number'] += 1 + r
2771                             ms_info['s'].append({
2772                                 't': int(s.get('t', 0)),
2773                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2774                                 'd': int(s.attrib['d']),
2775                                 'r': r,
2776                             })
2777                 start_number = source.get('startNumber')
2778                 if start_number:
2779                     ms_info['start_number'] = int(start_number)
2780                 timescale = source.get('timescale')
2781                 if timescale:
2782                     ms_info['timescale'] = int(timescale)
2783                 segment_duration = source.get('duration')
2784                 if segment_duration:
2785                     ms_info['segment_duration'] = float(segment_duration)
2786
2787             def extract_Initialization(source):
2788                 initialization = source.find(_add_ns('Initialization'))
2789                 if initialization is not None:
2790                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2791
2792             segment_list = element.find(_add_ns('SegmentList'))
2793             if segment_list is not None:
2794                 extract_common(segment_list)
2795                 extract_Initialization(segment_list)
2796                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2797                 if segment_urls_e:
2798                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2799             else:
2800                 segment_template = element.find(_add_ns('SegmentTemplate'))
2801                 if segment_template is not None:
2802                     extract_common(segment_template)
2803                     media = segment_template.get('media')
2804                     if media:
2805                         ms_info['media'] = media
2806                     initialization = segment_template.get('initialization')
2807                     if initialization:
2808                         ms_info['initialization'] = initialization
2809                     else:
2810                         extract_Initialization(segment_template)
2811             return ms_info
2812
2813         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2814         formats, subtitles = [], {}
2815         stream_numbers = collections.defaultdict(int)
2816         for period in mpd_doc.findall(_add_ns('Period')):
2817             period_duration = parse_duration(period.get('duration')) or mpd_duration
2818             period_ms_info = extract_multisegment_info(period, {
2819                 'start_number': 1,
2820                 'timescale': 1,
2821             })
2822             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2823                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2824                 for representation in adaptation_set.findall(_add_ns('Representation')):
2825                     representation_attrib = adaptation_set.attrib.copy()
2826                     representation_attrib.update(representation.attrib)
2827                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2828                     mime_type = representation_attrib['mimeType']
2829                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2830
2831                     codec_str = representation_attrib.get('codecs', '')
2832                     # Some kind of binary subtitle found in some youtube livestreams
2833                     if mime_type == 'application/x-rawcc':
2834                         codecs = {'scodec': codec_str}
2835                     else:
2836                         codecs = parse_codecs(codec_str)
2837                     if content_type not in ('video', 'audio', 'text'):
2838                         if mime_type == 'image/jpeg':
2839                             content_type = mime_type
2840                         elif codecs.get('vcodec', 'none') != 'none':
2841                             content_type = 'video'
2842                         elif codecs.get('acodec', 'none') != 'none':
2843                             content_type = 'audio'
2844                         elif codecs.get('scodec', 'none') != 'none':
2845                             content_type = 'text'
2846                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2847                             content_type = 'text'
2848                         else:
2849                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2850                             continue
2851
2852                     base_url = ''
2853                     for element in (representation, adaptation_set, period, mpd_doc):
2854                         base_url_e = element.find(_add_ns('BaseURL'))
2855                         if try_call(lambda: base_url_e.text) is not None:
2856                             base_url = base_url_e.text + base_url
2857                             if re.match(r'^https?://', base_url):
2858                                 break
2859                     if mpd_base_url and base_url.startswith('/'):
2860                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2861                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2862                         if not mpd_base_url.endswith('/'):
2863                             mpd_base_url += '/'
2864                         base_url = mpd_base_url + base_url
2865                     representation_id = representation_attrib.get('id')
2866                     lang = representation_attrib.get('lang')
2867                     url_el = representation.find(_add_ns('BaseURL'))
2868                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2869                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2870                     if representation_id is not None:
2871                         format_id = representation_id
2872                     else:
2873                         format_id = content_type
2874                     if mpd_id:
2875                         format_id = mpd_id + '-' + format_id
2876                     if content_type in ('video', 'audio'):
2877                         f = {
2878                             'format_id': format_id,
2879                             'manifest_url': mpd_url,
2880                             'ext': mimetype2ext(mime_type),
2881                             'width': int_or_none(representation_attrib.get('width')),
2882                             'height': int_or_none(representation_attrib.get('height')),
2883                             'tbr': float_or_none(bandwidth, 1000),
2884                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2885                             'fps': int_or_none(representation_attrib.get('frameRate')),
2886                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2887                             'format_note': 'DASH %s' % content_type,
2888                             'filesize': filesize,
2889                             'container': mimetype2ext(mime_type) + '_dash',
2890                             **codecs
2891                         }
2892                     elif content_type == 'text':
2893                         f = {
2894                             'ext': mimetype2ext(mime_type),
2895                             'manifest_url': mpd_url,
2896                             'filesize': filesize,
2897                         }
2898                     elif content_type == 'image/jpeg':
2899                         # See test case in VikiIE
2900                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2901                         f = {
2902                             'format_id': format_id,
2903                             'ext': 'mhtml',
2904                             'manifest_url': mpd_url,
2905                             'format_note': 'DASH storyboards (jpeg)',
2906                             'acodec': 'none',
2907                             'vcodec': 'none',
2908                         }
2909                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2910                         f['has_drm'] = True
2911                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2912
2913                     def prepare_template(template_name, identifiers):
2914                         tmpl = representation_ms_info[template_name]
2915                         # First of, % characters outside $...$ templates
2916                         # must be escaped by doubling for proper processing
2917                         # by % operator string formatting used further (see
2918                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2919                         t = ''
2920                         in_template = False
2921                         for c in tmpl:
2922                             t += c
2923                             if c == '$':
2924                                 in_template = not in_template
2925                             elif c == '%' and not in_template:
2926                                 t += c
2927                         # Next, $...$ templates are translated to their
2928                         # %(...) counterparts to be used with % operator
2929                         if representation_id is not None:
2930                             t = t.replace('$RepresentationID$', representation_id)
2931                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2932                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2933                         t.replace('$$', '$')
2934                         return t
2935
2936                     # @initialization is a regular template like @media one
2937                     # so it should be handled just the same way (see
2938                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2939                     if 'initialization' in representation_ms_info:
2940                         initialization_template = prepare_template(
2941                             'initialization',
2942                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2943                             # $Time$ shall not be included for @initialization thus
2944                             # only $Bandwidth$ remains
2945                             ('Bandwidth', ))
2946                         representation_ms_info['initialization_url'] = initialization_template % {
2947                             'Bandwidth': bandwidth,
2948                         }
2949
2950                     def location_key(location):
2951                         return 'url' if re.match(r'^https?://', location) else 'path'
2952
2953                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2954
2955                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2956                         media_location_key = location_key(media_template)
2957
2958                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2959                         # can't be used at the same time
2960                         if '%(Number' in media_template and 's' not in representation_ms_info:
2961                             segment_duration = None
2962                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2963                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2964                                 representation_ms_info['total_number'] = int(math.ceil(
2965                                     float_or_none(period_duration, segment_duration, default=0)))
2966                             representation_ms_info['fragments'] = [{
2967                                 media_location_key: media_template % {
2968                                     'Number': segment_number,
2969                                     'Bandwidth': bandwidth,
2970                                 },
2971                                 'duration': segment_duration,
2972                             } for segment_number in range(
2973                                 representation_ms_info['start_number'],
2974                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2975                         else:
2976                             # $Number*$ or $Time$ in media template with S list available
2977                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2978                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2979                             representation_ms_info['fragments'] = []
2980                             segment_time = 0
2981                             segment_d = None
2982                             segment_number = representation_ms_info['start_number']
2983
2984                             def add_segment_url():
2985                                 segment_url = media_template % {
2986                                     'Time': segment_time,
2987                                     'Bandwidth': bandwidth,
2988                                     'Number': segment_number,
2989                                 }
2990                                 representation_ms_info['fragments'].append({
2991                                     media_location_key: segment_url,
2992                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2993                                 })
2994
2995                             for num, s in enumerate(representation_ms_info['s']):
2996                                 segment_time = s.get('t') or segment_time
2997                                 segment_d = s['d']
2998                                 add_segment_url()
2999                                 segment_number += 1
3000                                 for r in range(s.get('r', 0)):
3001                                     segment_time += segment_d
3002                                     add_segment_url()
3003                                     segment_number += 1
3004                                 segment_time += segment_d
3005                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3006                         # No media template,
3007                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3008                         # or any YouTube dashsegments video
3009                         fragments = []
3010                         segment_index = 0
3011                         timescale = representation_ms_info['timescale']
3012                         for s in representation_ms_info['s']:
3013                             duration = float_or_none(s['d'], timescale)
3014                             for r in range(s.get('r', 0) + 1):
3015                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3016                                 fragments.append({
3017                                     location_key(segment_uri): segment_uri,
3018                                     'duration': duration,
3019                                 })
3020                                 segment_index += 1
3021                         representation_ms_info['fragments'] = fragments
3022                     elif 'segment_urls' in representation_ms_info:
3023                         # Segment URLs with no SegmentTimeline
3024                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3025                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3026                         fragments = []
3027                         segment_duration = float_or_none(
3028                             representation_ms_info['segment_duration'],
3029                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3030                         for segment_url in representation_ms_info['segment_urls']:
3031                             fragment = {
3032                                 location_key(segment_url): segment_url,
3033                             }
3034                             if segment_duration:
3035                                 fragment['duration'] = segment_duration
3036                             fragments.append(fragment)
3037                         representation_ms_info['fragments'] = fragments
3038                     # If there is a fragments key available then we correctly recognized fragmented media.
3039                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3040                     # assumption is not necessarily correct since we may simply have no support for
3041                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3042                     if 'fragments' in representation_ms_info:
3043                         f.update({
3044                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3045                             'url': mpd_url or base_url,
3046                             'fragment_base_url': base_url,
3047                             'fragments': [],
3048                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3049                         })
3050                         if 'initialization_url' in representation_ms_info:
3051                             initialization_url = representation_ms_info['initialization_url']
3052                             if not f.get('url'):
3053                                 f['url'] = initialization_url
3054                             f['fragments'].append({location_key(initialization_url): initialization_url})
3055                         f['fragments'].extend(representation_ms_info['fragments'])
3056                         if not period_duration:
3057                             period_duration = try_get(
3058                                 representation_ms_info,
3059                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3060                     else:
3061                         # Assuming direct URL to unfragmented media.
3062                         f['url'] = base_url
3063                     if content_type in ('video', 'audio', 'image/jpeg'):
3064                         f['manifest_stream_number'] = stream_numbers[f['url']]
3065                         stream_numbers[f['url']] += 1
3066                         formats.append(f)
3067                     elif content_type == 'text':
3068                         subtitles.setdefault(lang or 'und', []).append(f)
3069
3070         return formats, subtitles
3071
3072     def _extract_ism_formats(self, *args, **kwargs):
3073         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3074         if subs:
3075             self._report_ignoring_subs('ISM')
3076         return fmts
3077
3078     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3079         res = self._download_xml_handle(
3080             ism_url, video_id,
3081             note='Downloading ISM manifest' if note is None else note,
3082             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3083             fatal=fatal, data=data, headers=headers, query=query)
3084         if res is False:
3085             return [], {}
3086         ism_doc, urlh = res
3087         if ism_doc is None:
3088             return [], {}
3089
3090         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3091
3092     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3093         """
3094         Parse formats from ISM manifest.
3095         References:
3096          1. [MS-SSTR]: Smooth Streaming Protocol,
3097             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3098         """
3099         if ism_doc.get('IsLive') == 'TRUE':
3100             return [], {}
3101
3102         duration = int(ism_doc.attrib['Duration'])
3103         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3104
3105         formats = []
3106         subtitles = {}
3107         for stream in ism_doc.findall('StreamIndex'):
3108             stream_type = stream.get('Type')
3109             if stream_type not in ('video', 'audio', 'text'):
3110                 continue
3111             url_pattern = stream.attrib['Url']
3112             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3113             stream_name = stream.get('Name')
3114             stream_language = stream.get('Language', 'und')
3115             for track in stream.findall('QualityLevel'):
3116                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3117                 # TODO: add support for WVC1 and WMAP
3118                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3119                     self.report_warning('%s is not a supported codec' % fourcc)
3120                     continue
3121                 tbr = int(track.attrib['Bitrate']) // 1000
3122                 # [1] does not mention Width and Height attributes. However,
3123                 # they're often present while MaxWidth and MaxHeight are
3124                 # missing, so should be used as fallbacks
3125                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3126                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3127                 sampling_rate = int_or_none(track.get('SamplingRate'))
3128
3129                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3130                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3131
3132                 fragments = []
3133                 fragment_ctx = {
3134                     'time': 0,
3135                 }
3136                 stream_fragments = stream.findall('c')
3137                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3138                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3139                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3140                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3141                     if not fragment_ctx['duration']:
3142                         try:
3143                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3144                         except IndexError:
3145                             next_fragment_time = duration
3146                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3147                     for _ in range(fragment_repeat):
3148                         fragments.append({
3149                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3150                             'duration': fragment_ctx['duration'] / stream_timescale,
3151                         })
3152                         fragment_ctx['time'] += fragment_ctx['duration']
3153
3154                 if stream_type == 'text':
3155                     subtitles.setdefault(stream_language, []).append({
3156                         'ext': 'ismt',
3157                         'protocol': 'ism',
3158                         'url': ism_url,
3159                         'manifest_url': ism_url,
3160                         'fragments': fragments,
3161                         '_download_params': {
3162                             'stream_type': stream_type,
3163                             'duration': duration,
3164                             'timescale': stream_timescale,
3165                             'fourcc': fourcc,
3166                             'language': stream_language,
3167                             'codec_private_data': track.get('CodecPrivateData'),
3168                         }
3169                     })
3170                 elif stream_type in ('video', 'audio'):
3171                     formats.append({
3172                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3173                         'url': ism_url,
3174                         'manifest_url': ism_url,
3175                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3176                         'width': width,
3177                         'height': height,
3178                         'tbr': tbr,
3179                         'asr': sampling_rate,
3180                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3181                         'acodec': 'none' if stream_type == 'video' else fourcc,
3182                         'protocol': 'ism',
3183                         'fragments': fragments,
3184                         'has_drm': ism_doc.find('Protection') is not None,
3185                         '_download_params': {
3186                             'stream_type': stream_type,
3187                             'duration': duration,
3188                             'timescale': stream_timescale,
3189                             'width': width or 0,
3190                             'height': height or 0,
3191                             'fourcc': fourcc,
3192                             'language': stream_language,
3193                             'codec_private_data': track.get('CodecPrivateData'),
3194                             'sampling_rate': sampling_rate,
3195                             'channels': int_or_none(track.get('Channels', 2)),
3196                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3197                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3198                         },
3199                     })
3200         return formats, subtitles
3201
3202     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3203         def absolute_url(item_url):
3204             return urljoin(base_url, item_url)
3205
3206         def parse_content_type(content_type):
3207             if not content_type:
3208                 return {}
3209             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3210             if ctr:
3211                 mimetype, codecs = ctr.groups()
3212                 f = parse_codecs(codecs)
3213                 f['ext'] = mimetype2ext(mimetype)
3214                 return f
3215             return {}
3216
3217         def _media_formats(src, cur_media_type, type_info=None):
3218             type_info = type_info or {}
3219             full_url = absolute_url(src)
3220             ext = type_info.get('ext') or determine_ext(full_url)
3221             if ext == 'm3u8':
3222                 is_plain_url = False
3223                 formats = self._extract_m3u8_formats(
3224                     full_url, video_id, ext='mp4',
3225                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3226                     preference=preference, quality=quality, fatal=False)
3227             elif ext == 'mpd':
3228                 is_plain_url = False
3229                 formats = self._extract_mpd_formats(
3230                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3231             else:
3232                 is_plain_url = True
3233                 formats = [{
3234                     'url': full_url,
3235                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3236                     'ext': ext,
3237                 }]
3238             return is_plain_url, formats
3239
3240         entries = []
3241         # amp-video and amp-audio are very similar to their HTML5 counterparts
3242         # so we will include them right here (see
3243         # https://www.ampproject.org/docs/reference/components/amp-video)
3244         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3245         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3246         media_tags = [(media_tag, media_tag_name, media_type, '')
3247                       for media_tag, media_tag_name, media_type
3248                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3249         media_tags.extend(re.findall(
3250             # We only allow video|audio followed by a whitespace or '>'.
3251             # Allowing more characters may end up in significant slow down (see
3252             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3253             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3254             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3255         for media_tag, _, media_type, media_content in media_tags:
3256             media_info = {
3257                 'formats': [],
3258                 'subtitles': {},
3259             }
3260             media_attributes = extract_attributes(media_tag)
3261             src = strip_or_none(media_attributes.get('src'))
3262             if src:
3263                 f = parse_content_type(media_attributes.get('type'))
3264                 _, formats = _media_formats(src, media_type, f)
3265                 media_info['formats'].extend(formats)
3266             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3267             if media_content:
3268                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3269                     s_attr = extract_attributes(source_tag)
3270                     # data-video-src and data-src are non standard but seen
3271                     # several times in the wild
3272                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3273                     if not src:
3274                         continue
3275                     f = parse_content_type(s_attr.get('type'))
3276                     is_plain_url, formats = _media_formats(src, media_type, f)
3277                     if is_plain_url:
3278                         # width, height, res, label and title attributes are
3279                         # all not standard but seen several times in the wild
3280                         labels = [
3281                             s_attr.get(lbl)
3282                             for lbl in ('label', 'title')
3283                             if str_or_none(s_attr.get(lbl))
3284                         ]
3285                         width = int_or_none(s_attr.get('width'))
3286                         height = (int_or_none(s_attr.get('height'))
3287                                   or int_or_none(s_attr.get('res')))
3288                         if not width or not height:
3289                             for lbl in labels:
3290                                 resolution = parse_resolution(lbl)
3291                                 if not resolution:
3292                                     continue
3293                                 width = width or resolution.get('width')
3294                                 height = height or resolution.get('height')
3295                         for lbl in labels:
3296                             tbr = parse_bitrate(lbl)
3297                             if tbr:
3298                                 break
3299                         else:
3300                             tbr = None
3301                         f.update({
3302                             'width': width,
3303                             'height': height,
3304                             'tbr': tbr,
3305                             'format_id': s_attr.get('label') or s_attr.get('title'),
3306                         })
3307                         f.update(formats[0])
3308                         media_info['formats'].append(f)
3309                     else:
3310                         media_info['formats'].extend(formats)
3311                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3312                     track_attributes = extract_attributes(track_tag)
3313                     kind = track_attributes.get('kind')
3314                     if not kind or kind in ('subtitles', 'captions'):
3315                         src = strip_or_none(track_attributes.get('src'))
3316                         if not src:
3317                             continue
3318                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3319                         media_info['subtitles'].setdefault(lang, []).append({
3320                             'url': absolute_url(src),
3321                         })
3322             for f in media_info['formats']:
3323                 f.setdefault('http_headers', {})['Referer'] = base_url
3324             if media_info['formats'] or media_info['subtitles']:
3325                 entries.append(media_info)
3326         return entries
3327
3328     def _extract_akamai_formats(self, *args, **kwargs):
3329         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3330         if subs:
3331             self._report_ignoring_subs('akamai')
3332         return fmts
3333
3334     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3335         signed = 'hdnea=' in manifest_url
3336         if not signed:
3337             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3338             manifest_url = re.sub(
3339                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3340                 '', manifest_url).strip('?')
3341
3342         formats = []
3343         subtitles = {}
3344
3345         hdcore_sign = 'hdcore=3.7.0'
3346         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3347         hds_host = hosts.get('hds')
3348         if hds_host:
3349             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3350         if 'hdcore=' not in f4m_url:
3351             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3352         f4m_formats = self._extract_f4m_formats(
3353             f4m_url, video_id, f4m_id='hds', fatal=False)
3354         for entry in f4m_formats:
3355             entry.update({'extra_param_to_segment_url': hdcore_sign})
3356         formats.extend(f4m_formats)
3357
3358         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3359         hls_host = hosts.get('hls')
3360         if hls_host:
3361             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3362         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3363             m3u8_url, video_id, 'mp4', 'm3u8_native',
3364             m3u8_id='hls', fatal=False)
3365         formats.extend(m3u8_formats)
3366         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3367
3368         http_host = hosts.get('http')
3369         if http_host and m3u8_formats and not signed:
3370             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3371             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3372             qualities_length = len(qualities)
3373             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3374                 i = 0
3375                 for f in m3u8_formats:
3376                     if f['vcodec'] != 'none':
3377                         for protocol in ('http', 'https'):
3378                             http_f = f.copy()
3379                             del http_f['manifest_url']
3380                             http_url = re.sub(
3381                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3382                             http_f.update({
3383                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3384                                 'url': http_url,
3385                                 'protocol': protocol,
3386                             })
3387                             formats.append(http_f)
3388                         i += 1
3389
3390         return formats, subtitles
3391
3392     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3393         query = urllib.parse.urlparse(url).query
3394         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3395         mobj = re.search(
3396             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3397         url_base = mobj.group('url')
3398         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3399         formats = []
3400
3401         def manifest_url(manifest):
3402             m_url = f'{http_base_url}/{manifest}'
3403             if query:
3404                 m_url += '?%s' % query
3405             return m_url
3406
3407         if 'm3u8' not in skip_protocols:
3408             formats.extend(self._extract_m3u8_formats(
3409                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3410                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3411         if 'f4m' not in skip_protocols:
3412             formats.extend(self._extract_f4m_formats(
3413                 manifest_url('manifest.f4m'),
3414                 video_id, f4m_id='hds', fatal=False))
3415         if 'dash' not in skip_protocols:
3416             formats.extend(self._extract_mpd_formats(
3417                 manifest_url('manifest.mpd'),
3418                 video_id, mpd_id='dash', fatal=False))
3419         if re.search(r'(?:/smil:|\.smil)', url_base):
3420             if 'smil' not in skip_protocols:
3421                 rtmp_formats = self._extract_smil_formats(
3422                     manifest_url('jwplayer.smil'),
3423                     video_id, fatal=False)
3424                 for rtmp_format in rtmp_formats:
3425                     rtsp_format = rtmp_format.copy()
3426                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3427                     del rtsp_format['play_path']
3428                     del rtsp_format['ext']
3429                     rtsp_format.update({
3430                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3431                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3432                         'protocol': 'rtsp',
3433                     })
3434                     formats.extend([rtmp_format, rtsp_format])
3435         else:
3436             for protocol in ('rtmp', 'rtsp'):
3437                 if protocol not in skip_protocols:
3438                     formats.append({
3439                         'url': f'{protocol}:{url_base}',
3440                         'format_id': protocol,
3441                         'protocol': protocol,
3442                     })
3443         return formats
3444
3445     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3446         mobj = re.search(
3447             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3448             webpage)
3449         if mobj:
3450             try:
3451                 jwplayer_data = self._parse_json(mobj.group('options'),
3452                                                  video_id=video_id,
3453                                                  transform_source=transform_source)
3454             except ExtractorError:
3455                 pass
3456             else:
3457                 if isinstance(jwplayer_data, dict):
3458                     return jwplayer_data
3459
3460     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3461         jwplayer_data = self._find_jwplayer_data(
3462             webpage, video_id, transform_source=js_to_json)
3463         return self._parse_jwplayer_data(
3464             jwplayer_data, video_id, *args, **kwargs)
3465
3466     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3467                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3468         # JWPlayer backward compatibility: flattened playlists
3469         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3470         if 'playlist' not in jwplayer_data:
3471             jwplayer_data = {'playlist': [jwplayer_data]}
3472
3473         entries = []
3474
3475         # JWPlayer backward compatibility: single playlist item
3476         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3477         if not isinstance(jwplayer_data['playlist'], list):
3478             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3479
3480         for video_data in jwplayer_data['playlist']:
3481             # JWPlayer backward compatibility: flattened sources
3482             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3483             if 'sources' not in video_data:
3484                 video_data['sources'] = [video_data]
3485
3486             this_video_id = video_id or video_data['mediaid']
3487
3488             formats = self._parse_jwplayer_formats(
3489                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3490                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3491
3492             subtitles = {}
3493             tracks = video_data.get('tracks')
3494             if tracks and isinstance(tracks, list):
3495                 for track in tracks:
3496                     if not isinstance(track, dict):
3497                         continue
3498                     track_kind = track.get('kind')
3499                     if not track_kind or not isinstance(track_kind, str):
3500                         continue
3501                     if track_kind.lower() not in ('captions', 'subtitles'):
3502                         continue
3503                     track_url = urljoin(base_url, track.get('file'))
3504                     if not track_url:
3505                         continue
3506                     subtitles.setdefault(track.get('label') or 'en', []).append({
3507                         'url': self._proto_relative_url(track_url)
3508                     })
3509
3510             entry = {
3511                 'id': this_video_id,
3512                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3513                 'description': clean_html(video_data.get('description')),
3514                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3515                 'timestamp': int_or_none(video_data.get('pubdate')),
3516                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3517                 'subtitles': subtitles,
3518             }
3519             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3520             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3521                 entry.update({
3522                     '_type': 'url_transparent',
3523                     'url': formats[0]['url'],
3524                 })
3525             else:
3526                 self._sort_formats(formats)
3527                 entry['formats'] = formats
3528             entries.append(entry)
3529         if len(entries) == 1:
3530             return entries[0]
3531         else:
3532             return self.playlist_result(entries)
3533
3534     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3535                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3536         urls = []
3537         formats = []
3538         for source in jwplayer_sources_data:
3539             if not isinstance(source, dict):
3540                 continue
3541             source_url = urljoin(
3542                 base_url, self._proto_relative_url(source.get('file')))
3543             if not source_url or source_url in urls:
3544                 continue
3545             urls.append(source_url)
3546             source_type = source.get('type') or ''
3547             ext = mimetype2ext(source_type) or determine_ext(source_url)
3548             if source_type == 'hls' or ext == 'm3u8':
3549                 formats.extend(self._extract_m3u8_formats(
3550                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3551                     m3u8_id=m3u8_id, fatal=False))
3552             elif source_type == 'dash' or ext == 'mpd':
3553                 formats.extend(self._extract_mpd_formats(
3554                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3555             elif ext == 'smil':
3556                 formats.extend(self._extract_smil_formats(
3557                     source_url, video_id, fatal=False))
3558             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3559             elif source_type.startswith('audio') or ext in (
3560                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3561                 formats.append({
3562                     'url': source_url,
3563                     'vcodec': 'none',
3564                     'ext': ext,
3565                 })
3566             else:
3567                 height = int_or_none(source.get('height'))
3568                 if height is None:
3569                     # Often no height is provided but there is a label in
3570                     # format like "1080p", "720p SD", or 1080.
3571                     height = int_or_none(self._search_regex(
3572                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3573                         'height', default=None))
3574                 a_format = {
3575                     'url': source_url,
3576                     'width': int_or_none(source.get('width')),
3577                     'height': height,
3578                     'tbr': int_or_none(source.get('bitrate')),
3579                     'ext': ext,
3580                 }
3581                 if source_url.startswith('rtmp'):
3582                     a_format['ext'] = 'flv'
3583                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3584                     # of jwplayer.flash.swf
3585                     rtmp_url_parts = re.split(
3586                         r'((?:mp4|mp3|flv):)', source_url, 1)
3587                     if len(rtmp_url_parts) == 3:
3588                         rtmp_url, prefix, play_path = rtmp_url_parts
3589                         a_format.update({
3590                             'url': rtmp_url,
3591                             'play_path': prefix + play_path,
3592                         })
3593                     if rtmp_params:
3594                         a_format.update(rtmp_params)
3595                 formats.append(a_format)
3596         return formats
3597
3598     def _live_title(self, name):
3599         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3600         return name
3601
3602     def _int(self, v, name, fatal=False, **kwargs):
3603         res = int_or_none(v, **kwargs)
3604         if res is None:
3605             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3606             if fatal:
3607                 raise ExtractorError(msg)
3608             else:
3609                 self.report_warning(msg)
3610         return res
3611
3612     def _float(self, v, name, fatal=False, **kwargs):
3613         res = float_or_none(v, **kwargs)
3614         if res is None:
3615             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3616             if fatal:
3617                 raise ExtractorError(msg)
3618             else:
3619                 self.report_warning(msg)
3620         return res
3621
3622     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3623                     path='/', secure=False, discard=False, rest={}, **kwargs):
3624         cookie = http.cookiejar.Cookie(
3625             0, name, value, port, port is not None, domain, True,
3626             domain.startswith('.'), path, True, secure, expire_time,
3627             discard, None, None, rest)
3628         self.cookiejar.set_cookie(cookie)
3629
3630     def _get_cookies(self, url):
3631         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3632         return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
3633
3634     def _apply_first_set_cookie_header(self, url_handle, cookie):
3635         """
3636         Apply first Set-Cookie header instead of the last. Experimental.
3637
3638         Some sites (e.g. [1-3]) may serve two cookies under the same name
3639         in Set-Cookie header and expect the first (old) one to be set rather
3640         than second (new). However, as of RFC6265 the newer one cookie
3641         should be set into cookie store what actually happens.
3642         We will workaround this issue by resetting the cookie to
3643         the first one manually.
3644         1. https://new.vk.com/
3645         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3646         3. https://learning.oreilly.com/
3647         """
3648         for header, cookies in url_handle.headers.items():
3649             if header.lower() != 'set-cookie':
3650                 continue
3651             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3652             cookie_value = re.search(
3653                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3654             if cookie_value:
3655                 value, domain = cookie_value.groups()
3656                 self._set_cookie(domain, cookie, value)
3657                 break
3658
3659     @classmethod
3660     def get_testcases(cls, include_onlymatching=False):
3661         t = getattr(cls, '_TEST', None)
3662         if t:
3663             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3664             tests = [t]
3665         else:
3666             tests = getattr(cls, '_TESTS', [])
3667         for t in tests:
3668             if not include_onlymatching and t.get('only_matching', False):
3669                 continue
3670             t['name'] = cls.ie_key()
3671             yield t
3672
3673     @classmethod
3674     def get_webpage_testcases(cls):
3675         tests = getattr(cls, '_WEBPAGE_TESTS', [])
3676         for t in tests:
3677             t['name'] = cls.ie_key()
3678         return tests
3679
3680     @classproperty
3681     def age_limit(cls):
3682         """Get age limit from the testcases"""
3683         return max(traverse_obj(
3684             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3685             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3686
3687     @classmethod
3688     def is_suitable(cls, age_limit):
3689         """Test whether the extractor is generally suitable for the given age limit"""
3690         return not age_restricted(cls.age_limit, age_limit)
3691
3692     @classmethod
3693     def description(cls, *, markdown=True, search_examples=None):
3694         """Description of the extractor"""
3695         desc = ''
3696         if cls._NETRC_MACHINE:
3697             if markdown:
3698                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3699             else:
3700                 desc += f' [{cls._NETRC_MACHINE}]'
3701         if cls.IE_DESC is False:
3702             desc += ' [HIDDEN]'
3703         elif cls.IE_DESC:
3704             desc += f' {cls.IE_DESC}'
3705         if cls.SEARCH_KEY:
3706             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3707             if search_examples:
3708                 _COUNTS = ('', '5', '10', 'all')
3709                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3710         if not cls.working():
3711             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3712
3713         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3714         return f'{name}:{desc}' if desc else name
3715
3716     def extract_subtitles(self, *args, **kwargs):
3717         if (self.get_param('writesubtitles', False)
3718                 or self.get_param('listsubtitles')):
3719             return self._get_subtitles(*args, **kwargs)
3720         return {}
3721
3722     def _get_subtitles(self, *args, **kwargs):
3723         raise NotImplementedError('This method must be implemented by subclasses')
3724
3725     def extract_comments(self, *args, **kwargs):
3726         if not self.get_param('getcomments'):
3727             return None
3728         generator = self._get_comments(*args, **kwargs)
3729
3730         def extractor():
3731             comments = []
3732             interrupted = True
3733             try:
3734                 while True:
3735                     comments.append(next(generator))
3736             except StopIteration:
3737                 interrupted = False
3738             except KeyboardInterrupt:
3739                 self.to_screen('Interrupted by user')
3740             except Exception as e:
3741                 if self.get_param('ignoreerrors') is not True:
3742                     raise
3743                 self._downloader.report_error(e)
3744             comment_count = len(comments)
3745             self.to_screen(f'Extracted {comment_count} comments')
3746             return {
3747                 'comments': comments,
3748                 'comment_count': None if interrupted else comment_count
3749             }
3750         return extractor
3751
3752     def _get_comments(self, *args, **kwargs):
3753         raise NotImplementedError('This method must be implemented by subclasses')
3754
3755     @staticmethod
3756     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3757         """ Merge subtitle items for one language. Items with duplicated URLs/data
3758         will be dropped. """
3759         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3760         ret = list(subtitle_list1)
3761         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3762         return ret
3763
3764     @classmethod
3765     def _merge_subtitles(cls, *dicts, target=None):
3766         """ Merge subtitle dictionaries, language by language. """
3767         if target is None:
3768             target = {}
3769         for d in dicts:
3770             for lang, subs in d.items():
3771                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3772         return target
3773
3774     def extract_automatic_captions(self, *args, **kwargs):
3775         if (self.get_param('writeautomaticsub', False)
3776                 or self.get_param('listsubtitles')):
3777             return self._get_automatic_captions(*args, **kwargs)
3778         return {}
3779
3780     def _get_automatic_captions(self, *args, **kwargs):
3781         raise NotImplementedError('This method must be implemented by subclasses')
3782
3783     @functools.cached_property
3784     def _cookies_passed(self):
3785         """Whether cookies have been passed to YoutubeDL"""
3786         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3787
3788     def mark_watched(self, *args, **kwargs):
3789         if not self.get_param('mark_watched', False):
3790             return
3791         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3792             self._mark_watched(*args, **kwargs)
3793
3794     def _mark_watched(self, *args, **kwargs):
3795         raise NotImplementedError('This method must be implemented by subclasses')
3796
3797     def geo_verification_headers(self):
3798         headers = {}
3799         geo_verification_proxy = self.get_param('geo_verification_proxy')
3800         if geo_verification_proxy:
3801             headers['Ytdl-request-proxy'] = geo_verification_proxy
3802         return headers
3803
3804     @staticmethod
3805     def _generic_id(url):
3806         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3807
3808     @staticmethod
3809     def _generic_title(url):
3810         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3811
3812     @staticmethod
3813     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3814         all_known = all(map(
3815             lambda x: x is not None,
3816             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3817         return (
3818             'private' if is_private
3819             else 'premium_only' if needs_premium
3820             else 'subscriber_only' if needs_subscription
3821             else 'needs_auth' if needs_auth
3822             else 'unlisted' if is_unlisted
3823             else 'public' if all_known
3824             else None)
3825
3826     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3827         '''
3828         @returns            A list of values for the extractor argument given by "key"
3829                             or "default" if no such key is present
3830         @param default      The default value to return when the key is not present (default: [])
3831         @param casesense    When false, the values are converted to lower case
3832         '''
3833         val = traverse_obj(
3834             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3835         if val is None:
3836             return [] if default is NO_DEFAULT else default
3837         return list(val) if casesense else [x.lower() for x in val]
3838
3839     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3840         if not playlist_id or not video_id:
3841             return not video_id
3842
3843         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3844         if no_playlist is not None:
3845             return not no_playlist
3846
3847         video_id = '' if video_id is True else f' {video_id}'
3848         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3849         if self.get_param('noplaylist'):
3850             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3851             return False
3852         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3853         return True
3854
3855     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3856         RetryManager.report_retry(err, _count or int(fatal), _retries, info=self.to_screen, warn=self.report_warning,
3857                                   sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3858
3859     def RetryManager(self, **kwargs):
3860         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3861
3862     @classmethod
3863     def extract_from_webpage(cls, ydl, url, webpage):
3864         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3865               else ydl.get_info_extractor(cls.ie_key()))
3866         for info in ie._extract_from_webpage(url, webpage) or []:
3867             # url = None since we do not want to set (webpage/original)_url
3868             ydl.add_default_extra_info(info, ie, None)
3869             yield info
3870
3871     @classmethod
3872     def _extract_from_webpage(cls, url, webpage):
3873         for embed_url in orderedSet(
3874                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3875             yield cls.url_result(embed_url, cls)
3876
3877     @classmethod
3878     def _extract_embed_urls(cls, url, webpage):
3879         """@returns all the embed urls on the webpage"""
3880         if '_EMBED_URL_RE' not in cls.__dict__:
3881             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3882             for idx, regex in enumerate(cls._EMBED_REGEX):
3883                 assert regex.count('(?P<url>') == 1, \
3884                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3885             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3886
3887         for regex in cls._EMBED_URL_RE:
3888             for mobj in regex.finditer(webpage):
3889                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3890                 if cls._VALID_URL is False or cls.suitable(embed_url):
3891                     yield embed_url
3892
3893     class StopExtraction(Exception):
3894         pass
3895
3896     @classmethod
3897     def _extract_url(cls, webpage):  # TODO: Remove
3898         """Only for compatibility with some older extractors"""
3899         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3900
3901
3902 class SearchInfoExtractor(InfoExtractor):
3903     """
3904     Base class for paged search queries extractors.
3905     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3906     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3907     """
3908
3909     _MAX_RESULTS = float('inf')
3910
3911     @classproperty
3912     def _VALID_URL(cls):
3913         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3914
3915     def _real_extract(self, query):
3916         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3917         if prefix == '':
3918             return self._get_n_results(query, 1)
3919         elif prefix == 'all':
3920             return self._get_n_results(query, self._MAX_RESULTS)
3921         else:
3922             n = int(prefix)
3923             if n <= 0:
3924                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3925             elif n > self._MAX_RESULTS:
3926                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3927                 n = self._MAX_RESULTS
3928             return self._get_n_results(query, n)
3929
3930     def _get_n_results(self, query, n):
3931         """Get a specified number of results for a query.
3932         Either this function or _search_results must be overridden by subclasses """
3933         return self.playlist_result(
3934             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3935             query, query)
3936
3937     def _search_results(self, query):
3938         """Returns an iterator of search results"""
3939         raise NotImplementedError('This method must be implemented by subclasses')
3940
3941     @classproperty
3942     def SEARCH_KEY(cls):
3943         return cls._SEARCH_KEY