yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.parse
  21 import urllib.request
  22 import xml.etree.ElementTree
  23
  24 from ..compat import functools  # isort: split
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..networking.impersonate import ImpersonateTarget
  41 from ..utils import (
  42     IDENTITY,
  43     JSON_LD_RE,
  44     NO_DEFAULT,
  45     ExtractorError,
  46     FormatSorter,
  47     GeoRestrictedError,
  48     GeoUtils,
  49     LenientJSONDecoder,
  50     Popen,
  51     RegexNotFoundError,
  52     RetryManager,
  53     UnsupportedError,
  54     age_restricted,
  55     base_url,
  56     bug_reports_message,
  57     classproperty,
  58     clean_html,
  59     deprecation_warning,
  60     determine_ext,
  61     dict_get,
  62     encode_data_uri,
  63     error_to_compat_str,
  64     extract_attributes,
  65     filter_dict,
  66     fix_xml_ampersands,
  67     float_or_none,
  68     format_field,
  69     int_or_none,
  70     join_nonempty,
  71     js_to_json,
  72     mimetype2ext,
  73     netrc_from_content,
  74     orderedSet,
  75     parse_bitrate,
  76     parse_codecs,
  77     parse_duration,
  78     parse_iso8601,
  79     parse_m3u8_attributes,
  80     parse_resolution,
  81     sanitize_filename,
  82     sanitize_url,
  83     smuggle_url,
  84     str_or_none,
  85     str_to_int,
  86     strip_or_none,
  87     traverse_obj,
  88     truncate_string,
  89     try_call,
  90     try_get,
  91     unescapeHTML,
  92     unified_strdate,
  93     unified_timestamp,
  94     url_basename,
  95     url_or_none,
  96     urlhandle_detect_ext,
  97     urljoin,
  98     variadic,
  99     xpath_element,
 100     xpath_text,
 101     xpath_with_ns,
 102 )
 103
 104
 105 class InfoExtractor:
 106     """Information Extractor class.
 107
 108     Information extractors are the classes that, given a URL, extract
 109     information about the video (or videos) the URL refers to. This
 110     information includes the real video URL, the video title, author and
 111     others. The information is stored in a dictionary which is then
 112     passed to the YoutubeDL. The YoutubeDL processes this
 113     information possibly downloading the video to the file system, among
 114     other possible outcomes.
 115
 116     The _type field determines the type of the result.
 117     By far the most common value (and the default if _type is missing) is
 118     "video", which indicates a single video.
 119
 120     For a video, the dictionaries must include the following fields:
 121
 122     id:             Video identifier.
 123     title:          Video title, unescaped. Set to an empty string if video has
 124                     no title as opposed to "None" which signifies that the
 125                     extractor failed to obtain a title
 126
 127     Additionally, it must contain either a formats entry or a url one:
 128
 129     formats:        A list of dictionaries for each format available, ordered
 130                     from worst to best quality.
 131
 132                     Potential fields:
 133                     * url        The mandatory URL representing the media:
 134                                    for plain file media - HTTP URL of this file,
 135                                    for RTMP - RTMP URL,
 136                                    for HLS - URL of the M3U8 media playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH
 139                                      - HTTP URL to plain file media (in case of
 140                                        unfragmented media)
 141                                      - URL of the MPD manifest or base URL
 142                                        representing the media if MPD manifest
 143                                        is parsed from a string (in case of
 144                                        fragmented media)
 145                                    for MSS - URL of the ISM manifest.
 146                     * request_data  Data to send in POST request to the URL
 147                     * manifest_url
 148                                  The URL of the manifest file in case of
 149                                  fragmented media:
 150                                    for HLS - URL of the M3U8 master playlist,
 151                                    for HDS - URL of the F4M manifest,
 152                                    for DASH - URL of the MPD manifest,
 153                                    for MSS - URL of the ISM manifest.
 154                     * manifest_stream_number  (For internal use only)
 155                                  The index of the stream in the manifest file
 156                     * ext        Will be calculated from URL if missing
 157                     * format     A human-readable description of the format
 158                                  ("mp4 container with h264/opus").
 159                                  Calculated from the format_id, width, height,
 160                                  and format_note fields if missing.
 161                     * format_id  A short description of the format
 162                                  ("mp4_h264_opus" or "19").
 163                                 Technically optional, but strongly recommended.
 164                     * format_note Additional info about the format
 165                                  ("3D" or "DASH video")
 166                     * width      Width of the video, if known
 167                     * height     Height of the video, if known
 168                     * aspect_ratio  Aspect ratio of the video, if known
 169                                  Automatically calculated from width and height
 170                     * resolution Textual description of width and height
 171                                  Automatically calculated from width and height
 172                     * dynamic_range The dynamic range of the video. One of:
 173                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 174                     * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
 175                     * abr        Average audio bitrate in kbps (1000 bits/sec)
 176                     * acodec     Name of the audio codec in use
 177                     * asr        Audio sampling rate in Hertz
 178                     * audio_channels  Number of audio channels
 179                     * vbr        Average video bitrate in kbps (1000 bits/sec)
 180                     * fps        Frame rate
 181                     * vcodec     Name of the video codec in use
 182                     * container  Name of the container format
 183                     * filesize   The number of bytes, if known in advance
 184                     * filesize_approx  An estimate for the number of bytes
 185                     * player_url SWF Player URL (used for rtmpdump).
 186                     * protocol   The protocol that will be used for the actual
 187                                  download, lower-case. One of "http", "https" or
 188                                  one of the protocols defined in downloader.PROTOCOL_MAP
 189                     * fragment_base_url
 190                                  Base URL for fragments. Each fragment's path
 191                                  value (if present) will be relative to
 192                                  this URL.
 193                     * fragments  A list of fragments of a fragmented media.
 194                                  Each fragment entry must contain either an url
 195                                  or a path. If an url is present it should be
 196                                  considered by a client. Otherwise both path and
 197                                  fragment_base_url must be present. Here is
 198                                  the list of all potential fields:
 199                                  * "url" - fragment's URL
 200                                  * "path" - fragment's path relative to
 201                                             fragment_base_url
 202                                  * "duration" (optional, int or float)
 203                                  * "filesize" (optional, int)
 204                     * is_from_start  Is a live format that can be downloaded
 205                                 from the start. Boolean
 206                     * preference Order number of this format. If this field is
 207                                  present and not None, the formats get sorted
 208                                  by this field, regardless of all other values.
 209                                  -1 for default (order by other properties),
 210                                  -2 or smaller for less than default.
 211                                  < -1000 to hide the format (if there is
 212                                     another one which is strictly better)
 213                     * language   Language code, e.g. "de" or "en-US".
 214                     * language_preference  Is this in the language mentioned in
 215                                  the URL?
 216                                  10 if it's what the URL is about,
 217                                  -1 for default (don't know),
 218                                  -10 otherwise, other values reserved for now.
 219                     * quality    Order number of the video quality of this
 220                                  format, irrespective of the file format.
 221                                  -1 for default (order by other properties),
 222                                  -2 or smaller for less than default.
 223                     * source_preference  Order number for this video source
 224                                   (quality takes higher priority)
 225                                  -1 for default (order by other properties),
 226                                  -2 or smaller for less than default.
 227                     * http_headers  A dictionary of additional HTTP headers
 228                                  to add to the request.
 229                     * stretched_ratio  If given and not 1, indicates that the
 230                                  video's pixels are not square.
 231                                  width : height ratio as float.
 232                     * no_resume  The server does not support resuming the
 233                                  (HTTP or RTMP) download. Boolean.
 234                     * has_drm    True if the format has DRM and cannot be downloaded.
 235                                  'maybe' if the format may have DRM and has to be tested before download.
 236                     * extra_param_to_segment_url  A query string to append to each
 237                                  fragment's URL, or to update each existing query string
 238                                  with. Only applied by the native HLS/DASH downloaders.
 239                     * hls_aes    A dictionary of HLS AES-128 decryption information
 240                                  used by the native HLS downloader to override the
 241                                  values in the media playlist when an '#EXT-X-KEY' tag
 242                                  is present in the playlist:
 243                                  * uri  The URI from which the key will be downloaded
 244                                  * key  The key (as hex) used to decrypt fragments.
 245                                         If `key` is given, any key URI will be ignored
 246                                  * iv   The IV (as hex) used to decrypt fragments
 247                     * downloader_options  A dictionary of downloader options
 248                                  (For internal use only)
 249                                  * http_chunk_size Chunk size for HTTP downloads
 250                                  * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
 251                                  * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
 252                     * is_dash_periods  Whether the format is a result of merging
 253                                  multiple DASH periods.
 254                     RTMP formats can also have the additional fields: page_url,
 255                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 256                     rtmp_protocol, rtmp_real_time
 257
 258     url:            Final video URL.
 259     ext:            Video filename extension.
 260     format:         The video format, defaults to ext (used for --get-format)
 261     player_url:     SWF Player URL (used for rtmpdump).
 262
 263     The following fields are optional:
 264
 265     direct:         True if a direct video file was given (must only be set by GenericIE)
 266     alt_title:      A secondary title of the video.
 267     display_id:     An alternative identifier for the video, not necessarily
 268                     unique, but available before title. Typically, id is
 269                     something like "4234987", title "Dancing naked mole rats",
 270                     and display_id "dancing-naked-mole-rats"
 271     thumbnails:     A list of dictionaries, with the following entries:
 272                         * "id" (optional, string) - Thumbnail format ID
 273                         * "url"
 274                         * "preference" (optional, int) - quality of the image
 275                         * "width" (optional, int)
 276                         * "height" (optional, int)
 277                         * "resolution" (optional, string "{width}x{height}",
 278                                         deprecated)
 279                         * "filesize" (optional, int)
 280                         * "http_headers" (dict) - HTTP headers for the request
 281     thumbnail:      Full URL to a video thumbnail image.
 282     description:    Full video description.
 283     uploader:       Full name of the video uploader.
 284     license:        License name the video is licensed under.
 285     creators:       List of creators of the video.
 286     timestamp:      UNIX timestamp of the moment the video was uploaded
 287     upload_date:    Video upload date in UTC (YYYYMMDD).
 288                     If not explicitly set, calculated from timestamp
 289     release_timestamp: UNIX timestamp of the moment the video was released.
 290                     If it is not clear whether to use timestamp or this, use the former
 291     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 292                     If not explicitly set, calculated from release_timestamp
 293     release_year:   Year (YYYY) as integer when the video or album was released.
 294                     To be used if no exact release date is known.
 295                     If not explicitly set, calculated from release_date.
 296     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 297     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 298                     If not explicitly set, calculated from modified_timestamp
 299     uploader_id:    Nickname or id of the video uploader.
 300     uploader_url:   Full URL to a personal webpage of the video uploader.
 301     channel:        Full name of the channel the video is uploaded on.
 302                     Note that channel fields may or may not repeat uploader
 303                     fields. This depends on a particular extractor.
 304     channel_id:     Id of the channel.
 305     channel_url:    Full URL to a channel webpage.
 306     channel_follower_count: Number of followers of the channel.
 307     channel_is_verified: Whether the channel is verified on the platform.
 308     location:       Physical location where the video was filmed.
 309     subtitles:      The available subtitles as a dictionary in the format
 310                     {tag: subformats}. "tag" is usually a language code, and
 311                     "subformats" is a list sorted from lower to higher
 312                     preference, each element is a dictionary with the "ext"
 313                     entry and one of:
 314                         * "data": The subtitles file contents
 315                         * "url": A URL pointing to the subtitles file
 316                     It can optionally also have:
 317                         * "name": Name or description of the subtitles
 318                         * "http_headers": A dictionary of additional HTTP headers
 319                                   to add to the request.
 320                     "ext" will be calculated from URL if missing
 321     automatic_captions: Like 'subtitles'; contains automatically generated
 322                     captions instead of normal subtitles
 323     duration:       Length of the video in seconds, as an integer or float.
 324     view_count:     How many users have watched the video on the platform.
 325     concurrent_view_count: How many users are currently watching the video on the platform.
 326     like_count:     Number of positive ratings of the video
 327     dislike_count:  Number of negative ratings of the video
 328     repost_count:   Number of reposts of the video
 329     average_rating: Average rating given by users, the scale used depends on the webpage
 330     comment_count:  Number of comments on the video
 331     comments:       A list of comments, each with one or more of the following
 332                     properties (all but one of text or html optional):
 333                         * "author" - human-readable name of the comment author
 334                         * "author_id" - user ID of the comment author
 335                         * "author_thumbnail" - The thumbnail of the comment author
 336                         * "author_url" - The url to the comment author's page
 337                         * "author_is_verified" - Whether the author is verified
 338                                                  on the platform
 339                         * "author_is_uploader" - Whether the comment is made by
 340                                                  the video uploader
 341                         * "id" - Comment ID
 342                         * "html" - Comment as HTML
 343                         * "text" - Plain text of the comment
 344                         * "timestamp" - UNIX timestamp of comment
 345                         * "parent" - ID of the comment this one is replying to.
 346                                      Set to "root" to indicate that this is a
 347                                      comment to the original video.
 348                         * "like_count" - Number of positive ratings of the comment
 349                         * "dislike_count" - Number of negative ratings of the comment
 350                         * "is_favorited" - Whether the comment is marked as
 351                                            favorite by the video uploader
 352                         * "is_pinned" - Whether the comment is pinned to
 353                                         the top of the comments
 354     age_limit:      Age restriction for the video, as an integer (years)
 355     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 356                     should allow to get the same result again. (It will be set
 357                     by YoutubeDL if it's missing)
 358     categories:     A list of categories that the video falls in, for example
 359                     ["Sports", "Berlin"]
 360     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 361     cast:           A list of the video cast
 362     is_live:        True, False, or None (=unknown). Whether this video is a
 363                     live stream that goes on instead of a fixed-length video.
 364     was_live:       True, False, or None (=unknown). Whether this video was
 365                     originally a live stream.
 366     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 367                     or 'post_live' (was live, but VOD is not yet processed)
 368                     If absent, automatically set from is_live, was_live
 369     start_time:     Time in seconds where the reproduction should start, as
 370                     specified in the URL.
 371     end_time:       Time in seconds where the reproduction should end, as
 372                     specified in the URL.
 373     chapters:       A list of dictionaries, with the following entries:
 374                         * "start_time" - The start time of the chapter in seconds
 375                         * "end_time" - The end time of the chapter in seconds
 376                         * "title" (optional, string)
 377     heatmap:        A list of dictionaries, with the following entries:
 378                         * "start_time" - The start time of the data point in seconds
 379                         * "end_time" - The end time of the data point in seconds
 380                         * "value" - The normalized value of the data point (float between 0 and 1)
 381     playable_in_embed: Whether this video is allowed to play in embedded
 382                     players on other sites. Can be True (=always allowed),
 383                     False (=never allowed), None (=unknown), or a string
 384                     specifying the criteria for embedability; e.g. 'whitelist'
 385     availability:   Under what condition the video is available. One of
 386                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 387                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 388                     to set it
 389     media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
 390     _old_archive_ids: A list of old archive ids needed for backward compatibility
 391     _format_sort_fields: A list of fields to use for sorting formats
 392     __post_extractor: A function to be called just before the metadata is
 393                     written to either disk, logger or console. The function
 394                     must return a dict which will be added to the info_dict.
 395                     This is useful for additional information that is
 396                     time-consuming to extract. Note that the fields thus
 397                     extracted will not be available to output template and
 398                     match_filter. So, only "comments" and "comment_count" are
 399                     currently allowed to be extracted via this method.
 400
 401     The following fields should only be used when the video belongs to some logical
 402     chapter or section:
 403
 404     chapter:        Name or title of the chapter the video belongs to.
 405     chapter_number: Number of the chapter the video belongs to, as an integer.
 406     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 407
 408     The following fields should only be used when the video is an episode of some
 409     series, programme or podcast:
 410
 411     series:         Title of the series or programme the video episode belongs to.
 412     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 413     season:         Title of the season the video episode belongs to.
 414     season_number:  Number of the season the video episode belongs to, as an integer.
 415     season_id:      Id of the season the video episode belongs to, as a unicode string.
 416     episode:        Title of the video episode. Unlike mandatory video title field,
 417                     this field should denote the exact title of the video episode
 418                     without any kind of decoration.
 419     episode_number: Number of the video episode within a season, as an integer.
 420     episode_id:     Id of the video episode, as a unicode string.
 421
 422     The following fields should only be used when the media is a track or a part of
 423     a music album:
 424
 425     track:          Title of the track.
 426     track_number:   Number of the track within an album or a disc, as an integer.
 427     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 428                     as a unicode string.
 429     artists:        List of artists of the track.
 430     composers:      List of composers of the piece.
 431     genres:         List of genres of the track.
 432     album:          Title of the album the track belongs to.
 433     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 434     album_artists:  List of all artists that appeared on the album.
 435                     E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
 436                     Useful for splits and compilations.
 437     disc_number:    Number of the disc or other physical medium the track belongs to,
 438                     as an integer.
 439
 440     The following fields should only be set for clips that should be cut from the original video:
 441
 442     section_start:  Start time of the section in seconds
 443     section_end:    End time of the section in seconds
 444
 445     The following fields should only be set for storyboards:
 446     rows:           Number of rows in each storyboard fragment, as an integer
 447     columns:        Number of columns in each storyboard fragment, as an integer
 448
 449     The following fields are deprecated and should not be set by new code:
 450     composer:       Use "composers" instead.
 451                     Composer(s) of the piece, comma-separated.
 452     artist:         Use "artists" instead.
 453                     Artist(s) of the track, comma-separated.
 454     genre:          Use "genres" instead.
 455                     Genre(s) of the track, comma-separated.
 456     album_artist:   Use "album_artists" instead.
 457                     All artists that appeared on the album, comma-separated.
 458     creator:        Use "creators" instead.
 459                     The creator of the video.
 460
 461     Unless mentioned otherwise, the fields should be Unicode strings.
 462
 463     Unless mentioned otherwise, None is equivalent to absence of information.
 464
 465
 466     _type "playlist" indicates multiple videos.
 467     There must be a key "entries", which is a list, an iterable, or a PagedList
 468     object, each element of which is a valid dictionary by this specification.
 469
 470     Additionally, playlists can have "id", "title", and any other relevant
 471     attributes with the same semantics as videos (see above).
 472
 473     It can also have the following optional fields:
 474
 475     playlist_count: The total number of videos in a playlist. If not given,
 476                     YoutubeDL tries to calculate it from "entries"
 477
 478
 479     _type "multi_video" indicates that there are multiple videos that
 480     form a single show, for example multiple acts of an opera or TV episode.
 481     It must have an entries key like a playlist and contain all the keys
 482     required for a video at the same time.
 483
 484
 485     _type "url" indicates that the video must be extracted from another
 486     location, possibly by a different extractor. Its only required key is:
 487     "url" - the next URL to extract.
 488     The key "ie_key" can be set to the class name (minus the trailing "IE",
 489     e.g. "Youtube") if the extractor class is known in advance.
 490     Additionally, the dictionary may have any properties of the resolved entity
 491     known in advance, for example "title" if the title of the referred video is
 492     known ahead of time.
 493
 494
 495     _type "url_transparent" entities have the same specification as "url", but
 496     indicate that the given additional information is more precise than the one
 497     associated with the resolved URL.
 498     This is useful when a site employs a video service that hosts the video and
 499     its technical metadata, but that video service does not embed a useful
 500     title, description etc.
 501
 502
 503     Subclasses of this should also be added to the list of extractors and
 504     should define _VALID_URL as a regexp or a Sequence of regexps, and
 505     re-define the _real_extract() and (optionally) _real_initialize() methods.
 506
 507     Subclasses may also override suitable() if necessary, but ensure the function
 508     signature is preserved and that this function imports everything it needs
 509     (except other extractors), so that lazy_extractors works correctly.
 510
 511     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 512     the HTML of Generic webpages. It may also override _extract_embed_urls
 513     or _extract_from_webpage as necessary. While these are normally classmethods,
 514     _extract_from_webpage is allowed to be an instance method.
 515
 516     _extract_from_webpage may raise self.StopExtraction() to stop further
 517     processing of the webpage and obtain exclusive rights to it. This is useful
 518     when the extractor cannot reliably be matched using just the URL,
 519     e.g. invidious/peertube instances
 520
 521     Embed-only extractors can be defined by setting _VALID_URL = False.
 522
 523     To support username + password (or netrc) login, the extractor must define a
 524     _NETRC_MACHINE and re-define _perform_login(username, password) and
 525     (optionally) _initialize_pre_login() methods. The _perform_login method will
 526     be called between _initialize_pre_login and _real_initialize if credentials
 527     are passed by the user. In cases where it is necessary to have the login
 528     process as part of the extraction rather than initialization, _perform_login
 529     can be left undefined.
 530
 531     _GEO_BYPASS attribute may be set to False in order to disable
 532     geo restriction bypass mechanisms for a particular extractor.
 533     Though it won't disable explicit geo restriction bypass based on
 534     country code provided with geo_bypass_country.
 535
 536     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 537     countries for this extractor. One of these countries will be used by
 538     geo restriction bypass mechanism right away in order to bypass
 539     geo restriction, of course, if the mechanism is not disabled.
 540
 541     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 542     IP blocks in CIDR notation for this extractor. One of these IP blocks
 543     will be used by geo restriction bypass mechanism similarly
 544     to _GEO_COUNTRIES.
 545
 546     The _ENABLED attribute should be set to False for IEs that
 547     are disabled by default and must be explicitly enabled.
 548
 549     The _WORKING attribute should be set to False for broken IEs
 550     in order to warn the users and skip the tests.
 551     """
 552
    # Class-level defaults; several are overwritten per instance in __init__().

    # Becomes True once initialize() has run the one-time setup
    _ready = False
    # Downloader object assigned through set_downloader()
    # (a YoutubeDL instance according to the __init__ docstring)
    _downloader = None
    # Fake IP sent as X-Forwarded-For for geo bypass; None while none is chosen
    _x_forwarded_for_ip = None
    # Geo restriction bypass settings (see the class docstring above)
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # False marks the IE as broken (see the class docstring above)
    _WORKING = True
    # False marks the IE as disabled by default (see the class docstring above)
    _ENABLED = True
    # A truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the IE; initialize()
    # only checks whether it is False - confirm against callers
    IE_DESC = None
    # NOTE(review): not used in this part of the file; presumably the search prefix
    # of search extractors - confirm
    SEARCH_KEY = None
    # URL regex(es) for this IE; False declares an embed-only extractor
    _VALID_URL = None
    # Regexes searched for in the HTML of Generic webpages (see the class docstring above)
    _EMBED_REGEX = []
 566
 567     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 568         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 569         return {
 570             None: '',
 571             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 572             'password': f'Use {password_hint}',
 573             'cookies': (
 574                 'Use --cookies-from-browser or --cookies for the authentication. '
 575                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 576         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 577
 578     def __init__(self, downloader=None):
 579         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 580         If a downloader is not passed during initialization,
 581         it must be set using "set_downloader()" before "extract()" is called"""
 582         self._ready = False
 583         self._x_forwarded_for_ip = None
 584         self._printed_messages = set()
 585         self.set_downloader(downloader)
 586
 587     @classmethod
 588     def _match_valid_url(cls, url):
 589         if cls._VALID_URL is False:
 590             return None
 591         # This does not use has/getattr intentionally - we want to know whether
 592         # we have cached the regexp for *this* class, whereas getattr would also
 593         # match the superclass
 594         if '_VALID_URL_RE' not in cls.__dict__:
 595             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 596         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 597
 598     @classmethod
 599     def suitable(cls, url):
 600         """Receives a URL and returns True if suitable for this IE."""
 601         # This function must import everything it needs (except other extractors),
 602         # so that lazy_extractors works correctly
 603         return cls._match_valid_url(url) is not None
 604
 605     @classmethod
 606     def _match_id(cls, url):
 607         return cls._match_valid_url(url).group('id')
 608
 609     @classmethod
 610     def get_temp_id(cls, url):
 611         try:
 612             return cls._match_id(url)
 613         except (IndexError, AttributeError):
 614             return None
 615
 616     @classmethod
 617     def working(cls):
 618         """Getter method for _WORKING."""
 619         return cls._WORKING
 620
 621     @classmethod
 622     def supports_login(cls):
 623         return bool(cls._NETRC_MACHINE)
 624
 625     def initialize(self):
 626         """Initializes an instance (authentication, etc)."""
 627         self._printed_messages = set()
 628         self._initialize_geo_bypass({
 629             'countries': self._GEO_COUNTRIES,
 630             'ip_blocks': self._GEO_IP_BLOCKS,
 631         })
 632         if not self._ready:
 633             self._initialize_pre_login()
 634             if self.supports_login():
 635                 username, password = self._get_login_info()
 636                 if username:
 637                     self._perform_login(username, password)
 638             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 639                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 640             self._real_initialize()
 641             self._ready = True
 642
 643     def _initialize_geo_bypass(self, geo_bypass_context):
 644         """
 645         Initialize geo restriction bypass mechanism.
 646
 647         This method is used to initialize geo bypass mechanism based on faking
 648         X-Forwarded-For HTTP header. A random country from provided country list
 649         is selected and a random IP belonging to this country is generated. This
 650         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 651         HTTP requests.
 652
 653         This method will be used for initial geo bypass mechanism initialization
 654         during the instance initialization with _GEO_COUNTRIES and
 655         _GEO_IP_BLOCKS.
 656
 657         You may also manually call it from extractor's code if geo bypass
 658         information is not available beforehand (e.g. obtained during
 659         extraction) or due to some other reason. In this case you should pass
 660         this information in geo bypass context passed as first argument. It may
 661         contain following fields:
 662
 663         countries:  List of geo unrestricted countries (similar
 664                     to _GEO_COUNTRIES)
 665         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 666                     (similar to _GEO_IP_BLOCKS)
 667
 668         """
 669         if not self._x_forwarded_for_ip:
 670
 671             # Geo bypass mechanism is explicitly disabled by user
 672             if not self.get_param('geo_bypass', True):
 673                 return
 674
 675             if not geo_bypass_context:
 676                 geo_bypass_context = {}
 677
 678             # Backward compatibility: previously _initialize_geo_bypass
 679             # expected a list of countries, some 3rd party code may still use
 680             # it this way
 681             if isinstance(geo_bypass_context, (list, tuple)):
 682                 geo_bypass_context = {
 683                     'countries': geo_bypass_context,
 684                 }
 685
 686             # The whole point of geo bypass mechanism is to fake IP
 687             # as X-Forwarded-For HTTP header based on some IP block or
 688             # country code.
 689
 690             # Path 1: bypassing based on IP block in CIDR notation
 691
 692             # Explicit IP block specified by user, use it right away
 693             # regardless of whether extractor is geo bypassable or not
 694             ip_block = self.get_param('geo_bypass_ip_block', None)
 695
 696             # Otherwise use random IP block from geo bypass context but only
 697             # if extractor is known as geo bypassable
 698             if not ip_block:
 699                 ip_blocks = geo_bypass_context.get('ip_blocks')
 700                 if self._GEO_BYPASS and ip_blocks:
 701                     ip_block = random.choice(ip_blocks)
 702
 703             if ip_block:
 704                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 705                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 706                 return
 707
 708             # Path 2: bypassing based on country code
 709
 710             # Explicit country code specified by user, use it right away
 711             # regardless of whether extractor is geo bypassable or not
 712             country = self.get_param('geo_bypass_country', None)
 713
 714             # Otherwise use random country code from geo bypass context but
 715             # only if extractor is known as geo bypassable
 716             if not country:
 717                 countries = geo_bypass_context.get('countries')
 718                 if self._GEO_BYPASS and countries:
 719                     country = random.choice(countries)
 720
 721             if country:
 722                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 723                 self._downloader.write_debug(
 724                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 725
    def extract(self, url):
        """Extracts URL information and returns the result of _real_extract().

        The instance is initialized before every attempt. The result is
        returned as is when it is None; otherwise it is treated as a dict
        (item assignment and .get() are used on it): the fake
        X-Forwarded-For IP, if any, is stored under '__x_forwarded_for_ip',
        and with the 'no-live-chat' compat option the 'live_chat',
        'comments' and 'danmaku' entries are dropped from its subtitles.

        Raises ExtractorError; IncompleteRead, KeyError and StopIteration
        escaping from the extraction are converted to ExtractorError.
        """
        try:
            # At most two attempts: the second one happens only after a geo
            # restriction error for which a fake IP could be set up
            for _ in range(2):
                try:
                    self.initialize()
                    # The URL is shown in full only in verbose mode
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    # The fallback dict is not stored back into ie_result, so
                    # the pops below only have an effect on existing subtitles
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            # Must come before the ExtractorError handler so that it is
            # re-raised without being annotated below
            raise
        except ExtractorError as e:
            # Fill in the details that the raiser did not provide
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
            raise
        except IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 759
 760     def __maybe_fake_ip_and_retry(self, countries):
 761         if (not self.get_param('geo_bypass_country', None)
 762                 and self._GEO_BYPASS
 763                 and self.get_param('geo_bypass', True)
 764                 and not self._x_forwarded_for_ip
 765                 and countries):
 766             country_code = random.choice(countries)
 767             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 768             if self._x_forwarded_for_ip:
 769                 self.report_warning(
 770                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 771                     % (self._x_forwarded_for_ip, country_code.upper()))
 772                 return True
 773         return False
 774
 775     def set_downloader(self, downloader):
 776         """Sets a YoutubeDL instance as the downloader for this IE."""
 777         self._downloader = downloader
 778
 779     @property
 780     def cache(self):
 781         return self._downloader.cache
 782
 783     @property
 784     def cookiejar(self):
 785         return self._downloader.cookiejar
 786
 787     def _initialize_pre_login(self):
 788         """ Initialization before login. Redefine in subclasses."""
 789         pass
 790
 791     def _perform_login(self, username, password):
 792         """ Login with username and password. Redefine in subclasses."""
 793         pass
 794
 795     def _real_initialize(self):
 796         """Real initialization process. Redefine in subclasses."""
 797         pass
 798
 799     def _real_extract(self, url):
 800         """Real extraction process. Redefine in subclasses."""
 801         raise NotImplementedError('This method must be implemented by subclasses')
 802
 803     @classmethod
 804     def ie_key(cls):
 805         """A string for getting the InfoExtractor with get_info_extractor"""
 806         return cls.__name__[:-2]
 807
 808     @classproperty
 809     def IE_NAME(cls):
 810         return cls.__name__[:-2]
 811
 812     @staticmethod
 813     def __can_accept_status_code(err, expected_status):
 814         assert isinstance(err, HTTPError)
 815         if expected_status is None:
 816             return False
 817         elif callable(expected_status):
 818             return expected_status(err.status) is True
 819         else:
 820             return err.status in variadic(expected_status)
 821
 822     def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
 823         if isinstance(url_or_request, urllib.request.Request):
 824             self._downloader.deprecation_warning(
 825                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 826                 'Use yt_dlp.networking.common.Request instead.')
 827             url_or_request = urllib_req_to_req(url_or_request)
 828         elif not isinstance(url_or_request, Request):
 829             url_or_request = Request(url_or_request)
 830
 831         url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
 832         return url_or_request
 833
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
                         headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
        """
        Return the response handle.

        False is returned instead when the request fails with a network error
        and either errnote is False or fatal is false. The response attached
        to an HTTPError is returned if its status is accepted by
        expected_status.

        See _download_webpage_handle docstring for arguments specification.
        """
        # Honor sleep_interval_requests before every request except the
        # first one, which only clears the downloader's flag
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        # note=None prints the default message, note=False prints nothing
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Work on a copy so that the caller's dict is not modified; an
            # explicitly passed X-Forwarded-For header takes precedence
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        extensions = {}

        # True or '' requests a default-constructed ImpersonateTarget
        if impersonate in (True, ''):
            impersonate = ImpersonateTarget()
        requested_targets = [
            t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
            for t in variadic(impersonate)
        ] if impersonate else []

        # Use the first requested target that the downloader reports as available
        available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
        if available_target:
            extensions['impersonate'] = available_target
        elif requested_targets:
            # Impersonation was requested but is unavailable: this is an error
            # only with require_impersonation, otherwise a one-time warning
            message = 'The extractor is attempting impersonation, but '
            message += (
                'no impersonate target is available' if not str(impersonate)
                else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
            info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
                        'for information on installing the required dependencies')
            if require_impersonation:
                raise ExtractorError(f'{message}; {info_msg}', expected=True)
            self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
        except network_exceptions as err:
            if isinstance(err, HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.response

            # errnote=False silences the failure completely, regardless of fatal
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False
 907
 908     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 909                                  encoding=None, data=None, headers={}, query={}, expected_status=None,
 910                                  impersonate=None, require_impersonation=False):
 911         """
 912         Return a tuple (page content as string, URL handle).
 913
 914         Arguments:
 915         url_or_request -- plain text URL as a string or
 916             a yt_dlp.networking.Request object
 917         video_id -- Video/playlist/item identifier (string)
 918
 919         Keyword arguments:
 920         note -- note printed before downloading (string)
 921         errnote -- note printed in case of an error (string)
 922         fatal -- flag denoting whether error should be considered fatal,
 923             i.e. whether it should cause ExtractionError to be raised,
 924             otherwise a warning will be reported and extraction continued
 925         encoding -- encoding for a page content decoding, guessed automatically
 926             when not explicitly specified
 927         data -- POST data (bytes)
 928         headers -- HTTP headers (dict)
 929         query -- URL query (dict)
 930         expected_status -- allows to accept failed HTTP requests (non 2xx
 931             status code) by explicitly specifying a set of accepted status
 932             codes. Can be any of the following entities:
 933                 - an integer type specifying an exact failed status code to
 934                   accept
 935                 - a list or a tuple of integer types specifying a list of
 936                   failed status codes to accept
 937                 - a callable accepting an actual failed status code and
 938                   returning True if it should be accepted
 939             Note that this argument does not affect success status codes (2xx)
 940             which are always accepted.
 941         impersonate -- the impersonate target. Can be any of the following entities:
 942                 - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
 943                 - a string in the format of CLIENT[:OS]
 944                 - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
 945                 - a boolean value; True means any impersonate target is sufficient
 946         require_impersonation -- flag to toggle whether the request should raise an error
 947             if impersonation is not possible (bool, default: False)
 948         """
 949
 950         # Strip hashes from the URL (#1038)
 951         if isinstance(url_or_request, str):
 952             url_or_request = url_or_request.partition('#')[0]
 953
 954         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
 955                                      headers=headers, query=query, expected_status=expected_status,
 956                                      impersonate=impersonate, require_impersonation=require_impersonation)
 957         if urlh is False:
 958             assert not fatal
 959             return False
 960         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 961         return (content, urlh)
 962
 963     @staticmethod
 964     def _guess_encoding_from_content(content_type, webpage_bytes):
 965         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 966         if m:
 967             encoding = m.group(1)
 968         else:
 969             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 970                           webpage_bytes[:1024])
 971             if m:
 972                 encoding = m.group(1).decode('ascii')
 973             elif webpage_bytes.startswith(b'\xff\xfe'):
 974                 encoding = 'utf-16'
 975             else:
 976                 encoding = 'utf-8'
 977
 978         return encoding
 979
 980     def __check_blocked(self, content):
 981         first_block = content[:512]
 982         if ('<title>Access to this site is blocked</title>' in content
 983                 and 'Websense' in first_block):
 984             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 985             blocked_iframe = self._html_search_regex(
 986                 r'<iframe src="([^"]+)"', content,
 987                 'Websense information URL', default=None)
 988             if blocked_iframe:
 989                 msg += ' Visit %s for more details' % blocked_iframe
 990             raise ExtractorError(msg, expected=True)
 991         if '<title>The URL you requested has been blocked</title>' in first_block:
 992             msg = (
 993                 'Access to this webpage has been blocked by Indian censorship. '
 994                 'Use a VPN or proxy server (with --proxy) to route around it.')
 995             block_msg = self._html_search_regex(
 996                 r'</h1><p>(.*?)</p>',
 997                 content, 'block message', default=None)
 998             if block_msg:
 999                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
1000             raise ExtractorError(msg, expected=True)
1001         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
1002                 and 'blocklist.rkn.gov.ru' in content):
1003             raise ExtractorError(
1004                 'Access to this webpage has been blocked by decision of the Russian government. '
1005                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
1006                 expected=True)
1007
1008     def _request_dump_filename(self, url, video_id):
1009         basen = f'{video_id}_{url}'
1010         trim_length = self.get_param('trim_file_name') or 240
1011         if len(basen) > trim_length:
1012             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
1013             basen = basen[:trim_length - len(h)] + h
1014         filename = sanitize_filename(f'{basen}.dump', restricted=True)
1015         # Working around MAX_PATH limitation on Windows (see
1016         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
1017         if compat_os_name == 'nt':
1018             absfilepath = os.path.abspath(filename)
1019             if len(absfilepath) > 259:
1020                 filename = fR'\\?\{absfilepath}'
1021         return filename
1022
1023     def __decode_webpage(self, webpage_bytes, encoding, headers):
1024         if not encoding:
1025             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
1026         try:
1027             return webpage_bytes.decode(encoding, 'replace')
1028         except LookupError:
1029             return webpage_bytes.decode('utf-8', 'replace')
1030
1031     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
1032         webpage_bytes = urlh.read()
1033         if prefix is not None:
1034             webpage_bytes = prefix + webpage_bytes
1035         if self.get_param('dump_intermediate_pages', False):
1036             self.to_screen('Dumping request to ' + urlh.url)
1037             dump = base64.b64encode(webpage_bytes).decode('ascii')
1038             self._downloader.to_screen(dump)
1039         if self.get_param('write_pages'):
1040             filename = self._request_dump_filename(urlh.url, video_id)
1041             self.to_screen(f'Saving request to {filename}')
1042             with open(filename, 'wb') as outf:
1043                 outf.write(webpage_bytes)
1044
1045         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
1046         self.__check_blocked(content)
1047
1048         return content
1049
1050     def __print_error(self, errnote, fatal, video_id, err):
1051         if fatal:
1052             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1053         elif errnote:
1054             self.report_warning(f'{video_id}: {errnote}: {err}')
1055
1056     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1057         if transform_source:
1058             xml_string = transform_source(xml_string)
1059         try:
1060             return compat_etree_fromstring(xml_string.encode('utf-8'))
1061         except xml.etree.ElementTree.ParseError as ve:
1062             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1063
1064     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1065         try:
1066             return json.loads(
1067                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1068         except ValueError as ve:
1069             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1070
1071     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1072         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1073
    def __create_download_methods(name, parser, note, errnote, return_value):
        # Factory that is called from the class body (hence no self) to build the
        # method pair _download_<name>_handle / _download_<name>.
        #   name          -- infix of the generated method names
        #   parser        -- name of the method used to parse the downloaded
        #                    content, or None to return the content unparsed
        #   note, errnote -- default messages of the generated methods
        #   return_value  -- description of the result, used in the docstrings
        # Returns the tuple (download_handle, download_content)

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is passed on to the parser only when it is False;
            # otherwise the parser's own default is used
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Downloads the page and returns (parsed content, URL handle),
        # or False if the download returned False
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                            impersonate=None, require_impersonation=False):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status,
                impersonate=impersonate, require_impersonation=require_impersonation)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Same as download_handle, but returns only the parsed content. With the
        # load_pages param, the content is first looked for in a dump file
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                             impersonate=None, require_impersonation=False):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # Not fatal: fall through to the regular download below
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
                'impersonate': impersonate,
                'require_impersonation': require_impersonation,
            }
            # Without a parser, the handle method that gets called is presumably
            # _download_webpage_handle, which has no transform_source parameter
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Gives func the name, qualified name and docstring of a regular
        # InfoExtractor method. Not related to the impersonate parameter of the
        # functions above, which is a separate local name in their own scope
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1142
    # Generate the download methods for each content type; every call returns
    # the pair (_download_<name>_handle, _download_<name>)
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For plain webpages only the content-returning function is kept, as a
    # private helper: the handle variant and the public _download_webpage are
    # defined explicitly in this class
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1150
1151     def _download_webpage(
1152             self, url_or_request, video_id, note=None, errnote=None,
1153             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1154         """
1155         Return the data of the page as a string.
1156
1157         Keyword arguments:
1158         tries -- number of tries
1159         timeout -- sleep interval between tries
1160
1161         See _download_webpage_handle docstring for other arguments specification.
1162         """
1163
1164         R''' # NB: These are unused; should they be deprecated?
1165         if tries != 1:
1166             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1167         if timeout is NO_DEFAULT:
1168             timeout = 5
1169         else:
1170             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1171         '''
1172
1173         try_count = 0
1174         while True:
1175             try:
1176                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1177             except IncompleteRead as e:
1178                 try_count += 1
1179                 if try_count >= tries:
1180                     raise e
1181                 self._sleep(timeout, video_id)
1182
1183     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1184         idstr = format_field(video_id, None, '%s: ')
1185         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1186         if only_once:
1187             if f'WARNING: {msg}' in self._printed_messages:
1188                 return
1189             self._printed_messages.add(f'WARNING: {msg}')
1190         self._downloader.report_warning(msg, *args, **kwargs)
1191
1192     def to_screen(self, msg, *args, **kwargs):
1193         """Print msg to screen, prefixing it with '[ie_name]'"""
1194         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1195
1196     def write_debug(self, msg, *args, **kwargs):
1197         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1198
1199     def get_param(self, name, default=None, *args, **kwargs):
1200         if self._downloader:
1201             return self._downloader.params.get(name, default, *args, **kwargs)
1202         return default
1203
1204     def report_drm(self, video_id, partial=NO_DEFAULT):
1205         if partial is not NO_DEFAULT:
1206             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1207         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1208
1209     def report_extraction(self, id_or_name):
1210         """Report information extraction."""
1211         self.to_screen('%s: Extracting information' % id_or_name)
1212
1213     def report_download_webpage(self, video_id):
1214         """Report webpage download."""
1215         self.to_screen('%s: Downloading webpage' % video_id)
1216
1217     def report_age_confirmation(self):
1218         """Report attempt to confirm age."""
1219         self.to_screen('Confirming age')
1220
1221     def report_login(self):
1222         """Report attempt to log in."""
1223         self.to_screen('Logging in')
1224
1225     def raise_login_required(
1226             self, msg='This video is only available for registered users',
1227             metadata_available=False, method=NO_DEFAULT):
1228         if metadata_available and (
1229                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1230             self.report_warning(msg)
1231             return
1232         msg += format_field(self._login_hint(method), None, '. %s')
1233         raise ExtractorError(msg, expected=True)
1234
1235     def raise_geo_restricted(
1236             self, msg='This video is not available from your location due to geo restriction',
1237             countries=None, metadata_available=False):
1238         if metadata_available and (
1239                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1240             self.report_warning(msg)
1241         else:
1242             raise GeoRestrictedError(msg, countries=countries)
1243
1244     def raise_no_formats(self, msg, expected=False, video_id=None):
1245         if expected and (
1246                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1247             self.report_warning(msg, video_id)
1248         elif isinstance(msg, ExtractorError):
1249             raise msg
1250         else:
1251             raise ExtractorError(msg, expected=expected, video_id=video_id)
1252
1253     # Methods for following #608
1254     @staticmethod
1255     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1256         """Returns a URL that points to a page that should be processed"""
1257         if ie is not None:
1258             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1259         if video_id is not None:
1260             kwargs['id'] = video_id
1261         if video_title is not None:
1262             kwargs['title'] = video_title
1263         return {
1264             **kwargs,
1265             '_type': 'url_transparent' if url_transparent else 'url',
1266             'url': url,
1267         }
1268
1269     @classmethod
1270     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1271                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1272         return cls.playlist_result(
1273             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1274             playlist_id, playlist_title, **kwargs)
1275
1276     @staticmethod
1277     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1278         """Returns a playlist"""
1279         if playlist_id:
1280             kwargs['id'] = playlist_id
1281         if playlist_title:
1282             kwargs['title'] = playlist_title
1283         if playlist_description is not None:
1284             kwargs['description'] = playlist_description
1285         return {
1286             **kwargs,
1287             '_type': 'multi_video' if multi_video else 'playlist',
1288             'entries': entries,
1289         }
1290
1291     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1292         """
1293         Perform a regex search on the given string, using a single or a list of
1294         patterns returning the first matching group.
1295         In case of failure return a default value or raise a WARNING or a
1296         RegexNotFoundError, depending on fatal, specifying the field name.
1297         """
1298         if string is None:
1299             mobj = None
1300         elif isinstance(pattern, (str, re.Pattern)):
1301             mobj = re.search(pattern, string, flags)
1302         else:
1303             for p in pattern:
1304                 mobj = re.search(p, string, flags)
1305                 if mobj:
1306                     break
1307
1308         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1309
1310         if mobj:
1311             if group is None:
1312                 # return the first matching group
1313                 return next(g for g in mobj.groups() if g is not None)
1314             elif isinstance(group, (list, tuple)):
1315                 return tuple(mobj.group(g) for g in group)
1316             else:
1317                 return mobj.group(group)
1318         elif default is not NO_DEFAULT:
1319             return default
1320         elif fatal:
1321             raise RegexNotFoundError('Unable to extract %s' % _name)
1322         else:
1323             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1324             return None
1325
1326     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1327                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1328         """Searches string for the JSON object specified by start_pattern"""
1329         # NB: end_pattern is only used to reduce the size of the initial match
1330         if default is NO_DEFAULT:
1331             default, has_default = {}, False
1332         else:
1333             fatal, has_default = False, True
1334
1335         json_string = self._search_regex(
1336             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1337             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1338         if not json_string:
1339             return default
1340
1341         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1342         try:
1343             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1344         except ExtractorError as e:
1345             if fatal:
1346                 raise ExtractorError(
1347                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1348             elif not has_default:
1349                 self.report_warning(
1350                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1351         return default
1352
1353     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1354         """
1355         Like _search_regex, but strips HTML tags and unescapes entities.
1356         """
1357         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1358         if isinstance(res, tuple):
1359             return tuple(map(clean_html, res))
1360         return clean_html(res)
1361
1362     def _get_netrc_login_info(self, netrc_machine=None):
1363         netrc_machine = netrc_machine or self._NETRC_MACHINE
1364
1365         cmd = self.get_param('netrc_cmd')
1366         if cmd:
1367             cmd = cmd.replace('{}', netrc_machine)
1368             self.to_screen(f'Executing command: {cmd}')
1369             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1370             if ret != 0:
1371                 raise OSError(f'Command returned error code {ret}')
1372             info = netrc_from_content(stdout).authenticators(netrc_machine)
1373
1374         elif self.get_param('usenetrc', False):
1375             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1376             if os.path.isdir(netrc_file):
1377                 netrc_file = os.path.join(netrc_file, '.netrc')
1378             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1379
1380         else:
1381             return None, None
1382         if not info:
1383             self.to_screen(f'No authenticators for {netrc_machine}')
1384             return None, None
1385
1386         self.write_debug(f'Using netrc for {netrc_machine} authentication')
1387         return info[0], info[2]
1388
1389     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1390         """
1391         Get the login info as (username, password)
1392         First look for the manually specified credentials using username_option
1393         and password_option as keys in params dictionary. If no such credentials
1394         are available try the netrc_cmd if it is defined or look in the
1395         netrc file using the netrc_machine or _NETRC_MACHINE value.
1396         If there's no info available, return (None, None)
1397         """
1398
1399         username = self.get_param(username_option)
1400         if username is not None:
1401             password = self.get_param(password_option)
1402         else:
1403             try:
1404                 username, password = self._get_netrc_login_info(netrc_machine)
1405             except (OSError, netrc.NetrcParseError) as err:
1406                 self.report_warning(f'Failed to parse .netrc: {err}')
1407                 return None, None
1408         return username, password
1409
1410     def _get_tfa_info(self, note='two-factor verification code'):
1411         """
1412         Get the two-factor authentication info
1413         TODO - asking the user will be required for sms/phone verify
1414         currently just uses the command line option
1415         If there's no info available, return None
1416         """
1417
1418         tfa = self.get_param('twofactor')
1419         if tfa is not None:
1420             return tfa
1421
1422         return getpass.getpass('Type %s and press [Return]: ' % note)
1423
1424     # Helper functions for extracting OpenGraph info
1425     @staticmethod
1426     def _og_regexes(prop):
1427         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1428         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1429                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1430         template = r'<meta[^>]+?%s[^>]+?%s'
1431         return [
1432             template % (property_re, content_re),
1433             template % (content_re, property_re),
1434         ]
1435
1436     @staticmethod
1437     def _meta_regex(prop):
1438         return r'''(?isx)<meta
1439                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1440                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1441
1442     def _og_search_property(self, prop, html, name=None, **kargs):
1443         prop = variadic(prop)
1444         if name is None:
1445             name = 'OpenGraph %s' % prop[0]
1446         og_regexes = []
1447         for p in prop:
1448             og_regexes.extend(self._og_regexes(p))
1449         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1450         if escaped is None:
1451             return None
1452         return unescapeHTML(escaped)
1453
1454     def _og_search_thumbnail(self, html, **kargs):
1455         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1456
1457     def _og_search_description(self, html, **kargs):
1458         return self._og_search_property('description', html, fatal=False, **kargs)
1459
1460     def _og_search_title(self, html, *, fatal=False, **kargs):
1461         return self._og_search_property('title', html, fatal=fatal, **kargs)
1462
1463     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1464         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1465         if secure:
1466             regexes = self._og_regexes('video:secure_url') + regexes
1467         return self._html_search_regex(regexes, html, name, **kargs)
1468
1469     def _og_search_url(self, html, **kargs):
1470         return self._og_search_property('url', html, **kargs)
1471
1472     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1473         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1474
1475     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1476         name = variadic(name)
1477         if display_name is None:
1478             display_name = name[0]
1479         return self._html_search_regex(
1480             [self._meta_regex(n) for n in name],
1481             html, display_name, fatal=fatal, group='content', **kwargs)
1482
1483     def _dc_search_uploader(self, html):
1484         return self._html_search_meta('dc.creator', html, 'uploader')
1485
1486     @staticmethod
1487     def _rta_search(html):
1488         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1489         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1490                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1491                      html):
1492             return 18
1493
1494         # And then there are the jokers who advertise that they use RTA, but actually don't.
1495         AGE_LIMIT_MARKERS = [
1496             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1497             r'>[^<]*you acknowledge you are at least (\d+) years old',
1498             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1499         ]
1500
1501         age_limit = 0
1502         for marker in AGE_LIMIT_MARKERS:
1503             mobj = re.search(marker, html)
1504             if mobj:
1505                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1506         return age_limit
1507
1508     def _media_rating_search(self, html):
1509         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1510         rating = self._html_search_meta('rating', html)
1511
1512         if not rating:
1513             return None
1514
1515         RATING_TABLE = {
1516             'safe for kids': 0,
1517             'general': 8,
1518             '14 years': 14,
1519             'mature': 17,
1520             'restricted': 19,
1521         }
1522         return RATING_TABLE.get(rating.lower())
1523
1524     def _family_friendly_search(self, html):
1525         # See http://schema.org/VideoObject
1526         family_friendly = self._html_search_meta(
1527             'isFamilyFriendly', html, default=None)
1528
1529         if not family_friendly:
1530             return None
1531
1532         RATING_TABLE = {
1533             '1': 0,
1534             'true': 0,
1535             '0': 18,
1536             'false': 18,
1537         }
1538         return RATING_TABLE.get(family_friendly.lower())
1539
1540     def _twitter_search_player(self, html):
1541         return self._html_search_meta('twitter:player', html,
1542                                       'twitter card player')
1543
1544     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1545         """Yield all json ld objects in the html"""
1546         if default is not NO_DEFAULT:
1547             fatal = False
1548         for mobj in re.finditer(JSON_LD_RE, html):
1549             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1550             for json_ld in variadic(json_ld_item):
1551                 if isinstance(json_ld, dict):
1552                     yield json_ld
1553
1554     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1555         """Search for a video in any json ld in the html"""
1556         if default is not NO_DEFAULT:
1557             fatal = False
1558         info = self._json_ld(
1559             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1560             video_id, fatal=fatal, expected_type=expected_type)
1561         if info:
1562             return info
1563         if default is not NO_DEFAULT:
1564             return default
1565         elif fatal:
1566             raise RegexNotFoundError('Unable to extract JSON-LD')
1567         else:
1568             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1569             return {}
1570
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        json_ld may be a JSON string (parsed with _parse_json) or already
        parsed data: a single object or a collection of objects.
        Only top-level objects carrying '@context' are considered; an object
        consisting solely of '@context' and '@graph' has its graph traversed.
        If expected_type is given, objects of any other '@type' are skipped
        and traversal stops after the first matching object.

        Returns the collected fields passed through filter_dict, or an empty
        dict when json_ld is empty.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator filled in by the nested helpers below
        info = {}

        # schema.org interaction type name -> prefix of the '<prefix>_count' info field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            """Whether the '@type' of e contains any of expected_types"""
            # '@type' may hold a single value or several; normalize before testing
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            """Return the interaction type of e, given either directly or as {'@type': ...}"""
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            """Fill the '<kind>_count' fields of info from e's 'interactionStatistic'"""
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # The type may be given as a URL; only its last path component is looked up
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # The first value found for a given counter wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            """Set info['chapters'] from the 'Clip' parts of e, unless their data is broken"""
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Fill gaps from the neighbours: a missing end is the next chapter's start and
            # a missing start is the previous chapter's end (0 before the first chapter).
            # The zip stops at chapters[1:], so the last chapter is not visited here.
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter falls back to the video duration for its end
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            """Update info with the fields of a VideoObject/AudioObject e"""
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            # Both of these rely on the fields set above (chapters read info['duration'])
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            """Walk the JSON-LD objects and collect their data into info"""
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                if at_top_level and '@context' not in e:
                    continue
                # A pure '@graph' wrapper: descend into the graph; its members need no '@context'
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # An article may embed its video under 'video' or 'subjectOf'; use the first entry
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type keep merging further objects; otherwise stop here
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any other handled object may still carry a nested 'video' object
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1740
1741     def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
1742         if default == '{}':
1743             self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
1744             default = {}
1745         if default is not NO_DEFAULT:
1746             fatal = False
1747
1748         return self._search_json(
1749             r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
1750             video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
1751
1752     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1753         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1754         rectx = re.escape(context_name)
1755         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1756         js, arg_keys, arg_vals = self._search_regex(
1757             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1758             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1759             default=NO_DEFAULT if fatal else (None, None, None))
1760         if js is None:
1761             return {}
1762
1763         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1764             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1765
1766         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1767         return traverse_obj(ret, traverse) or {}
1768
1769     @staticmethod
1770     def _hidden_inputs(html):
1771         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1772         hidden_inputs = {}
1773         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1774             attrs = extract_attributes(input)
1775             if not input:
1776                 continue
1777             if attrs.get('type') not in ('hidden', 'submit'):
1778                 continue
1779             name = attrs.get('name') or attrs.get('id')
1780             value = attrs.get('value')
1781             if name and value is not None:
1782                 hidden_inputs[name] = value
1783         return hidden_inputs
1784
1785     def _form_hidden_inputs(self, form_id, html):
1786         form = self._search_regex(
1787             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1788             html, '%s form' % form_id, group='form')
1789         return self._hidden_inputs(form)
1790
    @classproperty(cache=True)
    def FormatSort(cls):
        """
        Deprecated accessor for a FormatSorter subclass.

        Emits a deprecation warning pointing to yt_dlp.utils.FormatSorter and
        returns the subclass defined below.
        NOTE(review): `cache=True` presumably makes this run (and warn) only
        once per class - confirm against classproperty.
        """
        class FormatSort(FormatSorter):
            # The first parameter (conventionally `self`) is named `ie` here; its
            # `_downloader` attribute is prepended to the arguments given to FormatSorter
            def __init__(ie, *args, **kwargs):
                super().__init__(ie._downloader, *args, **kwargs)

        deprecation_warning(
            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
            'Use yt_dlp.utils.FormatSorter instead')
        return FormatSort
1801
1802     def _sort_formats(self, formats, field_preference=[]):
1803         if not field_preference:
1804             self._downloader.deprecation_warning(
1805                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1806             return
1807         self._downloader.deprecation_warning(
1808             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1809             'Return _format_sort_fields in the info_dict instead')
1810         if formats:
1811             formats[0]['__sort_fields'] = field_preference
1812
1813     def _check_formats(self, formats, video_id):
1814         if formats:
1815             formats[:] = filter(
1816                 lambda f: self._is_valid_url(
1817                     f['url'], video_id,
1818                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1819                 formats)
1820
1821     @staticmethod
1822     def _remove_duplicate_formats(formats):
1823         format_urls = set()
1824         unique_formats = []
1825         for f in formats:
1826             if f['url'] not in format_urls:
1827                 format_urls.add(f['url'])
1828                 unique_formats.append(f)
1829         formats[:] = unique_formats
1830
1831     def _is_valid_url(self, url, video_id, item='video', headers={}):
1832         url = self._proto_relative_url(url, scheme='http:')
1833         # For now assume non HTTP(S) URLs always valid
1834         if not (url.startswith('http://') or url.startswith('https://')):
1835             return True
1836         try:
1837             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1838             return True
1839         except ExtractorError as e:
1840             self.to_screen(
1841                 '%s: %s URL is invalid, skipping: %s'
1842                 % (video_id, item, error_to_compat_str(e.cause)))
1843             return False
1844
1845     def http_scheme(self):
1846         """ Either "http:" or "https:", depending on the user's preferences """
1847         return (
1848             'http:'
1849             if self.get_param('prefer_insecure', False)
1850             else 'https:')
1851
1852     def _proto_relative_url(self, url, scheme=None):
1853         scheme = scheme or self.http_scheme()
1854         assert scheme.endswith(':')
1855         return sanitize_url(url, scheme=scheme[:-1])
1856
1857     def _sleep(self, timeout, video_id, msg_template=None):
1858         if msg_template is None:
1859             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1860         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1861         self.to_screen(msg)
1862         time.sleep(timeout)
1863
1864     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1865                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1866                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1867         if self.get_param('ignore_no_formats_error'):
1868             fatal = False
1869
1870         res = self._download_xml_handle(
1871             manifest_url, video_id, 'Downloading f4m manifest',
1872             'Unable to download f4m manifest',
1873             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1874             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1875             transform_source=transform_source,
1876             fatal=fatal, data=data, headers=headers, query=query)
1877         if res is False:
1878             return []
1879
1880         manifest, urlh = res
1881         manifest_url = urlh.url
1882
1883         return self._parse_f4m_formats(
1884             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1885             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1886
1887     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1888                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1889                            fatal=True, m3u8_id=None):
1890         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1891             return []
1892
1893         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1894         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1895         if akamai_pv is not None and ';' in akamai_pv.text:
1896             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1897             if playerVerificationChallenge.strip() != '':
1898                 return []
1899
1900         formats = []
1901         manifest_version = '1.0'
1902         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1903         if not media_nodes:
1904             manifest_version = '2.0'
1905             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1906         # Remove unsupported DRM protected media from final formats
1907         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1908         media_nodes = remove_encrypted_media(media_nodes)
1909         if not media_nodes:
1910             return formats
1911
1912         manifest_base_url = get_base_url(manifest)
1913
1914         bootstrap_info = xpath_element(
1915             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1916             'bootstrap info', default=None)
1917
1918         vcodec = None
1919         mime_type = xpath_text(
1920             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1921             'base URL', default=None)
1922         if mime_type and mime_type.startswith('audio/'):
1923             vcodec = 'none'
1924
1925         for i, media_el in enumerate(media_nodes):
1926             tbr = int_or_none(media_el.attrib.get('bitrate'))
1927             width = int_or_none(media_el.attrib.get('width'))
1928             height = int_or_none(media_el.attrib.get('height'))
1929             format_id = join_nonempty(f4m_id, tbr or i)
1930             # If <bootstrapInfo> is present, the specified f4m is a
1931             # stream-level manifest, and only set-level manifests may refer to
1932             # external resources.  See section 11.4 and section 4 of F4M spec
1933             if bootstrap_info is None:
1934                 media_url = None
1935                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1936                 if manifest_version == '2.0':
1937                     media_url = media_el.attrib.get('href')
1938                 if media_url is None:
1939                     media_url = media_el.attrib.get('url')
1940                 if not media_url:
1941                     continue
1942                 manifest_url = (
1943                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1944                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1945                 # If media_url is itself a f4m manifest do the recursive extraction
1946                 # since bitrates in parent manifest (this one) and media_url manifest
1947                 # may differ leading to inability to resolve the format by requested
1948                 # bitrate in f4m downloader
1949                 ext = determine_ext(manifest_url)
1950                 if ext == 'f4m':
1951                     f4m_formats = self._extract_f4m_formats(
1952                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1953                         transform_source=transform_source, fatal=fatal)
1954                     # Sometimes stream-level manifest contains single media entry that
1955                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1956                     # At the same time parent's media entry in set-level manifest may
1957                     # contain it. We will copy it from parent in such cases.
1958                     if len(f4m_formats) == 1:
1959                         f = f4m_formats[0]
1960                         f.update({
1961                             'tbr': f.get('tbr') or tbr,
1962                             'width': f.get('width') or width,
1963                             'height': f.get('height') or height,
1964                             'format_id': f.get('format_id') if not tbr else format_id,
1965                             'vcodec': vcodec,
1966                         })
1967                     formats.extend(f4m_formats)
1968                     continue
1969                 elif ext == 'm3u8':
1970                     formats.extend(self._extract_m3u8_formats(
1971                         manifest_url, video_id, 'mp4', preference=preference,
1972                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1973                     continue
1974             formats.append({
1975                 'format_id': format_id,
1976                 'url': manifest_url,
1977                 'manifest_url': manifest_url,
1978                 'ext': 'flv' if bootstrap_info is not None else None,
1979                 'protocol': 'f4m',
1980                 'tbr': tbr,
1981                 'width': width,
1982                 'height': height,
1983                 'vcodec': vcodec,
1984                 'preference': preference,
1985                 'quality': quality,
1986             })
1987         return formats
1988
1989     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1990         return {
1991             'format_id': join_nonempty(m3u8_id, 'meta'),
1992             'url': m3u8_url,
1993             'ext': ext,
1994             'protocol': 'm3u8',
1995             'preference': preference - 100 if preference else -100,
1996             'quality': quality,
1997             'resolution': 'multiple',
1998             'format_note': 'Quality selection URL',
1999         }
2000
2001     def _report_ignoring_subs(self, name):
2002         self.report_warning(bug_reports_message(
2003             f'Ignoring subtitle tracks found in the {name} manifest; '
2004             'if any subtitle tracks are missing,'
2005         ), only_once=True)
2006
2007     def _extract_m3u8_formats(self, *args, **kwargs):
2008         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2009         if subs:
2010             self._report_ignoring_subs('HLS')
2011         return fmts
2012
2013     def _extract_m3u8_formats_and_subtitles(
2014             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2015             preference=None, quality=None, m3u8_id=None, note=None,
2016             errnote=None, fatal=True, live=False, data=None, headers={},
2017             query={}):
2018
2019         if self.get_param('ignore_no_formats_error'):
2020             fatal = False
2021
2022         if not m3u8_url:
2023             if errnote is not False:
2024                 errnote = errnote or 'Failed to obtain m3u8 URL'
2025                 if fatal:
2026                     raise ExtractorError(errnote, video_id=video_id)
2027                 self.report_warning(f'{errnote}{bug_reports_message()}')
2028             return [], {}
2029
2030         res = self._download_webpage_handle(
2031             m3u8_url, video_id,
2032             note='Downloading m3u8 information' if note is None else note,
2033             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2034             fatal=fatal, data=data, headers=headers, query=query)
2035
2036         if res is False:
2037             return [], {}
2038
2039         m3u8_doc, urlh = res
2040         m3u8_url = urlh.url
2041
2042         return self._parse_m3u8_formats_and_subtitles(
2043             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2044             preference=preference, quality=quality, m3u8_id=m3u8_id,
2045             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2046             headers=headers, query=query, video_id=video_id)
2047
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse the text of an HLS playlist into formats and subtitles.

        m3u8_doc is the playlist text. m3u8_url, when given, is used to
        resolve relative URIs and is recorded as 'manifest_url' on formats
        coming from a master playlist.

        A document containing #EXT-X-TARGETDURATION is treated as a media
        playlist and returned as is: the format(s) point at m3u8_url, or at a
        data: URI built from m3u8_doc when no URL is available. Any other
        document is treated as a master playlist: #EXT-X-MEDIA tags yield
        subtitles and audio/video rendition formats, and every URI line
        yields a format described by the preceding #EXT-X-STREAM-INF tag.

        When the 'hls_split_discontinuity' parameter is set, one format per
        discontinuity-delimited section is produced; this may download the
        referenced playlists (using video_id, fatal, data and headers).

        ext, entry_protocol, preference and quality are copied into the
        formats; m3u8_id prefixes the format ids; live disables naming
        variant formats after the stream name or bitrate. The note, errnote
        and query arguments are accepted but not referenced in this body.

        Returns a (formats, subtitles) tuple, where subtitles maps a language
        code ('und' when unspecified) to a list of subtitle dicts.
        """
        formats, subtitles = [], {}
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are kept; anything else is resolved against m3u8_url
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # Without a document in hand, fetch the playlist to count its sections
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                # One index per section: number of #EXT-X-DISCONTINUITY* lines plus one
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # A single pass with no index: formats are not split
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # Rendition groups keyed by GROUP-ID, filled from the #EXT-X-MEDIA tags
        groups = {}
        # Attributes of the most recent #EXT-X-STREAM-INF tag not yet consumed by a URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            # Only VIDEO and AUDIO renditions that carry their own URI become formats
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A URI line: build format(s) from the stream info collected just before it
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    # (audio and optional video bitrates are read from the variant URL itself)
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    # (a PROGRESSIVE-URI attribute yields an extra plain-HTTP copy of the format)
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                # The stream info applies to a single URI line only
                last_stream_inf = {}
        return formats, subtitles
2266
2267     def _extract_m3u8_vod_duration(
2268             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2269
2270         m3u8_vod = self._download_webpage(
2271             m3u8_vod_url, video_id,
2272             note='Downloading m3u8 VOD manifest' if note is None else note,
2273             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2274             fatal=False, data=data, headers=headers, query=query)
2275
2276         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2277
2278     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2279         if '#EXT-X-ENDLIST' not in m3u8_vod:
2280             return None
2281
2282         return int(sum(
2283             float(line[len('#EXTINF:'):].split(',')[0])
2284             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2285
2286     def _extract_mpd_vod_duration(
2287             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2288
2289         mpd_doc = self._download_xml(
2290             mpd_url, video_id,
2291             note='Downloading MPD VOD manifest' if note is None else note,
2292             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2293             fatal=False, data=data, headers=headers, query=query)
2294         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2295             return None
2296         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2297
2298     @staticmethod
2299     def _xpath_ns(path, namespace=None):
2300         if not namespace:
2301             return path
2302         out = []
2303         for c in path.split('/'):
2304             if not c or c == '.':
2305                 out.append(c)
2306             else:
2307                 out.append('{%s}%s' % (namespace, c))
2308         return '/'.join(out)
2309
2310     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2311         if self.get_param('ignore_no_formats_error'):
2312             fatal = False
2313
2314         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2315         if res is False:
2316             assert not fatal
2317             return [], {}
2318         smil, urlh = res
2319
2320         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2321                                                       namespace=self._parse_smil_namespace(smil))
2322
2323     def _extract_smil_formats(self, *args, **kwargs):
2324         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2325         if subs:
2326             self._report_ignoring_subs('SMIL')
2327         return fmts
2328
2329     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2330         res = self._download_smil(smil_url, video_id, fatal=fatal)
2331         if res is False:
2332             return {}
2333
2334         smil, urlh = res
2335         smil_url = urlh.url
2336
2337         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2338
2339     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2340         return self._download_xml_handle(
2341             smil_url, video_id, 'Downloading SMIL file',
2342             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2343
2344     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2345         namespace = self._parse_smil_namespace(smil)
2346
2347         formats, subtitles = self._parse_smil_formats_and_subtitles(
2348             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2349
2350         video_id = os.path.splitext(url_basename(smil_url))[0]
2351         title = None
2352         description = None
2353         upload_date = None
2354         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2355             name = meta.attrib.get('name')
2356             content = meta.attrib.get('content')
2357             if not name or not content:
2358                 continue
2359             if not title and name == 'title':
2360                 title = content
2361             elif not description and name in ('description', 'abstract'):
2362                 description = content
2363             elif not upload_date and name == 'date':
2364                 upload_date = unified_strdate(content)
2365
2366         thumbnails = [{
2367             'id': image.get('type'),
2368             'url': image.get('src'),
2369             'width': int_or_none(image.get('width')),
2370             'height': int_or_none(image.get('height')),
2371         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2372
2373         return {
2374             'id': video_id,
2375             'title': title or video_id,
2376             'description': description,
2377             'upload_date': upload_date,
2378             'thumbnails': thumbnails,
2379             'formats': formats,
2380             'subtitles': subtitles,
2381         }
2382
2383     def _parse_smil_namespace(self, smil):
2384         return self._search_regex(
2385             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2386
2387     def _parse_smil_formats(self, *args, **kwargs):
2388         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2389         if subs:
2390             self._report_ignoring_subs('SMIL')
2391         return fmts
2392
2393     def _parse_smil_formats_and_subtitles(
2394             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2395         base = smil_url
2396         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2397             b = meta.get('base') or meta.get('httpBase')
2398             if b:
2399                 base = b
2400                 break
2401
2402         formats, subtitles = [], {}
2403         rtmp_count = 0
2404         http_count = 0
2405         m3u8_count = 0
2406         imgs_count = 0
2407
2408         srcs = set()
2409         media = itertools.chain.from_iterable(
2410             smil.findall(self._xpath_ns(arg, namespace))
2411             for arg in ['.//video', './/audio', './/media'])
2412         for medium in media:
2413             src = medium.get('src')
2414             if not src or src in srcs:
2415                 continue
2416             srcs.add(src)
2417
2418             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2419             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2420             width = int_or_none(medium.get('width'))
2421             height = int_or_none(medium.get('height'))
2422             proto = medium.get('proto')
2423             ext = medium.get('ext')
2424             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2425                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2426             streamer = medium.get('streamer') or base
2427
2428             if proto == 'rtmp' or streamer.startswith('rtmp'):
2429                 rtmp_count += 1
2430                 formats.append({
2431                     'url': streamer,
2432                     'play_path': src,
2433                     'ext': 'flv',
2434                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2435                     'tbr': bitrate,
2436                     'filesize': filesize,
2437                     'width': width,
2438                     'height': height,
2439                 })
2440                 if transform_rtmp_url:
2441                     streamer, src = transform_rtmp_url(streamer, src)
2442                     formats[-1].update({
2443                         'url': streamer,
2444                         'play_path': src,
2445                     })
2446                 continue
2447
2448             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2449             src_url = src_url.strip()
2450
2451             if proto == 'm3u8' or src_ext == 'm3u8':
2452                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2453                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2454                 self._merge_subtitles(m3u8_subs, target=subtitles)
2455                 if len(m3u8_formats) == 1:
2456                     m3u8_count += 1
2457                     m3u8_formats[0].update({
2458                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2459                         'tbr': bitrate,
2460                         'width': width,
2461                         'height': height,
2462                     })
2463                 formats.extend(m3u8_formats)
2464             elif src_ext == 'f4m':
2465                 f4m_url = src_url
2466                 if not f4m_params:
2467                     f4m_params = {
2468                         'hdcore': '3.2.0',
2469                         'plugin': 'flowplayer-3.2.0.1',
2470                     }
2471                 f4m_url += '&' if '?' in f4m_url else '?'
2472                 f4m_url += urllib.parse.urlencode(f4m_params)
2473                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2474             elif src_ext == 'mpd':
2475                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2476                     src_url, video_id, mpd_id='dash', fatal=False)
2477                 formats.extend(mpd_formats)
2478                 self._merge_subtitles(mpd_subs, target=subtitles)
2479             elif re.search(r'\.ism/[Mm]anifest', src_url):
2480                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2481                     src_url, video_id, ism_id='mss', fatal=False)
2482                 formats.extend(ism_formats)
2483                 self._merge_subtitles(ism_subs, target=subtitles)
2484             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2485                 http_count += 1
2486                 formats.append({
2487                     'url': src_url,
2488                     'ext': ext or src_ext or 'flv',
2489                     'format_id': 'http-%d' % (bitrate or http_count),
2490                     'tbr': bitrate,
2491                     'filesize': filesize,
2492                     'width': width,
2493                     'height': height,
2494                 })
2495
2496         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2497             src = medium.get('src')
2498             if not src or src in srcs:
2499                 continue
2500             srcs.add(src)
2501
2502             imgs_count += 1
2503             formats.append({
2504                 'format_id': 'imagestream-%d' % (imgs_count),
2505                 'url': src,
2506                 'ext': mimetype2ext(medium.get('type')),
2507                 'acodec': 'none',
2508                 'vcodec': 'none',
2509                 'width': int_or_none(medium.get('width')),
2510                 'height': int_or_none(medium.get('height')),
2511                 'format_note': 'SMIL storyboards',
2512             })
2513
2514         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2515         self._merge_subtitles(smil_subs, target=subtitles)
2516
2517         return formats, subtitles
2518
2519     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2520         urls = []
2521         subtitles = {}
2522         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2523             src = textstream.get('src')
2524             if not src or src in urls:
2525                 continue
2526             urls.append(src)
2527             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2528             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2529             subtitles.setdefault(lang, []).append({
2530                 'url': src,
2531                 'ext': ext,
2532             })
2533         return subtitles
2534
2535     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2536         res = self._download_xml_handle(
2537             xspf_url, playlist_id, 'Downloading xpsf playlist',
2538             'Unable to download xspf manifest', fatal=fatal)
2539         if res is False:
2540             return []
2541
2542         xspf, urlh = res
2543         xspf_url = urlh.url
2544
2545         return self._parse_xspf(
2546             xspf, playlist_id, xspf_url=xspf_url,
2547             xspf_base_url=base_url(xspf_url))
2548
2549     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2550         NS_MAP = {
2551             'xspf': 'http://xspf.org/ns/0/',
2552             's1': 'http://static.streamone.nl/player/ns/0',
2553         }
2554
2555         entries = []
2556         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2557             title = xpath_text(
2558                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2559             description = xpath_text(
2560                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2561             thumbnail = xpath_text(
2562                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2563             duration = float_or_none(
2564                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2565
2566             formats = []
2567             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2568                 format_url = urljoin(xspf_base_url, location.text)
2569                 if not format_url:
2570                     continue
2571                 formats.append({
2572                     'url': format_url,
2573                     'manifest_url': xspf_url,
2574                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2575                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2576                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2577                 })
2578
2579             entries.append({
2580                 'id': playlist_id,
2581                 'title': title,
2582                 'description': description,
2583                 'thumbnail': thumbnail,
2584                 'duration': duration,
2585                 'formats': formats,
2586             })
2587         return entries
2588
2589     def _extract_mpd_formats(self, *args, **kwargs):
2590         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2591         if subs:
2592             self._report_ignoring_subs('DASH')
2593         return fmts
2594
2595     def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2596         periods = self._extract_mpd_periods(*args, **kwargs)
2597         return self._merge_mpd_periods(periods)
2598
2599     def _extract_mpd_periods(
2600             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2601             fatal=True, data=None, headers={}, query={}):
2602
2603         if self.get_param('ignore_no_formats_error'):
2604             fatal = False
2605
2606         res = self._download_xml_handle(
2607             mpd_url, video_id,
2608             note='Downloading MPD manifest' if note is None else note,
2609             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2610             fatal=fatal, data=data, headers=headers, query=query)
2611         if res is False:
2612             return []
2613         mpd_doc, urlh = res
2614         if mpd_doc is None:
2615             return []
2616
2617         # We could have been redirected to a new url when we retrieved our mpd file.
2618         mpd_url = urlh.url
2619         mpd_base_url = base_url(mpd_url)
2620
2621         return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2622
2623     def _parse_mpd_formats(self, *args, **kwargs):
2624         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2625         if subs:
2626             self._report_ignoring_subs('DASH')
2627         return fmts
2628
2629     def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2630         periods = self._parse_mpd_periods(*args, **kwargs)
2631         return self._merge_mpd_periods(periods)
2632
2633     def _merge_mpd_periods(self, periods):
2634         """
2635         Combine all formats and subtitles from an MPD manifest into a single list,
2636         by concatenate streams with similar formats.
2637         """
2638         formats, subtitles = {}, {}
2639         for period in periods:
2640             for f in period['formats']:
2641                 assert 'is_dash_periods' not in f, 'format already processed'
2642                 f['is_dash_periods'] = True
2643                 format_key = tuple(v for k, v in f.items() if k not in (
2644                     ('format_id', 'fragments', 'manifest_stream_number')))
2645                 if format_key not in formats:
2646                     formats[format_key] = f
2647                 elif 'fragments' in f:
2648                     formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2649
2650             if subtitles and period['subtitles']:
2651                 self.report_warning(bug_reports_message(
2652                     'Found subtitles in multiple periods in the DASH manifest; '
2653                     'if part of the subtitles are missing,'
2654                 ), only_once=True)
2655
2656             for sub_lang, sub_info in period['subtitles'].items():
2657                 subtitles.setdefault(sub_lang, []).extend(sub_info)
2658
2659         return list(formats.values()), subtitles
2660
2661     def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2662         """
2663         Parse formats from MPD manifest.
2664         References:
2665          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2666             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2667          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2668         """
2669         if not self.get_param('dynamic_mpd', True):
2670             if mpd_doc.get('type') == 'dynamic':
2671                 return [], {}
2672
2673         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2674
2675         def _add_ns(path):
2676             return self._xpath_ns(path, namespace)
2677
2678         def is_drm_protected(element):
2679             return element.find(_add_ns('ContentProtection')) is not None
2680
2681         def extract_multisegment_info(element, ms_parent_info):
2682             ms_info = ms_parent_info.copy()
2683
2684             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2685             # common attributes and elements.  We will only extract relevant
2686             # for us.
2687             def extract_common(source):
2688                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2689                 if segment_timeline is not None:
2690                     s_e = segment_timeline.findall(_add_ns('S'))
2691                     if s_e:
2692                         ms_info['total_number'] = 0
2693                         ms_info['s'] = []
2694                         for s in s_e:
2695                             r = int(s.get('r', 0))
2696                             ms_info['total_number'] += 1 + r
2697                             ms_info['s'].append({
2698                                 't': int(s.get('t', 0)),
2699                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2700                                 'd': int(s.attrib['d']),
2701                                 'r': r,
2702                             })
2703                 start_number = source.get('startNumber')
2704                 if start_number:
2705                     ms_info['start_number'] = int(start_number)
2706                 timescale = source.get('timescale')
2707                 if timescale:
2708                     ms_info['timescale'] = int(timescale)
2709                 segment_duration = source.get('duration')
2710                 if segment_duration:
2711                     ms_info['segment_duration'] = float(segment_duration)
2712
2713             def extract_Initialization(source):
2714                 initialization = source.find(_add_ns('Initialization'))
2715                 if initialization is not None:
2716                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2717
2718             segment_list = element.find(_add_ns('SegmentList'))
2719             if segment_list is not None:
2720                 extract_common(segment_list)
2721                 extract_Initialization(segment_list)
2722                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2723                 if segment_urls_e:
2724                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2725             else:
2726                 segment_template = element.find(_add_ns('SegmentTemplate'))
2727                 if segment_template is not None:
2728                     extract_common(segment_template)
2729                     media = segment_template.get('media')
2730                     if media:
2731                         ms_info['media'] = media
2732                     initialization = segment_template.get('initialization')
2733                     if initialization:
2734                         ms_info['initialization'] = initialization
2735                     else:
2736                         extract_Initialization(segment_template)
2737             return ms_info
2738
2739         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2740         stream_numbers = collections.defaultdict(int)
2741         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2742             period_entry = {
2743                 'id': period.get('id', f'period-{period_idx}'),
2744                 'formats': [],
2745                 'subtitles': collections.defaultdict(list),
2746             }
2747             period_duration = parse_duration(period.get('duration')) or mpd_duration
2748             period_ms_info = extract_multisegment_info(period, {
2749                 'start_number': 1,
2750                 'timescale': 1,
2751             })
2752             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2753                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2754                 for representation in adaptation_set.findall(_add_ns('Representation')):
2755                     representation_attrib = adaptation_set.attrib.copy()
2756                     representation_attrib.update(representation.attrib)
2757                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2758                     mime_type = representation_attrib['mimeType']
2759                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2760
2761                     codec_str = representation_attrib.get('codecs', '')
2762                     # Some kind of binary subtitle found in some youtube livestreams
2763                     if mime_type == 'application/x-rawcc':
2764                         codecs = {'scodec': codec_str}
2765                     else:
2766                         codecs = parse_codecs(codec_str)
2767                     if content_type not in ('video', 'audio', 'text'):
2768                         if mime_type == 'image/jpeg':
2769                             content_type = mime_type
2770                         elif codecs.get('vcodec', 'none') != 'none':
2771                             content_type = 'video'
2772                         elif codecs.get('acodec', 'none') != 'none':
2773                             content_type = 'audio'
2774                         elif codecs.get('scodec', 'none') != 'none':
2775                             content_type = 'text'
2776                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2777                             content_type = 'text'
2778                         else:
2779                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2780                             continue
2781
2782                     base_url = ''
2783                     for element in (representation, adaptation_set, period, mpd_doc):
2784                         base_url_e = element.find(_add_ns('BaseURL'))
2785                         if try_call(lambda: base_url_e.text) is not None:
2786                             base_url = base_url_e.text + base_url
2787                             if re.match(r'^https?://', base_url):
2788                                 break
2789                     if mpd_base_url and base_url.startswith('/'):
2790                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2791                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2792                         if not mpd_base_url.endswith('/'):
2793                             mpd_base_url += '/'
2794                         base_url = mpd_base_url + base_url
2795                     representation_id = representation_attrib.get('id')
2796                     lang = representation_attrib.get('lang')
2797                     url_el = representation.find(_add_ns('BaseURL'))
2798                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2799                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2800                     if representation_id is not None:
2801                         format_id = representation_id
2802                     else:
2803                         format_id = content_type
2804                     if mpd_id:
2805                         format_id = mpd_id + '-' + format_id
2806                     if content_type in ('video', 'audio'):
2807                         f = {
2808                             'format_id': format_id,
2809                             'manifest_url': mpd_url,
2810                             'ext': mimetype2ext(mime_type),
2811                             'width': int_or_none(representation_attrib.get('width')),
2812                             'height': int_or_none(representation_attrib.get('height')),
2813                             'tbr': float_or_none(bandwidth, 1000),
2814                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2815                             'fps': int_or_none(representation_attrib.get('frameRate')),
2816                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2817                             'format_note': 'DASH %s' % content_type,
2818                             'filesize': filesize,
2819                             'container': mimetype2ext(mime_type) + '_dash',
2820                             **codecs
2821                         }
2822                     elif content_type == 'text':
2823                         f = {
2824                             'ext': mimetype2ext(mime_type),
2825                             'manifest_url': mpd_url,
2826                             'filesize': filesize,
2827                         }
2828                     elif content_type == 'image/jpeg':
2829                         # See test case in VikiIE
2830                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2831                         f = {
2832                             'format_id': format_id,
2833                             'ext': 'mhtml',
2834                             'manifest_url': mpd_url,
2835                             'format_note': 'DASH storyboards (jpeg)',
2836                             'acodec': 'none',
2837                             'vcodec': 'none',
2838                         }
2839                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2840                         f['has_drm'] = True
2841                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2842
2843                     def prepare_template(template_name, identifiers):
2844                         tmpl = representation_ms_info[template_name]
2845                         if representation_id is not None:
2846                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2847                         # First of, % characters outside $...$ templates
2848                         # must be escaped by doubling for proper processing
2849                         # by % operator string formatting used further (see
2850                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2851                         t = ''
2852                         in_template = False
2853                         for c in tmpl:
2854                             t += c
2855                             if c == '$':
2856                                 in_template = not in_template
2857                             elif c == '%' and not in_template:
2858                                 t += c
2859                         # Next, $...$ templates are translated to their
2860                         # %(...) counterparts to be used with % operator
2861                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2862                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2863                         t.replace('$$', '$')
2864                         return t
2865
2866                     # @initialization is a regular template like @media one
2867                     # so it should be handled just the same way (see
2868                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2869                     if 'initialization' in representation_ms_info:
2870                         initialization_template = prepare_template(
2871                             'initialization',
2872                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2873                             # $Time$ shall not be included for @initialization thus
2874                             # only $Bandwidth$ remains
2875                             ('Bandwidth', ))
2876                         representation_ms_info['initialization_url'] = initialization_template % {
2877                             'Bandwidth': bandwidth,
2878                         }
2879
2880                     def location_key(location):
2881                         return 'url' if re.match(r'^https?://', location) else 'path'
2882
2883                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2884
2885                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2886                         media_location_key = location_key(media_template)
2887
2888                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2889                         # can't be used at the same time
2890                         if '%(Number' in media_template and 's' not in representation_ms_info:
2891                             segment_duration = None
2892                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2893                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2894                                 representation_ms_info['total_number'] = int(math.ceil(
2895                                     float_or_none(period_duration, segment_duration, default=0)))
2896                             representation_ms_info['fragments'] = [{
2897                                 media_location_key: media_template % {
2898                                     'Number': segment_number,
2899                                     'Bandwidth': bandwidth,
2900                                 },
2901                                 'duration': segment_duration,
2902                             } for segment_number in range(
2903                                 representation_ms_info['start_number'],
2904                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2905                         else:
2906                             # $Number*$ or $Time$ in media template with S list available
2907                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2908                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2909                             representation_ms_info['fragments'] = []
2910                             segment_time = 0
2911                             segment_d = None
2912                             segment_number = representation_ms_info['start_number']
2913
2914                             def add_segment_url():
2915                                 segment_url = media_template % {
2916                                     'Time': segment_time,
2917                                     'Bandwidth': bandwidth,
2918                                     'Number': segment_number,
2919                                 }
2920                                 representation_ms_info['fragments'].append({
2921                                     media_location_key: segment_url,
2922                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2923                                 })
2924
2925                             for num, s in enumerate(representation_ms_info['s']):
2926                                 segment_time = s.get('t') or segment_time
2927                                 segment_d = s['d']
2928                                 add_segment_url()
2929                                 segment_number += 1
2930                                 for r in range(s.get('r', 0)):
2931                                     segment_time += segment_d
2932                                     add_segment_url()
2933                                     segment_number += 1
2934                                 segment_time += segment_d
2935                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2936                         # No media template,
2937                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2938                         # or any YouTube dashsegments video
2939                         fragments = []
2940                         segment_index = 0
2941                         timescale = representation_ms_info['timescale']
2942                         for s in representation_ms_info['s']:
2943                             duration = float_or_none(s['d'], timescale)
2944                             for r in range(s.get('r', 0) + 1):
2945                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2946                                 fragments.append({
2947                                     location_key(segment_uri): segment_uri,
2948                                     'duration': duration,
2949                                 })
2950                                 segment_index += 1
2951                         representation_ms_info['fragments'] = fragments
2952                     elif 'segment_urls' in representation_ms_info:
2953                         # Segment URLs with no SegmentTimeline
2954                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2955                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2956                         fragments = []
2957                         segment_duration = float_or_none(
2958                             representation_ms_info['segment_duration'],
2959                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2960                         for segment_url in representation_ms_info['segment_urls']:
2961                             fragment = {
2962                                 location_key(segment_url): segment_url,
2963                             }
2964                             if segment_duration:
2965                                 fragment['duration'] = segment_duration
2966                             fragments.append(fragment)
2967                         representation_ms_info['fragments'] = fragments
2968                     # If there is a fragments key available then we correctly recognized fragmented media.
2969                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2970                     # assumption is not necessarily correct since we may simply have no support for
2971                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2972                     if 'fragments' in representation_ms_info:
2973                         f.update({
2974                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2975                             'url': mpd_url or base_url,
2976                             'fragment_base_url': base_url,
2977                             'fragments': [],
2978                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2979                         })
2980                         if 'initialization_url' in representation_ms_info:
2981                             initialization_url = representation_ms_info['initialization_url']
2982                             if not f.get('url'):
2983                                 f['url'] = initialization_url
2984                             f['fragments'].append({location_key(initialization_url): initialization_url})
2985                         f['fragments'].extend(representation_ms_info['fragments'])
2986                         if not period_duration:
2987                             period_duration = try_get(
2988                                 representation_ms_info,
2989                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2990                     else:
2991                         # Assuming direct URL to unfragmented media.
2992                         f['url'] = base_url
2993                     if content_type in ('video', 'audio', 'image/jpeg'):
2994                         f['manifest_stream_number'] = stream_numbers[f['url']]
2995                         stream_numbers[f['url']] += 1
2996                         period_entry['formats'].append(f)
2997                     elif content_type == 'text':
2998                         period_entry['subtitles'][lang or 'und'].append(f)
2999             yield period_entry
3000
3001     def _extract_ism_formats(self, *args, **kwargs):
3002         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3003         if subs:
3004             self._report_ignoring_subs('ISM')
3005         return fmts
3006
3007     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3008         if self.get_param('ignore_no_formats_error'):
3009             fatal = False
3010
3011         res = self._download_xml_handle(
3012             ism_url, video_id,
3013             note='Downloading ISM manifest' if note is None else note,
3014             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3015             fatal=fatal, data=data, headers=headers, query=query)
3016         if res is False:
3017             return [], {}
3018         ism_doc, urlh = res
3019         if ism_doc is None:
3020             return [], {}
3021
3022         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
3023
3024     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3025         """
3026         Parse formats from ISM manifest.
3027         References:
3028          1. [MS-SSTR]: Smooth Streaming Protocol,
3029             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3030         """
3031         if ism_doc.get('IsLive') == 'TRUE':
3032             return [], {}
3033
3034         duration = int(ism_doc.attrib['Duration'])
3035         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3036
3037         formats = []
3038         subtitles = {}
3039         for stream in ism_doc.findall('StreamIndex'):
3040             stream_type = stream.get('Type')
3041             if stream_type not in ('video', 'audio', 'text'):
3042                 continue
3043             url_pattern = stream.attrib['Url']
3044             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3045             stream_name = stream.get('Name')
3046             stream_language = stream.get('Language', 'und')
3047             for track in stream.findall('QualityLevel'):
3048                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3049                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3050                 # TODO: add support for WVC1 and WMAP
3051                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3052                     self.report_warning('%s is not a supported codec' % fourcc)
3053                     continue
3054                 tbr = int(track.attrib['Bitrate']) // 1000
3055                 # [1] does not mention Width and Height attributes. However,
3056                 # they're often present while MaxWidth and MaxHeight are
3057                 # missing, so should be used as fallbacks
3058                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3059                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3060                 sampling_rate = int_or_none(track.get('SamplingRate'))
3061
3062                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3063                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3064
3065                 fragments = []
3066                 fragment_ctx = {
3067                     'time': 0,
3068                 }
3069                 stream_fragments = stream.findall('c')
3070                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3071                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3072                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3073                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3074                     if not fragment_ctx['duration']:
3075                         try:
3076                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3077                         except IndexError:
3078                             next_fragment_time = duration
3079                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3080                     for _ in range(fragment_repeat):
3081                         fragments.append({
3082                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3083                             'duration': fragment_ctx['duration'] / stream_timescale,
3084                         })
3085                         fragment_ctx['time'] += fragment_ctx['duration']
3086
3087                 if stream_type == 'text':
3088                     subtitles.setdefault(stream_language, []).append({
3089                         'ext': 'ismt',
3090                         'protocol': 'ism',
3091                         'url': ism_url,
3092                         'manifest_url': ism_url,
3093                         'fragments': fragments,
3094                         '_download_params': {
3095                             'stream_type': stream_type,
3096                             'duration': duration,
3097                             'timescale': stream_timescale,
3098                             'fourcc': fourcc,
3099                             'language': stream_language,
3100                             'codec_private_data': track.get('CodecPrivateData'),
3101                         }
3102                     })
3103                 elif stream_type in ('video', 'audio'):
3104                     formats.append({
3105                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3106                         'url': ism_url,
3107                         'manifest_url': ism_url,
3108                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3109                         'width': width,
3110                         'height': height,
3111                         'tbr': tbr,
3112                         'asr': sampling_rate,
3113                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3114                         'acodec': 'none' if stream_type == 'video' else fourcc,
3115                         'protocol': 'ism',
3116                         'fragments': fragments,
3117                         'has_drm': ism_doc.find('Protection') is not None,
3118                         'language': stream_language,
3119                         'audio_channels': int_or_none(track.get('Channels')),
3120                         '_download_params': {
3121                             'stream_type': stream_type,
3122                             'duration': duration,
3123                             'timescale': stream_timescale,
3124                             'width': width or 0,
3125                             'height': height or 0,
3126                             'fourcc': fourcc,
3127                             'language': stream_language,
3128                             'codec_private_data': track.get('CodecPrivateData'),
3129                             'sampling_rate': sampling_rate,
3130                             'channels': int_or_none(track.get('Channels', 2)),
3131                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3132                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3133                         },
3134                     })
3135         return formats, subtitles
3136
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Find HTML5-style media tags in a webpage and build an entry for each.

        Matches <video> and <audio> tags, including their amp- and dl8-
        prefixed variants, and reads sources from the tag itself as well as
        from nested <source> and <track> tags.

        @param base_url   URL used to resolve relative URLs; also set as the
                          Referer header of every format
        @param webpage    HTML to search
        @param video_id   video ID passed on to the manifest extractors
        @param m3u8_id, m3u8_entry_protocol, preference, quality
                          passed on to _extract_m3u8_formats
        @param mpd_id     passed on to _extract_mpd_formats
        @returns          list of dicts with the keys 'formats', 'subtitles'
                          and 'thumbnail'; tags that yield neither formats
                          nor subtitles are left out
        """
        def absolute_url(item_url):
            # Resolve item_url against the page's base URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a "type" attribute (mimetype with an optional codecs
            # parameter) into format fields; {} when it is missing or unparsable
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats): manifests (m3u8/mpd) are expanded
            # non-fatally, anything else becomes a single direct-URL format
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no content, hence the trailing ''
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        # Then tags with a separate closing tag, capturing the content in between
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # Source given directly on the media tag
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # Sources given through nested <source> tags
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fill in missing dimensions from the label/title text
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the first label that parses as a bitrate, if any
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The fields of the direct-URL format ('url', 'vcodec',
                        # 'ext') override those derived from the attributes
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # Subtitles given through nested <track> tags
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated as subtitles as well
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3262
3263     def _extract_akamai_formats(self, *args, **kwargs):
3264         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3265         if subs:
3266             self._report_ignoring_subs('akamai')
3267         return fmts
3268
3269     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3270         signed = 'hdnea=' in manifest_url
3271         if not signed:
3272             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3273             manifest_url = re.sub(
3274                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3275                 '', manifest_url).strip('?')
3276
3277         formats = []
3278         subtitles = {}
3279
3280         hdcore_sign = 'hdcore=3.7.0'
3281         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3282         hds_host = hosts.get('hds')
3283         if hds_host:
3284             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3285         if 'hdcore=' not in f4m_url:
3286             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3287         f4m_formats = self._extract_f4m_formats(
3288             f4m_url, video_id, f4m_id='hds', fatal=False)
3289         for entry in f4m_formats:
3290             entry.update({'extra_param_to_segment_url': hdcore_sign})
3291         formats.extend(f4m_formats)
3292
3293         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3294         hls_host = hosts.get('hls')
3295         if hls_host:
3296             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3297         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3298             m3u8_url, video_id, 'mp4', 'm3u8_native',
3299             m3u8_id='hls', fatal=False)
3300         formats.extend(m3u8_formats)
3301         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3302
3303         http_host = hosts.get('http')
3304         if http_host and m3u8_formats and not signed:
3305             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3306             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3307             qualities_length = len(qualities)
3308             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3309                 i = 0
3310                 for f in m3u8_formats:
3311                     if f['vcodec'] != 'none':
3312                         for protocol in ('http', 'https'):
3313                             http_f = f.copy()
3314                             del http_f['manifest_url']
3315                             http_url = re.sub(
3316                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3317                             http_f.update({
3318                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3319                                 'url': http_url,
3320                                 'protocol': protocol,
3321                             })
3322                             formats.append(http_f)
3323                         i += 1
3324
3325         return formats, subtitles
3326
3327     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3328         query = urllib.parse.urlparse(url).query
3329         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3330         mobj = re.search(
3331             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3332         url_base = mobj.group('url')
3333         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3334         formats = []
3335
3336         def manifest_url(manifest):
3337             m_url = f'{http_base_url}/{manifest}'
3338             if query:
3339                 m_url += '?%s' % query
3340             return m_url
3341
3342         if 'm3u8' not in skip_protocols:
3343             formats.extend(self._extract_m3u8_formats(
3344                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3345                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3346         if 'f4m' not in skip_protocols:
3347             formats.extend(self._extract_f4m_formats(
3348                 manifest_url('manifest.f4m'),
3349                 video_id, f4m_id='hds', fatal=False))
3350         if 'dash' not in skip_protocols:
3351             formats.extend(self._extract_mpd_formats(
3352                 manifest_url('manifest.mpd'),
3353                 video_id, mpd_id='dash', fatal=False))
3354         if re.search(r'(?:/smil:|\.smil)', url_base):
3355             if 'smil' not in skip_protocols:
3356                 rtmp_formats = self._extract_smil_formats(
3357                     manifest_url('jwplayer.smil'),
3358                     video_id, fatal=False)
3359                 for rtmp_format in rtmp_formats:
3360                     rtsp_format = rtmp_format.copy()
3361                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3362                     del rtsp_format['play_path']
3363                     del rtsp_format['ext']
3364                     rtsp_format.update({
3365                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3366                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3367                         'protocol': 'rtsp',
3368                     })
3369                     formats.extend([rtmp_format, rtsp_format])
3370         else:
3371             for protocol in ('rtmp', 'rtsp'):
3372                 if protocol not in skip_protocols:
3373                     formats.append({
3374                         'url': f'{protocol}:{url_base}',
3375                         'format_id': protocol,
3376                         'protocol': protocol,
3377                     })
3378         return formats
3379
3380     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3381         mobj = re.search(
3382             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3383             webpage)
3384         if mobj:
3385             try:
3386                 jwplayer_data = self._parse_json(mobj.group('options'),
3387                                                  video_id=video_id,
3388                                                  transform_source=transform_source)
3389             except ExtractorError:
3390                 pass
3391             else:
3392                 if isinstance(jwplayer_data, dict):
3393                     return jwplayer_data
3394
3395     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3396         jwplayer_data = self._find_jwplayer_data(
3397             webpage, video_id, transform_source=js_to_json)
3398         return self._parse_jwplayer_data(
3399             jwplayer_data, video_id, *args, **kwargs)
3400
3401     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3402                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3403         entries = []
3404         if not isinstance(jwplayer_data, dict):
3405             return entries
3406
3407         playlist_items = jwplayer_data.get('playlist')
3408         # JWPlayer backward compatibility: single playlist item/flattened playlists
3409         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3410         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3411         if not isinstance(playlist_items, list):
3412             playlist_items = (playlist_items or jwplayer_data, )
3413
3414         for video_data in playlist_items:
3415             if not isinstance(video_data, dict):
3416                 continue
3417             # JWPlayer backward compatibility: flattened sources
3418             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3419             if 'sources' not in video_data:
3420                 video_data['sources'] = [video_data]
3421
3422             this_video_id = video_id or video_data['mediaid']
3423
3424             formats = self._parse_jwplayer_formats(
3425                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3426                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3427
3428             subtitles = {}
3429             tracks = video_data.get('tracks')
3430             if tracks and isinstance(tracks, list):
3431                 for track in tracks:
3432                     if not isinstance(track, dict):
3433                         continue
3434                     track_kind = track.get('kind')
3435                     if not track_kind or not isinstance(track_kind, str):
3436                         continue
3437                     if track_kind.lower() not in ('captions', 'subtitles'):
3438                         continue
3439                     track_url = urljoin(base_url, track.get('file'))
3440                     if not track_url:
3441                         continue
3442                     subtitles.setdefault(track.get('label') or 'en', []).append({
3443                         'url': self._proto_relative_url(track_url)
3444                     })
3445
3446             entry = {
3447                 'id': this_video_id,
3448                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3449                 'description': clean_html(video_data.get('description')),
3450                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3451                 'timestamp': int_or_none(video_data.get('pubdate')),
3452                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3453                 'subtitles': subtitles,
3454                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3455                 'genre': clean_html(video_data.get('genre')),
3456                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3457                 'season_number': int_or_none(video_data.get('season')),
3458                 'episode_number': int_or_none(video_data.get('episode')),
3459                 'release_year': int_or_none(video_data.get('releasedate')),
3460                 'age_limit': int_or_none(video_data.get('age_restriction')),
3461             }
3462             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3463             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3464                 entry.update({
3465                     '_type': 'url_transparent',
3466                     'url': formats[0]['url'],
3467                 })
3468             else:
3469                 entry['formats'] = formats
3470             entries.append(entry)
3471         if len(entries) == 1:
3472             return entries[0]
3473         else:
3474             return self.playlist_result(entries)
3475
3476     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3477                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3478         urls = set()
3479         formats = []
3480         for source in jwplayer_sources_data:
3481             if not isinstance(source, dict):
3482                 continue
3483             source_url = urljoin(
3484                 base_url, self._proto_relative_url(source.get('file')))
3485             if not source_url or source_url in urls:
3486                 continue
3487             urls.add(source_url)
3488             source_type = source.get('type') or ''
3489             ext = mimetype2ext(source_type) or determine_ext(source_url)
3490             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3491                 formats.extend(self._extract_m3u8_formats(
3492                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3493                     m3u8_id=m3u8_id, fatal=False))
3494             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3495                 formats.extend(self._extract_mpd_formats(
3496                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3497             elif ext == 'smil':
3498                 formats.extend(self._extract_smil_formats(
3499                     source_url, video_id, fatal=False))
3500             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3501             elif source_type.startswith('audio') or ext in (
3502                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3503                 formats.append({
3504                     'url': source_url,
3505                     'vcodec': 'none',
3506                     'ext': ext,
3507                 })
3508             else:
3509                 format_id = str_or_none(source.get('label'))
3510                 height = int_or_none(source.get('height'))
3511                 if height is None and format_id:
3512                     # Often no height is provided but there is a label in
3513                     # format like "1080p", "720p SD", or 1080.
3514                     height = parse_resolution(format_id).get('height')
3515                 a_format = {
3516                     'url': source_url,
3517                     'width': int_or_none(source.get('width')),
3518                     'height': height,
3519                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3520                     'filesize': int_or_none(source.get('filesize')),
3521                     'ext': ext,
3522                     'format_id': format_id
3523                 }
3524                 if source_url.startswith('rtmp'):
3525                     a_format['ext'] = 'flv'
3526                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3527                     # of jwplayer.flash.swf
3528                     rtmp_url_parts = re.split(
3529                         r'((?:mp4|mp3|flv):)', source_url, 1)
3530                     if len(rtmp_url_parts) == 3:
3531                         rtmp_url, prefix, play_path = rtmp_url_parts
3532                         a_format.update({
3533                             'url': rtmp_url,
3534                             'play_path': prefix + play_path,
3535                         })
3536                     if rtmp_params:
3537                         a_format.update(rtmp_params)
3538                 formats.append(a_format)
3539         return formats
3540
3541     def _live_title(self, name):
3542         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3543         return name
3544
3545     def _int(self, v, name, fatal=False, **kwargs):
3546         res = int_or_none(v, **kwargs)
3547         if res is None:
3548             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3549             if fatal:
3550                 raise ExtractorError(msg)
3551             else:
3552                 self.report_warning(msg)
3553         return res
3554
3555     def _float(self, v, name, fatal=False, **kwargs):
3556         res = float_or_none(v, **kwargs)
3557         if res is None:
3558             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3559             if fatal:
3560                 raise ExtractorError(msg)
3561             else:
3562                 self.report_warning(msg)
3563         return res
3564
3565     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3566                     path='/', secure=False, discard=False, rest={}, **kwargs):
3567         cookie = http.cookiejar.Cookie(
3568             0, name, value, port, port is not None, domain, True,
3569             domain.startswith('.'), path, True, secure, expire_time,
3570             discard, None, None, rest)
3571         self.cookiejar.set_cookie(cookie)
3572
3573     def _get_cookies(self, url):
3574         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3575         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3576
3577     def _apply_first_set_cookie_header(self, url_handle, cookie):
3578         """
3579         Apply first Set-Cookie header instead of the last. Experimental.
3580
3581         Some sites (e.g. [1-3]) may serve two cookies under the same name
3582         in Set-Cookie header and expect the first (old) one to be set rather
3583         than second (new). However, as of RFC6265 the newer one cookie
3584         should be set into cookie store what actually happens.
3585         We will workaround this issue by resetting the cookie to
3586         the first one manually.
3587         1. https://new.vk.com/
3588         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3589         3. https://learning.oreilly.com/
3590         """
3591         for header, cookies in url_handle.headers.items():
3592             if header.lower() != 'set-cookie':
3593                 continue
3594             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3595             cookie_value = re.search(
3596                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3597             if cookie_value:
3598                 value, domain = cookie_value.groups()
3599                 self._set_cookie(domain, cookie, value)
3600                 break
3601
3602     @classmethod
3603     def get_testcases(cls, include_onlymatching=False):
3604         # Do not look in super classes
3605         t = vars(cls).get('_TEST')
3606         if t:
3607             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3608             tests = [t]
3609         else:
3610             tests = vars(cls).get('_TESTS', [])
3611         for t in tests:
3612             if not include_onlymatching and t.get('only_matching', False):
3613                 continue
3614             t['name'] = cls.ie_key()
3615             yield t
3616         if getattr(cls, '__wrapped__', None):
3617             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3618
3619     @classmethod
3620     def get_webpage_testcases(cls):
3621         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3622         for t in tests:
3623             t['name'] = cls.ie_key()
3624             yield t
3625         if getattr(cls, '__wrapped__', None):
3626             yield from cls.__wrapped__.get_webpage_testcases()
3627
3628     @classproperty(cache=True)
3629     def age_limit(cls):
3630         """Get age limit from the testcases"""
3631         return max(traverse_obj(
3632             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3633             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3634
3635     @classproperty(cache=True)
3636     def _RETURN_TYPE(cls):
3637         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3638         tests = tuple(cls.get_testcases(include_onlymatching=False))
3639         if not tests:
3640             return None
3641         elif not any(k.startswith('playlist') for test in tests for k in test):
3642             return 'video'
3643         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3644             return 'playlist'
3645         return 'any'
3646
3647     @classmethod
3648     def is_single_video(cls, url):
3649         """Returns whether the URL is of a single video, None if unknown"""
3650         if cls.suitable(url):
3651             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3652
3653     @classmethod
3654     def is_suitable(cls, age_limit):
3655         """Test whether the extractor is generally suitable for the given age limit"""
3656         return not age_restricted(cls.age_limit, age_limit)
3657
3658     @classmethod
3659     def description(cls, *, markdown=True, search_examples=None):
3660         """Description of the extractor"""
3661         desc = ''
3662         if cls._NETRC_MACHINE:
3663             if markdown:
3664                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3665             else:
3666                 desc += f' [{cls._NETRC_MACHINE}]'
3667         if cls.IE_DESC is False:
3668             desc += ' [HIDDEN]'
3669         elif cls.IE_DESC:
3670             desc += f' {cls.IE_DESC}'
3671         if cls.SEARCH_KEY:
3672             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3673             if search_examples:
3674                 _COUNTS = ('', '5', '10', 'all')
3675                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3676         if not cls.working():
3677             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3678
3679         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3680         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3681         return f'{name}:{desc}' if desc else name
3682
3683     def extract_subtitles(self, *args, **kwargs):
3684         if (self.get_param('writesubtitles', False)
3685                 or self.get_param('listsubtitles')):
3686             return self._get_subtitles(*args, **kwargs)
3687         return {}
3688
3689     def _get_subtitles(self, *args, **kwargs):
3690         raise NotImplementedError('This method must be implemented by subclasses')
3691
    class CommentsDisabled(Exception):
        """Raise in _get_comments if comments are disabled for the video

        extract_comments catches this exception and then reports both
        'comments' and 'comment_count' as None.
        """
3694
3695     def extract_comments(self, *args, **kwargs):
3696         if not self.get_param('getcomments'):
3697             return None
3698         generator = self._get_comments(*args, **kwargs)
3699
3700         def extractor():
3701             comments = []
3702             interrupted = True
3703             try:
3704                 while True:
3705                     comments.append(next(generator))
3706             except StopIteration:
3707                 interrupted = False
3708             except KeyboardInterrupt:
3709                 self.to_screen('Interrupted by user')
3710             except self.CommentsDisabled:
3711                 return {'comments': None, 'comment_count': None}
3712             except Exception as e:
3713                 if self.get_param('ignoreerrors') is not True:
3714                     raise
3715                 self._downloader.report_error(e)
3716             comment_count = len(comments)
3717             self.to_screen(f'Extracted {comment_count} comments')
3718             return {
3719                 'comments': comments,
3720                 'comment_count': None if interrupted else comment_count
3721             }
3722         return extractor
3723
3724     def _get_comments(self, *args, **kwargs):
3725         raise NotImplementedError('This method must be implemented by subclasses')
3726
3727     @staticmethod
3728     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3729         """ Merge subtitle items for one language. Items with duplicated URLs/data
3730         will be dropped. """
3731         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3732         ret = list(subtitle_list1)
3733         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3734         return ret
3735
3736     @classmethod
3737     def _merge_subtitles(cls, *dicts, target=None):
3738         """ Merge subtitle dictionaries, language by language. """
3739         if target is None:
3740             target = {}
3741         for d in dicts:
3742             for lang, subs in d.items():
3743                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3744         return target
3745
3746     def extract_automatic_captions(self, *args, **kwargs):
3747         if (self.get_param('writeautomaticsub', False)
3748                 or self.get_param('listsubtitles')):
3749             return self._get_automatic_captions(*args, **kwargs)
3750         return {}
3751
3752     def _get_automatic_captions(self, *args, **kwargs):
3753         raise NotImplementedError('This method must be implemented by subclasses')
3754
3755     @functools.cached_property
3756     def _cookies_passed(self):
3757         """Whether cookies have been passed to YoutubeDL"""
3758         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3759
3760     def mark_watched(self, *args, **kwargs):
3761         if not self.get_param('mark_watched', False):
3762             return
3763         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3764             self._mark_watched(*args, **kwargs)
3765
3766     def _mark_watched(self, *args, **kwargs):
3767         raise NotImplementedError('This method must be implemented by subclasses')
3768
3769     def geo_verification_headers(self):
3770         headers = {}
3771         geo_verification_proxy = self.get_param('geo_verification_proxy')
3772         if geo_verification_proxy:
3773             headers['Ytdl-request-proxy'] = geo_verification_proxy
3774         return headers
3775
3776     @staticmethod
3777     def _generic_id(url):
3778         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3779
3780     def _generic_title(self, url='', webpage='', *, default=None):
3781         return (self._og_search_title(webpage, default=None)
3782                 or self._html_extract_title(webpage, default=None)
3783                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3784                 or default)
3785
3786     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3787         if not duration:
3788             return
3789         chapter_list = [{
3790             'start_time': start_function(chapter),
3791             'title': title_function(chapter),
3792         } for chapter in chapter_list or []]
3793         if strict:
3794             warn = self.report_warning
3795         else:
3796             warn = self.write_debug
3797             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3798
3799         chapters = [{'start_time': 0}]
3800         for idx, chapter in enumerate(chapter_list):
3801             if chapter['start_time'] is None:
3802                 warn(f'Incomplete chapter {idx}')
3803             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3804                 chapters.append(chapter)
3805             elif chapter not in chapters:
3806                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3807                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3808                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3809         return chapters[1:]
3810
3811     def _extract_chapters_from_description(self, description, duration):
3812         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3813         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3814         return self._extract_chapters_helper(
3815             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3816             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3817             duration=duration, strict=False) or self._extract_chapters_helper(
3818             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3819             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3820             duration=duration, strict=False)
3821
3822     @staticmethod
3823     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3824         all_known = all(map(
3825             lambda x: x is not None,
3826             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3827         return (
3828             'private' if is_private
3829             else 'premium_only' if needs_premium
3830             else 'subscriber_only' if needs_subscription
3831             else 'needs_auth' if needs_auth
3832             else 'unlisted' if is_unlisted
3833             else 'public' if all_known
3834             else None)
3835
3836     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3837         '''
3838         @returns            A list of values for the extractor argument given by "key"
3839                             or "default" if no such key is present
3840         @param default      The default value to return when the key is not present (default: [])
3841         @param casesense    When false, the values are converted to lower case
3842         '''
3843         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3844         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3845         if val is None:
3846             return [] if default is NO_DEFAULT else default
3847         return list(val) if casesense else [x.lower() for x in val]
3848
3849     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3850         if not playlist_id or not video_id:
3851             return not video_id
3852
3853         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3854         if no_playlist is not None:
3855             return not no_playlist
3856
3857         video_id = '' if video_id is True else f' {video_id}'
3858         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3859         if self.get_param('noplaylist'):
3860             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3861             return False
3862         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3863         return True
3864
3865     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3866         RetryManager.report_retry(
3867             err, _count or int(fatal), _retries,
3868             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3869             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3870
3871     def RetryManager(self, **kwargs):
3872         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3873
3874     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3875         display_id = traverse_obj(info_dict, 'display_id', 'id')
3876         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3877         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3878             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3879
3880     @classmethod
3881     def extract_from_webpage(cls, ydl, url, webpage):
3882         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3883               else ydl.get_info_extractor(cls.ie_key()))
3884         for info in ie._extract_from_webpage(url, webpage) or []:
3885             # url = None since we do not want to set (webpage/original)_url
3886             ydl.add_default_extra_info(info, ie, None)
3887             yield info
3888
3889     @classmethod
3890     def _extract_from_webpage(cls, url, webpage):
3891         for embed_url in orderedSet(
3892                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3893             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3894
3895     @classmethod
3896     def _extract_embed_urls(cls, url, webpage):
3897         """@returns all the embed urls on the webpage"""
3898         if '_EMBED_URL_RE' not in cls.__dict__:
3899             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3900             for idx, regex in enumerate(cls._EMBED_REGEX):
3901                 assert regex.count('(?P<url>') == 1, \
3902                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3903             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3904
3905         for regex in cls._EMBED_URL_RE:
3906             for mobj in regex.finditer(webpage):
3907                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3908                 if cls._VALID_URL is False or cls.suitable(embed_url):
3909                     yield embed_url
3910
    class StopExtraction(Exception):
        # NOTE(review): not raised or caught anywhere in this part of the file;
        # presumably a signal for ending an extraction early - confirm against callers
        pass
3913
3914     @classmethod
3915     def _extract_url(cls, webpage):  # TODO: Remove
3916         """Only for compatibility with some older extractors"""
3917         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3918
3919     @classmethod
3920     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3921         if plugin_name:
3922             mro = inspect.getmro(cls)
3923             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3924             cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3925             cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3926             while getattr(super_class, '__wrapped__', None):
3927                 super_class = super_class.__wrapped__
3928             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3929             _PLUGIN_OVERRIDES[super_class].append(cls)
3930
3931         return super().__init_subclass__(**kwargs)
3932
3933
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs of the form _SEARCH_KEY<prefix>:<query>, where the prefix
    is empty (one result), a positive number (that many results) or "all".
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Translate the prefix of the search URL into a number of results to fetch"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Cap the request at what the site can return
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # An infinite limit means "do not truncate"
        stop = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, stop), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3977
3978
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose _VALID_URL matches everything and whose extraction
    always raises UnsupportedError for the given URL."""

    _VALID_URL = '.*'
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        """Always raise UnsupportedError(url)"""
        raise UnsupportedError(url)
3986
3987
# Maps an overridden extractor class (the innermost wrapped one) to the list of
# plugin classes overriding it; filled in by __init_subclass__ above whenever a
# subclass is declared with `plugin_name`
_PLUGIN_OVERRIDES = collections.defaultdict(list)