yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import functools
   4 import getpass
   5 import hashlib
   6 import http.client
   7 import http.cookiejar
   8 import http.cookies
   9 import inspect
  10 import itertools
  11 import json
  12 import math
  13 import netrc
  14 import os
  15 import random
  16 import re
  17 import subprocess
  18 import sys
  19 import time
  20 import types
  21 import urllib.parse
  22 import urllib.request
  23 import xml.etree.ElementTree
  24
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..networking.impersonate import ImpersonateTarget
  41 from ..utils import (
  42     IDENTITY,
  43     JSON_LD_RE,
  44     NO_DEFAULT,
  45     ExtractorError,
  46     FormatSorter,
  47     GeoRestrictedError,
  48     GeoUtils,
  49     LenientJSONDecoder,
  50     Popen,
  51     RegexNotFoundError,
  52     RetryManager,
  53     UnsupportedError,
  54     age_restricted,
  55     base_url,
  56     bug_reports_message,
  57     classproperty,
  58     clean_html,
  59     deprecation_warning,
  60     determine_ext,
  61     dict_get,
  62     encode_data_uri,
  63     extract_attributes,
  64     filter_dict,
  65     fix_xml_ampersands,
  66     float_or_none,
  67     format_field,
  68     int_or_none,
  69     join_nonempty,
  70     js_to_json,
  71     mimetype2ext,
  72     netrc_from_content,
  73     orderedSet,
  74     parse_bitrate,
  75     parse_codecs,
  76     parse_duration,
  77     parse_iso8601,
  78     parse_m3u8_attributes,
  79     parse_resolution,
  80     sanitize_filename,
  81     sanitize_url,
  82     smuggle_url,
  83     str_or_none,
  84     str_to_int,
  85     strip_or_none,
  86     traverse_obj,
  87     truncate_string,
  88     try_call,
  89     try_get,
  90     unescapeHTML,
  91     unified_strdate,
  92     unified_timestamp,
  93     url_basename,
  94     url_or_none,
  95     urlhandle_detect_ext,
  96     urljoin,
  97     variadic,
  98     xpath_element,
  99     xpath_text,
 100     xpath_with_ns,
 101 )
 102
 103
 104 class InfoExtractor:
 105     """Information Extractor class.
 106
 107     Information extractors are the classes that, given a URL, extract
 108     information about the video (or videos) the URL refers to. This
 109     information includes the real video URL, the video title, author and
 110     others. The information is stored in a dictionary which is then
 111     passed to the YoutubeDL. The YoutubeDL processes this
 112     information possibly downloading the video to the file system, among
 113     other possible outcomes.
 114
 115     The type field determines the type of the result.
 116     By far the most common value (and the default if _type is missing) is
 117     "video", which indicates a single video.
 118
 119     For a video, the dictionaries must include the following fields:
 120
 121     id:             Video identifier.
 122     title:          Video title, unescaped. Set to an empty string if video has
 123                     no title as opposed to "None" which signifies that the
 124                     extractor failed to obtain a title
 125
 126     Additionally, it must contain either a formats entry or a url one:
 127
 128     formats:        A list of dictionaries for each format available, ordered
 129                     from worst to best quality.
 130
 131                     Potential fields:
 132                     * url        The mandatory URL representing the media:
 133                                    for plain file media - HTTP URL of this file,
 134                                    for RTMP - RTMP URL,
 135                                    for HLS - URL of the M3U8 media playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH
 138                                      - HTTP URL to plain file media (in case of
 139                                        unfragmented media)
 140                                      - URL of the MPD manifest or base URL
 141                                        representing the media if MPD manifest
 142                                        is parsed from a string (in case of
 143                                        fragmented media)
 144                                    for MSS - URL of the ISM manifest.
 145                     * request_data  Data to send in POST request to the URL
 146                     * manifest_url
 147                                  The URL of the manifest file in case of
 148                                  fragmented media:
 149                                    for HLS - URL of the M3U8 master playlist,
 150                                    for HDS - URL of the F4M manifest,
 151                                    for DASH - URL of the MPD manifest,
 152                                    for MSS - URL of the ISM manifest.
 153                     * manifest_stream_number  (For internal use only)
 154                                  The index of the stream in the manifest file
 155                     * ext        Will be calculated from URL if missing
 156                     * format     A human-readable description of the format
 157                                  ("mp4 container with h264/opus").
 158                                  Calculated from the format_id, width, height.
 159                                  and format_note fields if missing.
 160                     * format_id  A short description of the format
 161                                  ("mp4_h264_opus" or "19").
 162                                 Technically optional, but strongly recommended.
 163                     * format_note Additional info about the format
 164                                  ("3D" or "DASH video")
 165                     * width      Width of the video, if known
 166                     * height     Height of the video, if known
 167                     * aspect_ratio  Aspect ratio of the video, if known
 168                                  Automatically calculated from width and height
 169                     * resolution Textual description of width and height
 170                                  Automatically calculated from width and height
 171                     * dynamic_range The dynamic range of the video. One of:
 172                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 173                     * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
 174                     * abr        Average audio bitrate in kbps (1000 bits/sec)
 175                     * acodec     Name of the audio codec in use
 176                     * asr        Audio sampling rate in Hertz
 177                     * audio_channels  Number of audio channels
 178                     * vbr        Average video bitrate in kbps (1000 bits/sec)
 179                     * fps        Frame rate
 180                     * vcodec     Name of the video codec in use
 181                     * container  Name of the container format
 182                     * filesize   The number of bytes, if known in advance
 183                     * filesize_approx  An estimate for the number of bytes
 184                     * player_url SWF Player URL (used for rtmpdump).
 185                     * protocol   The protocol that will be used for the actual
 186                                  download, lower-case. One of "http", "https" or
 187                                  one of the protocols defined in downloader.PROTOCOL_MAP
 188                     * fragment_base_url
 189                                  Base URL for fragments. Each fragment's path
 190                                  value (if present) will be relative to
 191                                  this URL.
 192                     * fragments  A list of fragments of a fragmented media.
 193                                  Each fragment entry must contain either an url
 194                                  or a path. If an url is present it should be
 195                                  considered by a client. Otherwise both path and
 196                                  fragment_base_url must be present. Here is
 197                                  the list of all potential fields:
 198                                  * "url" - fragment's URL
 199                                  * "path" - fragment's path relative to
 200                                             fragment_base_url
 201                                  * "duration" (optional, int or float)
 202                                  * "filesize" (optional, int)
 203                     * is_from_start  Is a live format that can be downloaded
 204                                 from the start. Boolean
 205                     * preference Order number of this format. If this field is
 206                                  present and not None, the formats get sorted
 207                                  by this field, regardless of all other values.
 208                                  -1 for default (order by other properties),
 209                                  -2 or smaller for less than default.
 210                                  < -1000 to hide the format (if there is
 211                                     another one which is strictly better)
 212                     * language   Language code, e.g. "de" or "en-US".
 213                     * language_preference  Is this in the language mentioned in
 214                                  the URL?
 215                                  10 if it's what the URL is about,
 216                                  -1 for default (don't know),
 217                                  -10 otherwise, other values reserved for now.
 218                     * quality    Order number of the video quality of this
 219                                  format, irrespective of the file format.
 220                                  -1 for default (order by other properties),
 221                                  -2 or smaller for less than default.
 222                     * source_preference  Order number for this video source
 223                                   (quality takes higher priority)
 224                                  -1 for default (order by other properties),
 225                                  -2 or smaller for less than default.
 226                     * http_headers  A dictionary of additional HTTP headers
 227                                  to add to the request.
 228                     * stretched_ratio  If given and not 1, indicates that the
 229                                  video's pixels are not square.
 230                                  width : height ratio as float.
 231                     * no_resume  The server does not support resuming the
 232                                  (HTTP or RTMP) download. Boolean.
 233                     * has_drm    True if the format has DRM and cannot be downloaded.
 234                                  'maybe' if the format may have DRM and has to be tested before download.
 235                     * extra_param_to_segment_url  A query string to append to each
 236                                  fragment's URL, or to update each existing query string
 237                                  with. If it is an HLS stream with an AES-128 decryption key,
 238                                  the query paramaters will be passed to the key URI as well,
 239                                  unless there is an `extra_param_to_key_url` given,
 240                                  or unless an external key URI is provided via `hls_aes`.
 241                                  Only applied by the native HLS/DASH downloaders.
 242                     * extra_param_to_key_url  A query string to append to the URL
 243                                  of the format's HLS AES-128 decryption key.
 244                                  Only applied by the native HLS downloader.
 245                     * hls_aes    A dictionary of HLS AES-128 decryption information
 246                                  used by the native HLS downloader to override the
 247                                  values in the media playlist when an '#EXT-X-KEY' tag
 248                                  is present in the playlist:
 249                                  * uri  The URI from which the key will be downloaded
 250                                  * key  The key (as hex) used to decrypt fragments.
 251                                         If `key` is given, any key URI will be ignored
 252                                  * iv   The IV (as hex) used to decrypt fragments
 253                     * downloader_options  A dictionary of downloader options
 254                                  (For internal use only)
 255                                  * http_chunk_size Chunk size for HTTP downloads
 256                                  * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
 257                                  * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
 258                     * is_dash_periods  Whether the format is a result of merging
 259                                  multiple DASH periods.
 260                     RTMP formats can also have the additional fields: page_url,
 261                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 262                     rtmp_protocol, rtmp_real_time
 263
 264     url:            Final video URL.
 265     ext:            Video filename extension.
 266     format:         The video format, defaults to ext (used for --get-format)
 267     player_url:     SWF Player URL (used for rtmpdump).
 268
 269     The following fields are optional:
 270
 271     direct:         True if a direct video file was given (must only be set by GenericIE)
 272     alt_title:      A secondary title of the video.
 273     display_id:     An alternative identifier for the video, not necessarily
 274                     unique, but available before title. Typically, id is
 275                     something like "4234987", title "Dancing naked mole rats",
 276                     and display_id "dancing-naked-mole-rats"
 277     thumbnails:     A list of dictionaries, with the following entries:
 278                         * "id" (optional, string) - Thumbnail format ID
 279                         * "url"
 280                         * "preference" (optional, int) - quality of the image
 281                         * "width" (optional, int)
 282                         * "height" (optional, int)
 283                         * "resolution" (optional, string "{width}x{height}",
 284                                         deprecated)
 285                         * "filesize" (optional, int)
 286                         * "http_headers" (dict) - HTTP headers for the request
 287     thumbnail:      Full URL to a video thumbnail image.
 288     description:    Full video description.
 289     uploader:       Full name of the video uploader.
 290     license:        License name the video is licensed under.
 291     creators:       List of creators of the video.
 292     timestamp:      UNIX timestamp of the moment the video was uploaded
 293     upload_date:    Video upload date in UTC (YYYYMMDD).
 294                     If not explicitly set, calculated from timestamp
 295     release_timestamp: UNIX timestamp of the moment the video was released.
 296                     If it is not clear whether to use timestamp or this, use the former
 297     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 298                     If not explicitly set, calculated from release_timestamp
 299     release_year:   Year (YYYY) as integer when the video or album was released.
 300                     To be used if no exact release date is known.
 301                     If not explicitly set, calculated from release_date.
 302     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 303     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 304                     If not explicitly set, calculated from modified_timestamp
 305     uploader_id:    Nickname or id of the video uploader.
 306     uploader_url:   Full URL to a personal webpage of the video uploader.
 307     channel:        Full name of the channel the video is uploaded on.
 308                     Note that channel fields may or may not repeat uploader
 309                     fields. This depends on a particular extractor.
 310     channel_id:     Id of the channel.
 311     channel_url:    Full URL to a channel webpage.
 312     channel_follower_count: Number of followers of the channel.
 313     channel_is_verified: Whether the channel is verified on the platform.
 314     location:       Physical location where the video was filmed.
 315     subtitles:      The available subtitles as a dictionary in the format
 316                     {tag: subformats}. "tag" is usually a language code, and
 317                     "subformats" is a list sorted from lower to higher
 318                     preference, each element is a dictionary with the "ext"
 319                     entry and one of:
 320                         * "data": The subtitles file contents
 321                         * "url": A URL pointing to the subtitles file
 322                     It can optionally also have:
 323                         * "name": Name or description of the subtitles
 324                         * "http_headers": A dictionary of additional HTTP headers
 325                                   to add to the request.
 326                     "ext" will be calculated from URL if missing
 327     automatic_captions: Like 'subtitles'; contains automatically generated
 328                     captions instead of normal subtitles
 329     duration:       Length of the video in seconds, as an integer or float.
 330     view_count:     How many users have watched the video on the platform.
 331     concurrent_view_count: How many users are currently watching the video on the platform.
 332     like_count:     Number of positive ratings of the video
 333     dislike_count:  Number of negative ratings of the video
 334     repost_count:   Number of reposts of the video
 335     average_rating: Average rating give by users, the scale used depends on the webpage
 336     comment_count:  Number of comments on the video
 337     comments:       A list of comments, each with one or more of the following
 338                     properties (all but one of text or html optional):
 339                         * "author" - human-readable name of the comment author
 340                         * "author_id" - user ID of the comment author
 341                         * "author_thumbnail" - The thumbnail of the comment author
 342                         * "author_url" - The url to the comment author's page
 343                         * "author_is_verified" - Whether the author is verified
 344                                                  on the platform
 345                         * "author_is_uploader" - Whether the comment is made by
 346                                                  the video uploader
 347                         * "id" - Comment ID
 348                         * "html" - Comment as HTML
 349                         * "text" - Plain text of the comment
 350                         * "timestamp" - UNIX timestamp of comment
 351                         * "parent" - ID of the comment this one is replying to.
 352                                      Set to "root" to indicate that this is a
 353                                      comment to the original video.
 354                         * "like_count" - Number of positive ratings of the comment
 355                         * "dislike_count" - Number of negative ratings of the comment
 356                         * "is_favorited" - Whether the comment is marked as
 357                                            favorite by the video uploader
 358                         * "is_pinned" - Whether the comment is pinned to
 359                                         the top of the comments
 360     age_limit:      Age restriction for the video, as an integer (years)
 361     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 362                     should allow to get the same result again. (It will be set
 363                     by YoutubeDL if it's missing)
 364     categories:     A list of categories that the video falls in, for example
 365                     ["Sports", "Berlin"]
 366     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 367     cast:           A list of the video cast
 368     is_live:        True, False, or None (=unknown). Whether this video is a
 369                     live stream that goes on instead of a fixed-length video.
 370     was_live:       True, False, or None (=unknown). Whether this video was
 371                     originally a live stream.
 372     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 373                     or 'post_live' (was live, but VOD is not yet processed)
 374                     If absent, automatically set from is_live, was_live
 375     start_time:     Time in seconds where the reproduction should start, as
 376                     specified in the URL.
 377     end_time:       Time in seconds where the reproduction should end, as
 378                     specified in the URL.
 379     chapters:       A list of dictionaries, with the following entries:
 380                         * "start_time" - The start time of the chapter in seconds
 381                         * "end_time" - The end time of the chapter in seconds
 382                         * "title" (optional, string)
 383     heatmap:        A list of dictionaries, with the following entries:
 384                         * "start_time" - The start time of the data point in seconds
 385                         * "end_time" - The end time of the data point in seconds
 386                         * "value" - The normalized value of the data point (float between 0 and 1)
 387     playable_in_embed: Whether this video is allowed to play in embedded
 388                     players on other sites. Can be True (=always allowed),
 389                     False (=never allowed), None (=unknown), or a string
 390                     specifying the criteria for embedability; e.g. 'whitelist'
 391     availability:   Under what condition the video is available. One of
 392                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 393                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 394                     to set it
 395     media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
 396     _old_archive_ids: A list of old archive ids needed for backward compatibility
 397     _format_sort_fields: A list of fields to use for sorting formats
 398     __post_extractor: A function to be called just before the metadata is
 399                     written to either disk, logger or console. The function
 400                     must return a dict which will be added to the info_dict.
 401                     This is usefull for additional information that is
 402                     time-consuming to extract. Note that the fields thus
 403                     extracted will not be available to output template and
 404                     match_filter. So, only "comments" and "comment_count" are
 405                     currently allowed to be extracted via this method.
 406
 407     The following fields should only be used when the video belongs to some logical
 408     chapter or section:
 409
 410     chapter:        Name or title of the chapter the video belongs to.
 411     chapter_number: Number of the chapter the video belongs to, as an integer.
 412     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 413
 414     The following fields should only be used when the video is an episode of some
 415     series, programme or podcast:
 416
 417     series:         Title of the series or programme the video episode belongs to.
 418     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 419     season:         Title of the season the video episode belongs to.
 420     season_number:  Number of the season the video episode belongs to, as an integer.
 421     season_id:      Id of the season the video episode belongs to, as a unicode string.
 422     episode:        Title of the video episode. Unlike mandatory video title field,
 423                     this field should denote the exact title of the video episode
 424                     without any kind of decoration.
 425     episode_number: Number of the video episode within a season, as an integer.
 426     episode_id:     Id of the video episode, as a unicode string.
 427
 428     The following fields should only be used when the media is a track or a part of
 429     a music album:
 430
 431     track:          Title of the track.
 432     track_number:   Number of the track within an album or a disc, as an integer.
 433     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 434                     as a unicode string.
 435     artists:        List of artists of the track.
 436     composers:      List of composers of the piece.
 437     genres:         List of genres of the track.
 438     album:          Title of the album the track belongs to.
 439     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 440     album_artists:  List of all artists appeared on the album.
 441                     E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
 442                     Useful for splits and compilations.
 443     disc_number:    Number of the disc or other physical medium the track belongs to,
 444                     as an integer.
 445
 446     The following fields should only be set for clips that should be cut from the original video:
 447
 448     section_start:  Start time of the section in seconds
 449     section_end:    End time of the section in seconds
 450
 451     The following fields should only be set for storyboards:
 452     rows:           Number of rows in each storyboard fragment, as an integer
 453     columns:        Number of columns in each storyboard fragment, as an integer
 454
 455     The following fields are deprecated and should not be set by new code:
 456     composer:       Use "composers" instead.
 457                     Composer(s) of the piece, comma-separated.
 458     artist:         Use "artists" instead.
 459                     Artist(s) of the track, comma-separated.
 460     genre:          Use "genres" instead.
 461                     Genre(s) of the track, comma-separated.
 462     album_artist:   Use "album_artists" instead.
 463                     All artists appeared on the album, comma-separated.
 464     creator:        Use "creators" instead.
 465                     The creator of the video.
 466
 467     Unless mentioned otherwise, the fields should be Unicode strings.
 468
 469     Unless mentioned otherwise, None is equivalent to absence of information.
 470
 471
 472     _type "playlist" indicates multiple videos.
 473     There must be a key "entries", which is a list, an iterable, or a PagedList
 474     object, each element of which is a valid dictionary by this specification.
 475
 476     Additionally, playlists can have "id", "title", and any other relevant
 477     attributes with the same semantics as videos (see above).
 478
 479     It can also have the following optional fields:
 480
 481     playlist_count: The total number of videos in a playlist. If not given,
 482                     YoutubeDL tries to calculate it from "entries"
 483
 484
 485     _type "multi_video" indicates that there are multiple videos that
 486     form a single show, for examples multiple acts of an opera or TV episode.
 487     It must have an entries key like a playlist and contain all the keys
 488     required for a video at the same time.
 489
 490
 491     _type "url" indicates that the video must be extracted from another
 492     location, possibly by a different extractor. Its only required key is:
 493     "url" - the next URL to extract.
 494     The key "ie_key" can be set to the class name (minus the trailing "IE",
 495     e.g. "Youtube") if the extractor class is known in advance.
 496     Additionally, the dictionary may have any properties of the resolved entity
 497     known in advance, for example "title" if the title of the referred video is
 498     known ahead of time.
 499
 500
 501     _type "url_transparent" entities have the same specification as "url", but
 502     indicate that the given additional information is more precise than the one
 503     associated with the resolved URL.
 504     This is useful when a site employs a video service that hosts the video and
 505     its technical metadata, but that video service does not embed a useful
 506     title, description etc.
 507
 508
 509     Subclasses of this should also be added to the list of extractors and
 510     should define _VALID_URL as a regexp or a Sequence of regexps, and
 511     re-define the _real_extract() and (optionally) _real_initialize() methods.
 512
 513     Subclasses may also override suitable() if necessary, but ensure the function
 514     signature is preserved and that this function imports everything it needs
 515     (except other extractors), so that lazy_extractors works correctly.
 516
 517     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 518     the HTML of Generic webpages. It may also override _extract_embed_urls
 519     or _extract_from_webpage as necessary. While these are normally classmethods,
 520     _extract_from_webpage is allowed to be an instance method.
 521
 522     _extract_from_webpage may raise self.StopExtraction() to stop further
 523     processing of the webpage and obtain exclusive rights to it. This is useful
 524     when the extractor cannot reliably be matched using just the URL,
 525     e.g. invidious/peertube instances
 526
 527     Embed-only extractors can be defined by setting _VALID_URL = False.
 528
 529     To support username + password (or netrc) login, the extractor must define a
 530     _NETRC_MACHINE and re-define _perform_login(username, password) and
 531     (optionally) _initialize_pre_login() methods. The _perform_login method will
 532     be called between _initialize_pre_login and _real_initialize if credentials
 533     are passed by the user. In cases where it is necessary to have the login
 534     process as part of the extraction rather than initialization, _perform_login
 535     can be left undefined.
 536
 537     _GEO_BYPASS attribute may be set to False in order to disable
 538     geo restriction bypass mechanisms for a particular extractor.
 539     Though it won't disable explicit geo restriction bypass based on
 540     country code provided with geo_bypass_country.
 541
 542     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 543     countries for this extractor. One of these countries will be used by
 544     geo restriction bypass mechanism right away in order to bypass
 545     geo restriction, of course, if the mechanism is not disabled.
 546
 547     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 548     IP blocks in CIDR notation for this extractor. One of these IP blocks
 549     will be used by geo restriction bypass mechanism similarly
 550     to _GEO_COUNTRIES.
 551
 552     The _ENABLED attribute should be set to False for IEs that
 553     are disabled by default and must be explicitly enabled.
 554
 555     The _WORKING attribute should be set to False for broken IEs
 556     in order to warn the users and skip the tests.
 557     """
 558
 559     _ready = False
 560     _downloader = None
 561     _x_forwarded_for_ip = None
 562     _GEO_BYPASS = True
 563     _GEO_COUNTRIES = None
 564     _GEO_IP_BLOCKS = None
 565     _WORKING = True
 566     _ENABLED = True
 567     _NETRC_MACHINE = None
 568     IE_DESC = None
 569     SEARCH_KEY = None
 570     _VALID_URL = None
 571     _EMBED_REGEX = []
 572
 573     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 574         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 575         return {
 576             None: '',
 577             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 578             'password': f'Use {password_hint}',
 579             'cookies': (
 580                 'Use --cookies-from-browser or --cookies for the authentication. '
 581                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 582         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 583
 584     def __init__(self, downloader=None):
 585         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 586         If a downloader is not passed during initialization,
 587         it must be set using "set_downloader()" before "extract()" is called"""
 588         self._ready = False
 589         self._x_forwarded_for_ip = None
 590         self._printed_messages = set()
 591         self.set_downloader(downloader)
 592
 593     @classmethod
 594     def _match_valid_url(cls, url):
 595         if cls._VALID_URL is False:
 596             return None
 597         # This does not use has/getattr intentionally - we want to know whether
 598         # we have cached the regexp for *this* class, whereas getattr would also
 599         # match the superclass
 600         if '_VALID_URL_RE' not in cls.__dict__:
 601             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 602         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 603
 604     @classmethod
 605     def suitable(cls, url):
 606         """Receives a URL and returns True if suitable for this IE."""
 607         # This function must import everything it needs (except other extractors),
 608         # so that lazy_extractors works correctly
 609         return cls._match_valid_url(url) is not None
 610
 611     @classmethod
 612     def _match_id(cls, url):
 613         return cls._match_valid_url(url).group('id')
 614
 615     @classmethod
 616     def get_temp_id(cls, url):
 617         try:
 618             return cls._match_id(url)
 619         except (IndexError, AttributeError):
 620             return None
 621
 622     @classmethod
 623     def working(cls):
 624         """Getter method for _WORKING."""
 625         return cls._WORKING
 626
 627     @classmethod
 628     def supports_login(cls):
 629         return bool(cls._NETRC_MACHINE)
 630
 631     def initialize(self):
 632         """Initializes an instance (authentication, etc)."""
 633         self._printed_messages = set()
 634         self._initialize_geo_bypass({
 635             'countries': self._GEO_COUNTRIES,
 636             'ip_blocks': self._GEO_IP_BLOCKS,
 637         })
 638         if not self._ready:
 639             self._initialize_pre_login()
 640             if self.supports_login():
 641                 username, password = self._get_login_info()
 642                 if username:
 643                     self._perform_login(username, password)
 644             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 645                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 646             self._real_initialize()
 647             self._ready = True
 648
 649     def _initialize_geo_bypass(self, geo_bypass_context):
 650         """
 651         Initialize geo restriction bypass mechanism.
 652
 653         This method is used to initialize geo bypass mechanism based on faking
 654         X-Forwarded-For HTTP header. A random country from provided country list
 655         is selected and a random IP belonging to this country is generated. This
 656         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 657         HTTP requests.
 658
 659         This method will be used for initial geo bypass mechanism initialization
 660         during the instance initialization with _GEO_COUNTRIES and
 661         _GEO_IP_BLOCKS.
 662
 663         You may also manually call it from extractor's code if geo bypass
 664         information is not available beforehand (e.g. obtained during
 665         extraction) or due to some other reason. In this case you should pass
 666         this information in geo bypass context passed as first argument. It may
 667         contain following fields:
 668
 669         countries:  List of geo unrestricted countries (similar
 670                     to _GEO_COUNTRIES)
 671         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 672                     (similar to _GEO_IP_BLOCKS)
 673
 674         """
 675         if not self._x_forwarded_for_ip:
 676
 677             # Geo bypass mechanism is explicitly disabled by user
 678             if not self.get_param('geo_bypass', True):
 679                 return
 680
 681             if not geo_bypass_context:
 682                 geo_bypass_context = {}
 683
 684             # Backward compatibility: previously _initialize_geo_bypass
 685             # expected a list of countries, some 3rd party code may still use
 686             # it this way
 687             if isinstance(geo_bypass_context, (list, tuple)):
 688                 geo_bypass_context = {
 689                     'countries': geo_bypass_context,
 690                 }
 691
 692             # The whole point of geo bypass mechanism is to fake IP
 693             # as X-Forwarded-For HTTP header based on some IP block or
 694             # country code.
 695
 696             # Path 1: bypassing based on IP block in CIDR notation
 697
 698             # Explicit IP block specified by user, use it right away
 699             # regardless of whether extractor is geo bypassable or not
 700             ip_block = self.get_param('geo_bypass_ip_block', None)
 701
 702             # Otherwise use random IP block from geo bypass context but only
 703             # if extractor is known as geo bypassable
 704             if not ip_block:
 705                 ip_blocks = geo_bypass_context.get('ip_blocks')
 706                 if self._GEO_BYPASS and ip_blocks:
 707                     ip_block = random.choice(ip_blocks)
 708
 709             if ip_block:
 710                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 711                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 712                 return
 713
 714             # Path 2: bypassing based on country code
 715
 716             # Explicit country code specified by user, use it right away
 717             # regardless of whether extractor is geo bypassable or not
 718             country = self.get_param('geo_bypass_country', None)
 719
 720             # Otherwise use random country code from geo bypass context but
 721             # only if extractor is known as geo bypassable
 722             if not country:
 723                 countries = geo_bypass_context.get('countries')
 724                 if self._GEO_BYPASS and countries:
 725                     country = random.choice(countries)
 726
 727             if country:
 728                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 729                 self._downloader.write_debug(
 730                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 731
 732     def extract(self, url):
 733         """Extracts URL information and returns it in list of dicts."""
 734         try:
 735             for _ in range(2):
 736                 try:
 737                     self.initialize()
 738                     self.to_screen('Extracting URL: %s' % (
 739                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 740                     ie_result = self._real_extract(url)
 741                     if ie_result is None:
 742                         return None
 743                     if self._x_forwarded_for_ip:
 744                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 745                     subtitles = ie_result.get('subtitles') or {}
 746                     if 'no-live-chat' in self.get_param('compat_opts'):
 747                         for lang in ('live_chat', 'comments', 'danmaku'):
 748                             subtitles.pop(lang, None)
 749                     return ie_result
 750                 except GeoRestrictedError as e:
 751                     if self.__maybe_fake_ip_and_retry(e.countries):
 752                         continue
 753                     raise
 754         except UnsupportedError:
 755             raise
 756         except ExtractorError as e:
 757             e.video_id = e.video_id or self.get_temp_id(url)
 758             e.ie = e.ie or self.IE_NAME
 759             e.traceback = e.traceback or sys.exc_info()[2]
 760             raise
 761         except IncompleteRead as e:
 762             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 763         except (KeyError, StopIteration) as e:
 764             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 765
 766     def __maybe_fake_ip_and_retry(self, countries):
 767         if (not self.get_param('geo_bypass_country', None)
 768                 and self._GEO_BYPASS
 769                 and self.get_param('geo_bypass', True)
 770                 and not self._x_forwarded_for_ip
 771                 and countries):
 772             country_code = random.choice(countries)
 773             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 774             if self._x_forwarded_for_ip:
 775                 self.report_warning(
 776                     'Video is geo restricted. Retrying extraction with fake IP '
 777                     f'{self._x_forwarded_for_ip} ({country_code.upper()}) as X-Forwarded-For.')
 778                 return True
 779         return False
 780
 781     def set_downloader(self, downloader):
 782         """Sets a YoutubeDL instance as the downloader for this IE."""
 783         self._downloader = downloader
 784
 785     @property
 786     def cache(self):
 787         return self._downloader.cache
 788
 789     @property
 790     def cookiejar(self):
 791         return self._downloader.cookiejar
 792
 793     def _initialize_pre_login(self):
 794         """ Initialization before login. Redefine in subclasses."""
 795         pass
 796
 797     def _perform_login(self, username, password):
 798         """ Login with username and password. Redefine in subclasses."""
 799         pass
 800
 801     def _real_initialize(self):
 802         """Real initialization process. Redefine in subclasses."""
 803         pass
 804
 805     def _real_extract(self, url):
 806         """Real extraction process. Redefine in subclasses."""
 807         raise NotImplementedError('This method must be implemented by subclasses')
 808
 809     @classmethod
 810     def ie_key(cls):
 811         """A string for getting the InfoExtractor with get_info_extractor"""
 812         return cls.__name__[:-2]
 813
 814     @classproperty
 815     def IE_NAME(cls):
 816         return cls.__name__[:-2]
 817
 818     @staticmethod
 819     def __can_accept_status_code(err, expected_status):
 820         assert isinstance(err, HTTPError)
 821         if expected_status is None:
 822             return False
 823         elif callable(expected_status):
 824             return expected_status(err.status) is True
 825         else:
 826             return err.status in variadic(expected_status)
 827
 828     def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
 829         if isinstance(url_or_request, urllib.request.Request):
 830             self._downloader.deprecation_warning(
 831                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 832                 'Use yt_dlp.networking.common.Request instead.')
 833             url_or_request = urllib_req_to_req(url_or_request)
 834         elif not isinstance(url_or_request, Request):
 835             url_or_request = Request(url_or_request)
 836
 837         url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
 838         return url_or_request
 839
 840     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
 841                          headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
 842         """
 843         Return the response handle.
 844
 845         See _download_webpage docstring for arguments specification.
 846         """
 847         if not self._downloader._first_webpage_request:
 848             sleep_interval = self.get_param('sleep_interval_requests') or 0
 849             if sleep_interval > 0:
 850                 self.to_screen(f'Sleeping {sleep_interval} seconds ...')
 851                 time.sleep(sleep_interval)
 852         else:
 853             self._downloader._first_webpage_request = False
 854
 855         if note is None:
 856             self.report_download_webpage(video_id)
 857         elif note is not False:
 858             if video_id is None:
 859                 self.to_screen(str(note))
 860             else:
 861                 self.to_screen(f'{video_id}: {note}')
 862
 863         # Some sites check X-Forwarded-For HTTP header in order to figure out
 864         # the origin of the client behind proxy. This allows bypassing geo
 865         # restriction by faking this header's value to IP that belongs to some
 866         # geo unrestricted country. We will do so once we encounter any
 867         # geo restriction error.
 868         if self._x_forwarded_for_ip:
 869             headers = (headers or {}).copy()
 870             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 871
 872         extensions = {}
 873
 874         if impersonate in (True, ''):
 875             impersonate = ImpersonateTarget()
 876         requested_targets = [
 877             t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
 878             for t in variadic(impersonate)
 879         ] if impersonate else []
 880
 881         available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
 882         if available_target:
 883             extensions['impersonate'] = available_target
 884         elif requested_targets:
 885             message = 'The extractor is attempting impersonation, but '
 886             message += (
 887                 'no impersonate target is available' if not str(impersonate)
 888                 else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
 889             info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
 890                         'for information on installing the required dependencies')
 891             if require_impersonation:
 892                 raise ExtractorError(f'{message}; {info_msg}', expected=True)
 893             self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
 894
 895         try:
 896             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
 897         except network_exceptions as err:
 898             if isinstance(err, HTTPError):
 899                 if self.__can_accept_status_code(err, expected_status):
 900                     return err.response
 901
 902             if errnote is False:
 903                 return False
 904             if errnote is None:
 905                 errnote = 'Unable to download webpage'
 906
 907             errmsg = f'{errnote}: {err}'
 908             if fatal:
 909                 raise ExtractorError(errmsg, cause=err)
 910             else:
 911                 self.report_warning(errmsg)
 912                 return False
 913
 914     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 915                                  encoding=None, data=None, headers={}, query={}, expected_status=None,
 916                                  impersonate=None, require_impersonation=False):
 917         """
 918         Return a tuple (page content as string, URL handle).
 919
 920         Arguments:
 921         url_or_request -- plain text URL as a string or
 922             a yt_dlp.networking.Request object
 923         video_id -- Video/playlist/item identifier (string)
 924
 925         Keyword arguments:
 926         note -- note printed before downloading (string)
 927         errnote -- note printed in case of an error (string)
 928         fatal -- flag denoting whether error should be considered fatal,
 929             i.e. whether it should cause ExtractionError to be raised,
 930             otherwise a warning will be reported and extraction continued
 931         encoding -- encoding for a page content decoding, guessed automatically
 932             when not explicitly specified
 933         data -- POST data (bytes)
 934         headers -- HTTP headers (dict)
 935         query -- URL query (dict)
 936         expected_status -- allows to accept failed HTTP requests (non 2xx
 937             status code) by explicitly specifying a set of accepted status
 938             codes. Can be any of the following entities:
 939                 - an integer type specifying an exact failed status code to
 940                   accept
 941                 - a list or a tuple of integer types specifying a list of
 942                   failed status codes to accept
 943                 - a callable accepting an actual failed status code and
 944                   returning True if it should be accepted
 945             Note that this argument does not affect success status codes (2xx)
 946             which are always accepted.
 947         impersonate -- the impersonate target. Can be any of the following entities:
 948                 - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
 949                 - a string in the format of CLIENT[:OS]
 950                 - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
 951                 - a boolean value; True means any impersonate target is sufficient
 952         require_impersonation -- flag to toggle whether the request should raise an error
 953             if impersonation is not possible (bool, default: False)
 954         """
 955
 956         # Strip hashes from the URL (#1038)
 957         if isinstance(url_or_request, str):
 958             url_or_request = url_or_request.partition('#')[0]
 959
 960         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
 961                                      headers=headers, query=query, expected_status=expected_status,
 962                                      impersonate=impersonate, require_impersonation=require_impersonation)
 963         if urlh is False:
 964             assert not fatal
 965             return False
 966         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
 967                                              encoding=encoding, data=data)
 968         return (content, urlh)
 969
 970     @staticmethod
 971     def _guess_encoding_from_content(content_type, webpage_bytes):
 972         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 973         if m:
 974             encoding = m.group(1)
 975         else:
 976             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 977                           webpage_bytes[:1024])
 978             if m:
 979                 encoding = m.group(1).decode('ascii')
 980             elif webpage_bytes.startswith(b'\xff\xfe'):
 981                 encoding = 'utf-16'
 982             else:
 983                 encoding = 'utf-8'
 984
 985         return encoding
 986
 987     def __check_blocked(self, content):
 988         first_block = content[:512]
 989         if ('<title>Access to this site is blocked</title>' in content
 990                 and 'Websense' in first_block):
 991             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 992             blocked_iframe = self._html_search_regex(
 993                 r'<iframe src="([^"]+)"', content,
 994                 'Websense information URL', default=None)
 995             if blocked_iframe:
 996                 msg += f' Visit {blocked_iframe} for more details'
 997             raise ExtractorError(msg, expected=True)
 998         if '<title>The URL you requested has been blocked</title>' in first_block:
 999             msg = (
1000                 'Access to this webpage has been blocked by Indian censorship. '
1001                 'Use a VPN or proxy server (with --proxy) to route around it.')
1002             block_msg = self._html_search_regex(
1003                 r'</h1><p>(.*?)</p>',
1004                 content, 'block message', default=None)
1005             if block_msg:
1006                 msg += ' (Message: "{}")'.format(block_msg.replace('\n', ' '))
1007             raise ExtractorError(msg, expected=True)
1008         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
1009                 and 'blocklist.rkn.gov.ru' in content):
1010             raise ExtractorError(
1011                 'Access to this webpage has been blocked by decision of the Russian government. '
1012                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
1013                 expected=True)
1014
1015     def _request_dump_filename(self, url, video_id, data=None):
1016         if data is not None:
1017             data = hashlib.md5(data).hexdigest()
1018         basen = join_nonempty(video_id, data, url, delim='_')
1019         trim_length = self.get_param('trim_file_name') or 240
1020         if len(basen) > trim_length:
1021             h = '___' + hashlib.md5(basen.encode()).hexdigest()
1022             basen = basen[:trim_length - len(h)] + h
1023         filename = sanitize_filename(f'{basen}.dump', restricted=True)
1024         # Working around MAX_PATH limitation on Windows (see
1025         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
1026         if compat_os_name == 'nt':
1027             absfilepath = os.path.abspath(filename)
1028             if len(absfilepath) > 259:
1029                 filename = fR'\\?\{absfilepath}'
1030         return filename
1031
1032     def __decode_webpage(self, webpage_bytes, encoding, headers):
1033         if not encoding:
1034             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
1035         try:
1036             return webpage_bytes.decode(encoding, 'replace')
1037         except LookupError:
1038             return webpage_bytes.decode('utf-8', 'replace')
1039
1040     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
1041                               prefix=None, encoding=None, data=None):
1042         webpage_bytes = urlh.read()
1043         if prefix is not None:
1044             webpage_bytes = prefix + webpage_bytes
1045         if self.get_param('dump_intermediate_pages', False):
1046             self.to_screen('Dumping request to ' + urlh.url)
1047             dump = base64.b64encode(webpage_bytes).decode('ascii')
1048             self._downloader.to_screen(dump)
1049         if self.get_param('write_pages'):
1050             if isinstance(url_or_request, Request):
1051                 data = self._create_request(url_or_request, data).data
1052             filename = self._request_dump_filename(urlh.url, video_id, data)
1053             self.to_screen(f'Saving request to {filename}')
1054             with open(filename, 'wb') as outf:
1055                 outf.write(webpage_bytes)
1056
1057         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
1058         self.__check_blocked(content)
1059
1060         return content
1061
1062     def __print_error(self, errnote, fatal, video_id, err):
1063         if fatal:
1064             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1065         elif errnote:
1066             self.report_warning(f'{video_id}: {errnote}: {err}')
1067
1068     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1069         if transform_source:
1070             xml_string = transform_source(xml_string)
1071         try:
1072             return compat_etree_fromstring(xml_string.encode())
1073         except xml.etree.ElementTree.ParseError as ve:
1074             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1075
1076     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1077         try:
1078             return json.loads(
1079                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1080         except ValueError as ve:
1081             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1082
1083     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1084         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1085
1086     def __create_download_methods(name, parser, note, errnote, return_value):
1087
1088         def parse(ie, content, *args, errnote=errnote, **kwargs):
1089             if parser is None:
1090                 return content
1091             if errnote is False:
1092                 kwargs['errnote'] = errnote
1093             # parser is fetched by name so subclasses can override it
1094             return getattr(ie, parser)(content, *args, **kwargs)
1095
1096         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1097                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
1098                             impersonate=None, require_impersonation=False):
1099             res = self._download_webpage_handle(
1100                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1101                 data=data, headers=headers, query=query, expected_status=expected_status,
1102                 impersonate=impersonate, require_impersonation=require_impersonation)
1103             if res is False:
1104                 return res
1105             content, urlh = res
1106             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1107
1108         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1109                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
1110                              impersonate=None, require_impersonation=False):
1111             if self.get_param('load_pages'):
1112                 url_or_request = self._create_request(url_or_request, data, headers, query)
1113                 filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
1114                 self.to_screen(f'Loading request from {filename}')
1115                 try:
1116                     with open(filename, 'rb') as dumpf:
1117                         webpage_bytes = dumpf.read()
1118                 except OSError as e:
1119                     self.report_warning(f'Unable to load request from disk: {e}')
1120                 else:
1121                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1122                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1123             kwargs = {
1124                 'note': note,
1125                 'errnote': errnote,
1126                 'transform_source': transform_source,
1127                 'fatal': fatal,
1128                 'encoding': encoding,
1129                 'data': data,
1130                 'headers': headers,
1131                 'query': query,
1132                 'expected_status': expected_status,
1133                 'impersonate': impersonate,
1134                 'require_impersonation': require_impersonation,
1135             }
1136             if parser is None:
1137                 kwargs.pop('transform_source')
1138             # The method is fetched by name so subclasses can override _download_..._handle
1139             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1140             return res if res is False else res[0]
1141
1142         def impersonate(func, name, return_value):
1143             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1144             func.__doc__ = f'''
1145                 @param transform_source     Apply this transformation before parsing
1146                 @returns                    {return_value}
1147
1148                 See _download_webpage_handle docstring for other arguments specification
1149             '''
1150
1151         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1152         impersonate(download_content, f'_download_{name}', f'{return_value}')
1153         return download_handle, download_content
1154
1155     _download_xml_handle, _download_xml = __create_download_methods(
1156         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1157     _download_json_handle, _download_json = __create_download_methods(
1158         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1159     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1160         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1161     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1162
1163     def _download_webpage(
1164             self, url_or_request, video_id, note=None, errnote=None,
1165             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1166         """
1167         Return the data of the page as a string.
1168
1169         Keyword arguments:
1170         tries -- number of tries
1171         timeout -- sleep interval between tries
1172
1173         See _download_webpage_handle docstring for other arguments specification.
1174         """
1175
1176         R''' # NB: These are unused; should they be deprecated?
1177         if tries != 1:
1178             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1179         if timeout is NO_DEFAULT:
1180             timeout = 5
1181         else:
1182             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1183         '''
1184
1185         try_count = 0
1186         while True:
1187             try:
1188                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1189             except IncompleteRead as e:
1190                 try_count += 1
1191                 if try_count >= tries:
1192                     raise e
1193                 self._sleep(timeout, video_id)
1194
1195     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1196         idstr = format_field(video_id, None, '%s: ')
1197         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1198         if only_once:
1199             if f'WARNING: {msg}' in self._printed_messages:
1200                 return
1201             self._printed_messages.add(f'WARNING: {msg}')
1202         self._downloader.report_warning(msg, *args, **kwargs)
1203
1204     def to_screen(self, msg, *args, **kwargs):
1205         """Print msg to screen, prefixing it with '[ie_name]'"""
1206         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1207
1208     def write_debug(self, msg, *args, **kwargs):
1209         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1210
1211     def get_param(self, name, default=None, *args, **kwargs):
1212         if self._downloader:
1213             return self._downloader.params.get(name, default, *args, **kwargs)
1214         return default
1215
1216     def report_drm(self, video_id, partial=NO_DEFAULT):
1217         if partial is not NO_DEFAULT:
1218             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1219         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1220
1221     def report_extraction(self, id_or_name):
1222         """Report information extraction."""
1223         self.to_screen(f'{id_or_name}: Extracting information')
1224
1225     def report_download_webpage(self, video_id):
1226         """Report webpage download."""
1227         self.to_screen(f'{video_id}: Downloading webpage')
1228
1229     def report_age_confirmation(self):
1230         """Report attempt to confirm age."""
1231         self.to_screen('Confirming age')
1232
1233     def report_login(self):
1234         """Report attempt to log in."""
1235         self.to_screen('Logging in')
1236
1237     def raise_login_required(
1238             self, msg='This video is only available for registered users',
1239             metadata_available=False, method=NO_DEFAULT):
1240         if metadata_available and (
1241                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1242             self.report_warning(msg)
1243             return
1244         msg += format_field(self._login_hint(method), None, '. %s')
1245         raise ExtractorError(msg, expected=True)
1246
1247     def raise_geo_restricted(
1248             self, msg='This video is not available from your location due to geo restriction',
1249             countries=None, metadata_available=False):
1250         if metadata_available and (
1251                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1252             self.report_warning(msg)
1253         else:
1254             raise GeoRestrictedError(msg, countries=countries)
1255
1256     def raise_no_formats(self, msg, expected=False, video_id=None):
1257         if expected and (
1258                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1259             self.report_warning(msg, video_id)
1260         elif isinstance(msg, ExtractorError):
1261             raise msg
1262         else:
1263             raise ExtractorError(msg, expected=expected, video_id=video_id)
1264
1265     # Methods for following #608
1266     @staticmethod
1267     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1268         """Returns a URL that points to a page that should be processed"""
1269         if ie is not None:
1270             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1271         if video_id is not None:
1272             kwargs['id'] = video_id
1273         if video_title is not None:
1274             kwargs['title'] = video_title
1275         return {
1276             **kwargs,
1277             '_type': 'url_transparent' if url_transparent else 'url',
1278             'url': url,
1279         }
1280
1281     @classmethod
1282     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1283                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1284         return cls.playlist_result(
1285             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1286             playlist_id, playlist_title, **kwargs)
1287
1288     @staticmethod
1289     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1290         """Returns a playlist"""
1291         if playlist_id:
1292             kwargs['id'] = playlist_id
1293         if playlist_title:
1294             kwargs['title'] = playlist_title
1295         if playlist_description is not None:
1296             kwargs['description'] = playlist_description
1297         return {
1298             **kwargs,
1299             '_type': 'multi_video' if multi_video else 'playlist',
1300             'entries': entries,
1301         }
1302
1303     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1304         """
1305         Perform a regex search on the given string, using a single or a list of
1306         patterns returning the first matching group.
1307         In case of failure return a default value or raise a WARNING or a
1308         RegexNotFoundError, depending on fatal, specifying the field name.
1309         """
1310         if string is None:
1311             mobj = None
1312         elif isinstance(pattern, (str, re.Pattern)):
1313             mobj = re.search(pattern, string, flags)
1314         else:
1315             for p in pattern:
1316                 mobj = re.search(p, string, flags)
1317                 if mobj:
1318                     break
1319
1320         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1321
1322         if mobj:
1323             if group is None:
1324                 # return the first matching group
1325                 return next(g for g in mobj.groups() if g is not None)
1326             elif isinstance(group, (list, tuple)):
1327                 return tuple(mobj.group(g) for g in group)
1328             else:
1329                 return mobj.group(group)
1330         elif default is not NO_DEFAULT:
1331             return default
1332         elif fatal:
1333             raise RegexNotFoundError(f'Unable to extract {_name}')
1334         else:
1335             self.report_warning(f'unable to extract {_name}' + bug_reports_message())
1336             return None
1337
1338     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1339                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1340         """Searches string for the JSON object specified by start_pattern"""
1341         # NB: end_pattern is only used to reduce the size of the initial match
1342         if default is NO_DEFAULT:
1343             default, has_default = {}, False
1344         else:
1345             fatal, has_default = False, True
1346
1347         json_string = self._search_regex(
1348             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1349             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1350         if not json_string:
1351             return default
1352
1353         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1354         try:
1355             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1356         except ExtractorError as e:
1357             if fatal:
1358                 raise ExtractorError(
1359                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1360             elif not has_default:
1361                 self.report_warning(
1362                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1363         return default
1364
1365     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1366         """
1367         Like _search_regex, but strips HTML tags and unescapes entities.
1368         """
1369         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1370         if isinstance(res, tuple):
1371             return tuple(map(clean_html, res))
1372         return clean_html(res)
1373
1374     def _get_netrc_login_info(self, netrc_machine=None):
1375         netrc_machine = netrc_machine or self._NETRC_MACHINE
1376
1377         cmd = self.get_param('netrc_cmd')
1378         if cmd:
1379             cmd = cmd.replace('{}', netrc_machine)
1380             self.to_screen(f'Executing command: {cmd}')
1381             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1382             if ret != 0:
1383                 raise OSError(f'Command returned error code {ret}')
1384             info = netrc_from_content(stdout).authenticators(netrc_machine)
1385
1386         elif self.get_param('usenetrc', False):
1387             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1388             if os.path.isdir(netrc_file):
1389                 netrc_file = os.path.join(netrc_file, '.netrc')
1390             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1391
1392         else:
1393             return None, None
1394         if not info:
1395             self.to_screen(f'No authenticators for {netrc_machine}')
1396             return None, None
1397
1398         self.write_debug(f'Using netrc for {netrc_machine} authentication')
1399         return info[0], info[2]
1400
1401     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1402         """
1403         Get the login info as (username, password)
1404         First look for the manually specified credentials using username_option
1405         and password_option as keys in params dictionary. If no such credentials
1406         are available try the netrc_cmd if it is defined or look in the
1407         netrc file using the netrc_machine or _NETRC_MACHINE value.
1408         If there's no info available, return (None, None)
1409         """
1410
1411         username = self.get_param(username_option)
1412         if username is not None:
1413             password = self.get_param(password_option)
1414         else:
1415             try:
1416                 username, password = self._get_netrc_login_info(netrc_machine)
1417             except (OSError, netrc.NetrcParseError) as err:
1418                 self.report_warning(f'Failed to parse .netrc: {err}')
1419                 return None, None
1420         return username, password
1421
1422     def _get_tfa_info(self, note='two-factor verification code'):
1423         """
1424         Get the two-factor authentication info
1425         TODO - asking the user will be required for sms/phone verify
1426         currently just uses the command line option
1427         If there's no info available, return None
1428         """
1429
1430         tfa = self.get_param('twofactor')
1431         if tfa is not None:
1432             return tfa
1433
1434         return getpass.getpass(f'Type {note} and press [Return]: ')
1435
1436     # Helper functions for extracting OpenGraph info
1437     @staticmethod
1438     def _og_regexes(prop):
1439         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1440         property_re = r'(?:name|property)=(?:\'og{sep}{prop}\'|"og{sep}{prop}"|\s*og{sep}{prop}\b)'.format(
1441             prop=re.escape(prop), sep='(?:&#x3A;|[:-])')
1442         template = r'<meta[^>]+?%s[^>]+?%s'
1443         return [
1444             template % (property_re, content_re),
1445             template % (content_re, property_re),
1446         ]
1447
1448     @staticmethod
1449     def _meta_regex(prop):
1450         return rf'''(?isx)<meta
1451                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?){re.escape(prop)}\1)
1452                     [^>]+?content=(["\'])(?P<content>.*?)\2'''
1453
1454     def _og_search_property(self, prop, html, name=None, **kargs):
1455         prop = variadic(prop)
1456         if name is None:
1457             name = f'OpenGraph {prop[0]}'
1458         og_regexes = []
1459         for p in prop:
1460             og_regexes.extend(self._og_regexes(p))
1461         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1462         if escaped is None:
1463             return None
1464         return unescapeHTML(escaped)
1465
1466     def _og_search_thumbnail(self, html, **kargs):
1467         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1468
1469     def _og_search_description(self, html, **kargs):
1470         return self._og_search_property('description', html, fatal=False, **kargs)
1471
1472     def _og_search_title(self, html, *, fatal=False, **kargs):
1473         return self._og_search_property('title', html, fatal=fatal, **kargs)
1474
1475     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1476         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1477         if secure:
1478             regexes = self._og_regexes('video:secure_url') + regexes
1479         return self._html_search_regex(regexes, html, name, **kargs)
1480
1481     def _og_search_url(self, html, **kargs):
1482         return self._og_search_property('url', html, **kargs)
1483
1484     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1485         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1486
1487     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1488         name = variadic(name)
1489         if display_name is None:
1490             display_name = name[0]
1491         return self._html_search_regex(
1492             [self._meta_regex(n) for n in name],
1493             html, display_name, fatal=fatal, group='content', **kwargs)
1494
1495     def _dc_search_uploader(self, html):
1496         return self._html_search_meta('dc.creator', html, 'uploader')
1497
1498     @staticmethod
1499     def _rta_search(html):
1500         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1501         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1502                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1503                      html):
1504             return 18
1505
1506         # And then there are the jokers who advertise that they use RTA, but actually don't.
1507         AGE_LIMIT_MARKERS = [
1508             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1509             r'>[^<]*you acknowledge you are at least (\d+) years old',
1510             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1511         ]
1512
1513         age_limit = 0
1514         for marker in AGE_LIMIT_MARKERS:
1515             mobj = re.search(marker, html)
1516             if mobj:
1517                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1518         return age_limit
1519
1520     def _media_rating_search(self, html):
1521         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1522         rating = self._html_search_meta('rating', html)
1523
1524         if not rating:
1525             return None
1526
1527         RATING_TABLE = {
1528             'safe for kids': 0,
1529             'general': 8,
1530             '14 years': 14,
1531             'mature': 17,
1532             'restricted': 19,
1533         }
1534         return RATING_TABLE.get(rating.lower())
1535
1536     def _family_friendly_search(self, html):
1537         # See http://schema.org/VideoObject
1538         family_friendly = self._html_search_meta(
1539             'isFamilyFriendly', html, default=None)
1540
1541         if not family_friendly:
1542             return None
1543
1544         RATING_TABLE = {
1545             '1': 0,
1546             'true': 0,
1547             '0': 18,
1548             'false': 18,
1549         }
1550         return RATING_TABLE.get(family_friendly.lower())
1551
1552     def _twitter_search_player(self, html):
1553         return self._html_search_meta('twitter:player', html,
1554                                       'twitter card player')
1555
1556     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1557         """Yield all json ld objects in the html"""
1558         if default is not NO_DEFAULT:
1559             fatal = False
1560         for mobj in re.finditer(JSON_LD_RE, html):
1561             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1562             for json_ld in variadic(json_ld_item):
1563                 if isinstance(json_ld, dict):
1564                     yield json_ld
1565
1566     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1567         """Search for a video in any json ld in the html"""
1568         if default is not NO_DEFAULT:
1569             fatal = False
1570         info = self._json_ld(
1571             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1572             video_id, fatal=fatal, expected_type=expected_type)
1573         if info:
1574             return info
1575         if default is not NO_DEFAULT:
1576             return default
1577         elif fatal:
1578             raise RegexNotFoundError('Unable to extract JSON-LD')
1579         else:
1580             self.report_warning(f'unable to extract JSON-LD {bug_reports_message()}')
1581             return {}
1582
1583     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1584         if isinstance(json_ld, str):
1585             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1586         if not json_ld:
1587             return {}
1588         info = {}
1589
1590         INTERACTION_TYPE_MAP = {
1591             'CommentAction': 'comment',
1592             'AgreeAction': 'like',
1593             'DisagreeAction': 'dislike',
1594             'LikeAction': 'like',
1595             'DislikeAction': 'dislike',
1596             'ListenAction': 'view',
1597             'WatchAction': 'view',
1598             'ViewAction': 'view',
1599         }
1600
1601         def is_type(e, *expected_types):
1602             type_ = variadic(traverse_obj(e, '@type'))
1603             return any(x in type_ for x in expected_types)
1604
1605         def extract_interaction_type(e):
1606             interaction_type = e.get('interactionType')
1607             if isinstance(interaction_type, dict):
1608                 interaction_type = interaction_type.get('@type')
1609             return str_or_none(interaction_type)
1610
1611         def extract_interaction_statistic(e):
1612             interaction_statistic = e.get('interactionStatistic')
1613             if isinstance(interaction_statistic, dict):
1614                 interaction_statistic = [interaction_statistic]
1615             if not isinstance(interaction_statistic, list):
1616                 return
1617             for is_e in interaction_statistic:
1618                 if not is_type(is_e, 'InteractionCounter'):
1619                     continue
1620                 interaction_type = extract_interaction_type(is_e)
1621                 if not interaction_type:
1622                     continue
1623                 # For interaction count some sites provide string instead of
1624                 # an integer (as per spec) with non digit characters (e.g. ",")
1625                 # so extracting count with more relaxed str_to_int
1626                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1627                 if interaction_count is None:
1628                     continue
1629                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1630                 if not count_kind:
1631                     continue
1632                 count_key = f'{count_kind}_count'
1633                 if info.get(count_key) is not None:
1634                     continue
1635                 info[count_key] = interaction_count
1636
1637         def extract_chapter_information(e):
1638             chapters = [{
1639                 'title': part.get('name'),
1640                 'start_time': part.get('startOffset'),
1641                 'end_time': part.get('endOffset'),
1642             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1643             for idx, (last_c, current_c, next_c) in enumerate(zip(
1644                     [{'end_time': 0}, *chapters], chapters, chapters[1:])):
1645                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1646                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1647                 if None in current_c.values():
1648                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1649                     return
1650             if chapters:
1651                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1652                 info['chapters'] = chapters
1653
1654         def extract_video_object(e):
1655             author = e.get('author')
1656             info.update({
1657                 'url': url_or_none(e.get('contentUrl')),
1658                 'ext': mimetype2ext(e.get('encodingFormat')),
1659                 'title': unescapeHTML(e.get('name')),
1660                 'description': unescapeHTML(e.get('description')),
1661                 'thumbnails': [{'url': unescapeHTML(url)}
1662                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1663                                if url_or_none(url)],
1664                 'duration': parse_duration(e.get('duration')),
1665                 'timestamp': unified_timestamp(e.get('uploadDate')),
1666                 # author can be an instance of 'Organization' or 'Person' types.
1667                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1668                 # however some websites are using 'Text' type instead.
1669                 # 1. https://schema.org/VideoObject
1670                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1671                 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1672                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1673                 'tbr': int_or_none(e.get('bitrate')),
1674                 'width': int_or_none(e.get('width')),
1675                 'height': int_or_none(e.get('height')),
1676                 'view_count': int_or_none(e.get('interactionCount')),
1677                 'tags': try_call(lambda: e.get('keywords').split(',')),
1678             })
1679             if is_type(e, 'AudioObject'):
1680                 info.update({
1681                     'vcodec': 'none',
1682                     'abr': int_or_none(e.get('bitrate')),
1683                 })
1684             extract_interaction_statistic(e)
1685             extract_chapter_information(e)
1686
1687         def traverse_json_ld(json_ld, at_top_level=True):
1688             for e in variadic(json_ld):
1689                 if not isinstance(e, dict):
1690                     continue
1691                 if at_top_level and '@context' not in e:
1692                     continue
1693                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1694                     traverse_json_ld(e['@graph'], at_top_level=False)
1695                     continue
1696                 if expected_type is not None and not is_type(e, expected_type):
1697                     continue
1698                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1699                 if rating is not None:
1700                     info['average_rating'] = rating
1701                 if is_type(e, 'TVEpisode', 'Episode'):
1702                     episode_name = unescapeHTML(e.get('name'))
1703                     info.update({
1704                         'episode': episode_name,
1705                         'episode_number': int_or_none(e.get('episodeNumber')),
1706                         'description': unescapeHTML(e.get('description')),
1707                     })
1708                     if not info.get('title') and episode_name:
1709                         info['title'] = episode_name
1710                     part_of_season = e.get('partOfSeason')
1711                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1712                         info.update({
1713                             'season': unescapeHTML(part_of_season.get('name')),
1714                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1715                         })
1716                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1717                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1718                         info['series'] = unescapeHTML(part_of_series.get('name'))
1719                 elif is_type(e, 'Movie'):
1720                     info.update({
1721                         'title': unescapeHTML(e.get('name')),
1722                         'description': unescapeHTML(e.get('description')),
1723                         'duration': parse_duration(e.get('duration')),
1724                         'timestamp': unified_timestamp(e.get('dateCreated')),
1725                     })
1726                 elif is_type(e, 'Article', 'NewsArticle'):
1727                     info.update({
1728                         'timestamp': parse_iso8601(e.get('datePublished')),
1729                         'title': unescapeHTML(e.get('headline')),
1730                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1731                     })
1732                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1733                         extract_video_object(e['video'][0])
1734                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1735                         extract_video_object(e['subjectOf'][0])
1736                 elif is_type(e, 'VideoObject', 'AudioObject'):
1737                     extract_video_object(e)
1738                     if expected_type is None:
1739                         continue
1740                     else:
1741                         break
1742                 video = e.get('video')
1743                 if is_type(video, 'VideoObject'):
1744                     extract_video_object(video)
1745                 if expected_type is None:
1746                     continue
1747                 else:
1748                     break
1749
1750         traverse_json_ld(json_ld)
1751         return filter_dict(info)
1752
1753     def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
1754         if default == '{}':
1755             self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
1756             default = {}
1757         if default is not NO_DEFAULT:
1758             fatal = False
1759
1760         return self._search_json(
1761             r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
1762             video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
1763
1764     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1765         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1766         rectx = re.escape(context_name)
1767         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1768         js, arg_keys, arg_vals = self._search_regex(
1769             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1770             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1771             default=NO_DEFAULT if fatal else (None, None, None))
1772         if js is None:
1773             return {}
1774
1775         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1776             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1777
1778         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1779         return traverse_obj(ret, traverse) or {}
1780
1781     @staticmethod
1782     def _hidden_inputs(html):
1783         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1784         hidden_inputs = {}
1785         for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
1786             attrs = extract_attributes(input_el)
1787             if not input_el:
1788                 continue
1789             if attrs.get('type') not in ('hidden', 'submit'):
1790                 continue
1791             name = attrs.get('name') or attrs.get('id')
1792             value = attrs.get('value')
1793             if name and value is not None:
1794                 hidden_inputs[name] = value
1795         return hidden_inputs
1796
1797     def _form_hidden_inputs(self, form_id, html):
1798         form = self._search_regex(
1799             rf'(?is)<form[^>]+?id=(["\']){form_id}\1[^>]*>(?P<form>.+?)</form>',
1800             html, f'{form_id} form', group='form')
1801         return self._hidden_inputs(form)
1802
1803     @classproperty(cache=True)
1804     def FormatSort(cls):
1805         class FormatSort(FormatSorter):
1806             def __init__(ie, *args, **kwargs):
1807                 super().__init__(ie._downloader, *args, **kwargs)
1808
1809         deprecation_warning(
1810             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1811             'Use yt_dlp.utils.FormatSorter instead')
1812         return FormatSort
1813
1814     def _sort_formats(self, formats, field_preference=[]):
1815         if not field_preference:
1816             self._downloader.deprecation_warning(
1817                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1818             return
1819         self._downloader.deprecation_warning(
1820             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1821             'Return _format_sort_fields in the info_dict instead')
1822         if formats:
1823             formats[0]['__sort_fields'] = field_preference
1824
1825     def _check_formats(self, formats, video_id):
1826         if formats:
1827             formats[:] = filter(
1828                 lambda f: self._is_valid_url(
1829                     f['url'], video_id,
1830                     item='{} video format'.format(f.get('format_id')) if f.get('format_id') else 'video'),
1831                 formats)
1832
1833     @staticmethod
1834     def _remove_duplicate_formats(formats):
1835         format_urls = set()
1836         unique_formats = []
1837         for f in formats:
1838             if f['url'] not in format_urls:
1839                 format_urls.add(f['url'])
1840                 unique_formats.append(f)
1841         formats[:] = unique_formats
1842
1843     def _is_valid_url(self, url, video_id, item='video', headers={}):
1844         url = self._proto_relative_url(url, scheme='http:')
1845         # For now assume non HTTP(S) URLs always valid
1846         if not url.startswith(('http://', 'https://')):
1847             return True
1848         try:
1849             self._request_webpage(url, video_id, f'Checking {item} URL', headers=headers)
1850             return True
1851         except ExtractorError as e:
1852             self.to_screen(
1853                 f'{video_id}: {item} URL is invalid, skipping: {e.cause!s}')
1854             return False
1855
1856     def http_scheme(self):
1857         """ Either "http:" or "https:", depending on the user's preferences """
1858         return (
1859             'http:'
1860             if self.get_param('prefer_insecure', False)
1861             else 'https:')
1862
1863     def _proto_relative_url(self, url, scheme=None):
1864         scheme = scheme or self.http_scheme()
1865         assert scheme.endswith(':')
1866         return sanitize_url(url, scheme=scheme[:-1])
1867
1868     def _sleep(self, timeout, video_id, msg_template=None):
1869         if msg_template is None:
1870             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1871         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1872         self.to_screen(msg)
1873         time.sleep(timeout)
1874
1875     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1876                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1877                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1878         if self.get_param('ignore_no_formats_error'):
1879             fatal = False
1880
1881         res = self._download_xml_handle(
1882             manifest_url, video_id, 'Downloading f4m manifest',
1883             'Unable to download f4m manifest',
1884             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1885             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1886             transform_source=transform_source,
1887             fatal=fatal, data=data, headers=headers, query=query)
1888         if res is False:
1889             return []
1890
1891         manifest, urlh = res
1892         manifest_url = urlh.url
1893
1894         return self._parse_f4m_formats(
1895             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1896             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1897
1898     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1899                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1900                            fatal=True, m3u8_id=None):
1901         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1902             return []
1903
1904         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1905         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1906         if akamai_pv is not None and ';' in akamai_pv.text:
1907             player_verification_challenge = akamai_pv.text.split(';')[0]
1908             if player_verification_challenge.strip() != '':
1909                 return []
1910
1911         formats = []
1912         manifest_version = '1.0'
1913         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1914         if not media_nodes:
1915             manifest_version = '2.0'
1916             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1917         # Remove unsupported DRM protected media from final formats
1918         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1919         media_nodes = remove_encrypted_media(media_nodes)
1920         if not media_nodes:
1921             return formats
1922
1923         manifest_base_url = get_base_url(manifest)
1924
1925         bootstrap_info = xpath_element(
1926             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1927             'bootstrap info', default=None)
1928
1929         vcodec = None
1930         mime_type = xpath_text(
1931             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1932             'base URL', default=None)
1933         if mime_type and mime_type.startswith('audio/'):
1934             vcodec = 'none'
1935
1936         for i, media_el in enumerate(media_nodes):
1937             tbr = int_or_none(media_el.attrib.get('bitrate'))
1938             width = int_or_none(media_el.attrib.get('width'))
1939             height = int_or_none(media_el.attrib.get('height'))
1940             format_id = join_nonempty(f4m_id, tbr or i)
1941             # If <bootstrapInfo> is present, the specified f4m is a
1942             # stream-level manifest, and only set-level manifests may refer to
1943             # external resources.  See section 11.4 and section 4 of F4M spec
1944             if bootstrap_info is None:
1945                 media_url = None
1946                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1947                 if manifest_version == '2.0':
1948                     media_url = media_el.attrib.get('href')
1949                 if media_url is None:
1950                     media_url = media_el.attrib.get('url')
1951                 if not media_url:
1952                     continue
1953                 manifest_url = (
1954                     media_url if media_url.startswith(('http://', 'https://'))
1955                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1956                 # If media_url is itself a f4m manifest do the recursive extraction
1957                 # since bitrates in parent manifest (this one) and media_url manifest
1958                 # may differ leading to inability to resolve the format by requested
1959                 # bitrate in f4m downloader
1960                 ext = determine_ext(manifest_url)
1961                 if ext == 'f4m':
1962                     f4m_formats = self._extract_f4m_formats(
1963                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1964                         transform_source=transform_source, fatal=fatal)
1965                     # Sometimes stream-level manifest contains single media entry that
1966                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1967                     # At the same time parent's media entry in set-level manifest may
1968                     # contain it. We will copy it from parent in such cases.
1969                     if len(f4m_formats) == 1:
1970                         f = f4m_formats[0]
1971                         f.update({
1972                             'tbr': f.get('tbr') or tbr,
1973                             'width': f.get('width') or width,
1974                             'height': f.get('height') or height,
1975                             'format_id': f.get('format_id') if not tbr else format_id,
1976                             'vcodec': vcodec,
1977                         })
1978                     formats.extend(f4m_formats)
1979                     continue
1980                 elif ext == 'm3u8':
1981                     formats.extend(self._extract_m3u8_formats(
1982                         manifest_url, video_id, 'mp4', preference=preference,
1983                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1984                     continue
1985             formats.append({
1986                 'format_id': format_id,
1987                 'url': manifest_url,
1988                 'manifest_url': manifest_url,
1989                 'ext': 'flv' if bootstrap_info is not None else None,
1990                 'protocol': 'f4m',
1991                 'tbr': tbr,
1992                 'width': width,
1993                 'height': height,
1994                 'vcodec': vcodec,
1995                 'preference': preference,
1996                 'quality': quality,
1997             })
1998         return formats
1999
2000     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2001         return {
2002             'format_id': join_nonempty(m3u8_id, 'meta'),
2003             'url': m3u8_url,
2004             'ext': ext,
2005             'protocol': 'm3u8',
2006             'preference': preference - 100 if preference else -100,
2007             'quality': quality,
2008             'resolution': 'multiple',
2009             'format_note': 'Quality selection URL',
2010         }
2011
2012     def _report_ignoring_subs(self, name):
2013         self.report_warning(bug_reports_message(
2014             f'Ignoring subtitle tracks found in the {name} manifest; '
2015             'if any subtitle tracks are missing,',
2016         ), only_once=True)
2017
2018     def _extract_m3u8_formats(self, *args, **kwargs):
2019         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2020         if subs:
2021             self._report_ignoring_subs('HLS')
2022         return fmts
2023
2024     def _extract_m3u8_formats_and_subtitles(
2025             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2026             preference=None, quality=None, m3u8_id=None, note=None,
2027             errnote=None, fatal=True, live=False, data=None, headers={},
2028             query={}):
2029
2030         if self.get_param('ignore_no_formats_error'):
2031             fatal = False
2032
2033         if not m3u8_url:
2034             if errnote is not False:
2035                 errnote = errnote or 'Failed to obtain m3u8 URL'
2036                 if fatal:
2037                     raise ExtractorError(errnote, video_id=video_id)
2038                 self.report_warning(f'{errnote}{bug_reports_message()}')
2039             return [], {}
2040
2041         res = self._download_webpage_handle(
2042             m3u8_url, video_id,
2043             note='Downloading m3u8 information' if note is None else note,
2044             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2045             fatal=fatal, data=data, headers=headers, query=query)
2046
2047         if res is False:
2048             return [], {}
2049
2050         m3u8_doc, urlh = res
2051         m3u8_url = urlh.url
2052
2053         return self._parse_m3u8_formats_and_subtitles(
2054             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2055             preference=preference, quality=quality, m3u8_id=m3u8_id,
2056             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2057             headers=headers, query=query, video_id=video_id)
2058
2059     def _parse_m3u8_formats_and_subtitles(
2060             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2061             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2062             errnote=None, fatal=True, data=None, headers={}, query={},
2063             video_id=None):
2064         formats, subtitles = [], {}
2065         has_drm = HlsFD._has_drm(m3u8_doc)
2066
2067         def format_url(url):
2068             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2069
2070         if self.get_param('hls_split_discontinuity', False):
2071             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2072                 if not m3u8_doc:
2073                     if not manifest_url:
2074                         return []
2075                     m3u8_doc = self._download_webpage(
2076                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2077                         note=False, errnote='Failed to download m3u8 playlist information')
2078                     if m3u8_doc is False:
2079                         return []
2080                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2081
2082         else:
2083             def _extract_m3u8_playlist_indices(*args, **kwargs):
2084                 return [None]
2085
2086         # References:
2087         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2088         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2089         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2090
2091         # We should try extracting formats only from master playlists [1, 4.3.4],
2092         # i.e. playlists that describe available qualities. On the other hand
2093         # media playlists [1, 4.3.3] should be returned as is since they contain
2094         # just the media without qualities renditions.
2095         # Fortunately, master playlist can be easily distinguished from media
2096         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2097         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2098         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2099         # media playlist and MUST NOT appear in master playlist thus we can
2100         # clearly detect media playlist with this criterion.
2101
2102         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2103             formats = [{
2104                 'format_id': join_nonempty(m3u8_id, idx),
2105                 'format_index': idx,
2106                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl'),
2107                 'ext': ext,
2108                 'protocol': entry_protocol,
2109                 'preference': preference,
2110                 'quality': quality,
2111                 'has_drm': has_drm,
2112             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2113
2114             return formats, subtitles
2115
2116         groups = {}
2117         last_stream_inf = {}
2118
2119         def extract_media(x_media_line):
2120             media = parse_m3u8_attributes(x_media_line)
2121             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2122             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2123             if not (media_type and group_id and name):
2124                 return
2125             groups.setdefault(group_id, []).append(media)
2126             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2127             if media_type == 'SUBTITLES':
2128                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2129                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2130                 # However, lack of URI has been spotted in the wild.
2131                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2132                 if not media.get('URI'):
2133                     return
2134                 url = format_url(media['URI'])
2135                 sub_info = {
2136                     'url': url,
2137                     'ext': determine_ext(url),
2138                 }
2139                 if sub_info['ext'] == 'm3u8':
2140                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2141                     # files may contain is WebVTT:
2142                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2143                     sub_info['ext'] = 'vtt'
2144                     sub_info['protocol'] = 'm3u8_native'
2145                 lang = media.get('LANGUAGE') or 'und'
2146                 subtitles.setdefault(lang, []).append(sub_info)
2147             if media_type not in ('VIDEO', 'AUDIO'):
2148                 return
2149             media_url = media.get('URI')
2150             if media_url:
2151                 manifest_url = format_url(media_url)
2152                 formats.extend({
2153                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2154                     'format_note': name,
2155                     'format_index': idx,
2156                     'url': manifest_url,
2157                     'manifest_url': m3u8_url,
2158                     'language': media.get('LANGUAGE'),
2159                     'ext': ext,
2160                     'protocol': entry_protocol,
2161                     'preference': preference,
2162                     'quality': quality,
2163                     'has_drm': has_drm,
2164                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2165                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2166
2167         def build_stream_name():
2168             # Despite specification does not mention NAME attribute for
2169             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2170             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2171             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2172             stream_name = last_stream_inf.get('NAME')
2173             if stream_name:
2174                 return stream_name
2175             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2176             # from corresponding rendition group
2177             stream_group_id = last_stream_inf.get('VIDEO')
2178             if not stream_group_id:
2179                 return
2180             stream_group = groups.get(stream_group_id)
2181             if not stream_group:
2182                 return stream_group_id
2183             rendition = stream_group[0]
2184             return rendition.get('NAME') or stream_group_id
2185
2186         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2187         # chance to detect video only formats when EXT-X-STREAM-INF tags
2188         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2189         for line in m3u8_doc.splitlines():
2190             if line.startswith('#EXT-X-MEDIA:'):
2191                 extract_media(line)
2192
2193         for line in m3u8_doc.splitlines():
2194             if line.startswith('#EXT-X-STREAM-INF:'):
2195                 last_stream_inf = parse_m3u8_attributes(line)
2196             elif line.startswith('#') or not line.strip():
2197                 continue
2198             else:
2199                 tbr = float_or_none(
2200                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2201                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2202                 manifest_url = format_url(line.strip())
2203
2204                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2205                     format_id = [m3u8_id, None, idx]
2206                     # Bandwidth of live streams may differ over time thus making
2207                     # format_id unpredictable. So it's better to keep provided
2208                     # format_id intact.
2209                     if not live:
2210                         stream_name = build_stream_name()
2211                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2212                     f = {
2213                         'format_id': join_nonempty(*format_id),
2214                         'format_index': idx,
2215                         'url': manifest_url,
2216                         'manifest_url': m3u8_url,
2217                         'tbr': tbr,
2218                         'ext': ext,
2219                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2220                         'protocol': entry_protocol,
2221                         'preference': preference,
2222                         'quality': quality,
2223                         'has_drm': has_drm,
2224                     }
2225
2226                     # YouTube-specific
2227                     if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'):
2228                         f['language'] = yt_audio_content_id.split('.')[0]
2229
2230                     resolution = last_stream_inf.get('RESOLUTION')
2231                     if resolution:
2232                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2233                         if mobj:
2234                             f['width'] = int(mobj.group('width'))
2235                             f['height'] = int(mobj.group('height'))
2236                     # Unified Streaming Platform
2237                     mobj = re.search(
2238                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2239                     if mobj:
2240                         abr, vbr = mobj.groups()
2241                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2242                         f.update({
2243                             'vbr': vbr,
2244                             'abr': abr,
2245                         })
2246                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2247                     f.update(codecs)
2248                     audio_group_id = last_stream_inf.get('AUDIO')
2249                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2250                     # references a rendition group MUST have a CODECS attribute.
2251                     # However, this is not always respected. E.g. [2]
2252                     # contains EXT-X-STREAM-INF tag which references AUDIO
2253                     # rendition group but does not have CODECS and despite
2254                     # referencing an audio group it represents a complete
2255                     # (with audio and video) format. So, for such cases we will
2256                     # ignore references to rendition groups and treat them
2257                     # as complete formats.
2258                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2259                         audio_group = groups.get(audio_group_id)
2260                         if audio_group and audio_group[0].get('URI'):
2261                             # TODO: update acodec for audio only formats with
2262                             # the same GROUP-ID
2263                             f['acodec'] = 'none'
2264                     if not f.get('ext'):
2265                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2266                     formats.append(f)
2267
2268                     # for DailyMotion
2269                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2270                     if progressive_uri:
2271                         http_f = f.copy()
2272                         del http_f['manifest_url']
2273                         http_f.update({
2274                             'format_id': f['format_id'].replace('hls-', 'http-'),
2275                             'protocol': 'http',
2276                             'url': progressive_uri,
2277                         })
2278                         formats.append(http_f)
2279
2280                 last_stream_inf = {}
2281         return formats, subtitles
2282
2283     def _extract_m3u8_vod_duration(
2284             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2285
2286         m3u8_vod = self._download_webpage(
2287             m3u8_vod_url, video_id,
2288             note='Downloading m3u8 VOD manifest' if note is None else note,
2289             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2290             fatal=False, data=data, headers=headers, query=query)
2291
2292         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2293
2294     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2295         if '#EXT-X-ENDLIST' not in m3u8_vod:
2296             return None
2297
2298         return int(sum(
2299             float(line[len('#EXTINF:'):].split(',')[0])
2300             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2301
2302     def _extract_mpd_vod_duration(
2303             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2304
2305         mpd_doc = self._download_xml(
2306             mpd_url, video_id,
2307             note='Downloading MPD VOD manifest' if note is None else note,
2308             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2309             fatal=False, data=data, headers=headers, query=query)
2310         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2311             return None
2312         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2313
2314     @staticmethod
2315     def _xpath_ns(path, namespace=None):
2316         if not namespace:
2317             return path
2318         out = []
2319         for c in path.split('/'):
2320             if not c or c == '.':
2321                 out.append(c)
2322             else:
2323                 out.append(f'{{{namespace}}}{c}')
2324         return '/'.join(out)
2325
2326     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2327         if self.get_param('ignore_no_formats_error'):
2328             fatal = False
2329
2330         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2331         if res is False:
2332             assert not fatal
2333             return [], {}
2334         smil, urlh = res
2335
2336         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2337                                                       namespace=self._parse_smil_namespace(smil))
2338
2339     def _extract_smil_formats(self, *args, **kwargs):
2340         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2341         if subs:
2342             self._report_ignoring_subs('SMIL')
2343         return fmts
2344
2345     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2346         res = self._download_smil(smil_url, video_id, fatal=fatal)
2347         if res is False:
2348             return {}
2349
2350         smil, urlh = res
2351         smil_url = urlh.url
2352
2353         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2354
2355     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2356         return self._download_xml_handle(
2357             smil_url, video_id, 'Downloading SMIL file',
2358             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2359
2360     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2361         namespace = self._parse_smil_namespace(smil)
2362
2363         formats, subtitles = self._parse_smil_formats_and_subtitles(
2364             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2365
2366         video_id = os.path.splitext(url_basename(smil_url))[0]
2367         title = None
2368         description = None
2369         upload_date = None
2370         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2371             name = meta.attrib.get('name')
2372             content = meta.attrib.get('content')
2373             if not name or not content:
2374                 continue
2375             if not title and name == 'title':
2376                 title = content
2377             elif not description and name in ('description', 'abstract'):
2378                 description = content
2379             elif not upload_date and name == 'date':
2380                 upload_date = unified_strdate(content)
2381
2382         thumbnails = [{
2383             'id': image.get('type'),
2384             'url': image.get('src'),
2385             'width': int_or_none(image.get('width')),
2386             'height': int_or_none(image.get('height')),
2387         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2388
2389         return {
2390             'id': video_id,
2391             'title': title or video_id,
2392             'description': description,
2393             'upload_date': upload_date,
2394             'thumbnails': thumbnails,
2395             'formats': formats,
2396             'subtitles': subtitles,
2397         }
2398
2399     def _parse_smil_namespace(self, smil):
2400         return self._search_regex(
2401             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2402
2403     def _parse_smil_formats(self, *args, **kwargs):
2404         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2405         if subs:
2406             self._report_ignoring_subs('SMIL')
2407         return fmts
2408
2409     def _parse_smil_formats_and_subtitles(
2410             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2411         base = smil_url
2412         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2413             b = meta.get('base') or meta.get('httpBase')
2414             if b:
2415                 base = b
2416                 break
2417
2418         formats, subtitles = [], {}
2419         rtmp_count = 0
2420         http_count = 0
2421         m3u8_count = 0
2422         imgs_count = 0
2423
2424         srcs = set()
2425         media = itertools.chain.from_iterable(
2426             smil.findall(self._xpath_ns(arg, namespace))
2427             for arg in ['.//video', './/audio', './/media'])
2428         for medium in media:
2429             src = medium.get('src')
2430             if not src or src in srcs:
2431                 continue
2432             srcs.add(src)
2433
2434             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2435             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2436             width = int_or_none(medium.get('width'))
2437             height = int_or_none(medium.get('height'))
2438             proto = medium.get('proto')
2439             ext = medium.get('ext')
2440             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2441                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2442             streamer = medium.get('streamer') or base
2443
2444             if proto == 'rtmp' or streamer.startswith('rtmp'):
2445                 rtmp_count += 1
2446                 formats.append({
2447                     'url': streamer,
2448                     'play_path': src,
2449                     'ext': 'flv',
2450                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2451                     'tbr': bitrate,
2452                     'filesize': filesize,
2453                     'width': width,
2454                     'height': height,
2455                 })
2456                 if transform_rtmp_url:
2457                     streamer, src = transform_rtmp_url(streamer, src)
2458                     formats[-1].update({
2459                         'url': streamer,
2460                         'play_path': src,
2461                     })
2462                 continue
2463
2464             src_url = src if src.startswith('http') else urllib.parse.urljoin(f'{base}/', src)
2465             src_url = src_url.strip()
2466
2467             if proto == 'm3u8' or src_ext == 'm3u8':
2468                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2469                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2470                 self._merge_subtitles(m3u8_subs, target=subtitles)
2471                 if len(m3u8_formats) == 1:
2472                     m3u8_count += 1
2473                     m3u8_formats[0].update({
2474                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2475                         'tbr': bitrate,
2476                         'width': width,
2477                         'height': height,
2478                     })
2479                 formats.extend(m3u8_formats)
2480             elif src_ext == 'f4m':
2481                 f4m_url = src_url
2482                 if not f4m_params:
2483                     f4m_params = {
2484                         'hdcore': '3.2.0',
2485                         'plugin': 'flowplayer-3.2.0.1',
2486                     }
2487                 f4m_url += '&' if '?' in f4m_url else '?'
2488                 f4m_url += urllib.parse.urlencode(f4m_params)
2489                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2490             elif src_ext == 'mpd':
2491                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2492                     src_url, video_id, mpd_id='dash', fatal=False)
2493                 formats.extend(mpd_formats)
2494                 self._merge_subtitles(mpd_subs, target=subtitles)
2495             elif re.search(r'\.ism/[Mm]anifest', src_url):
2496                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2497                     src_url, video_id, ism_id='mss', fatal=False)
2498                 formats.extend(ism_formats)
2499                 self._merge_subtitles(ism_subs, target=subtitles)
2500             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2501                 http_count += 1
2502                 formats.append({
2503                     'url': src_url,
2504                     'ext': ext or src_ext or 'flv',
2505                     'format_id': 'http-%d' % (bitrate or http_count),
2506                     'tbr': bitrate,
2507                     'filesize': filesize,
2508                     'width': width,
2509                     'height': height,
2510                 })
2511
2512         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2513             src = medium.get('src')
2514             if not src or src in srcs:
2515                 continue
2516             srcs.add(src)
2517
2518             imgs_count += 1
2519             formats.append({
2520                 'format_id': f'imagestream-{imgs_count}',
2521                 'url': src,
2522                 'ext': mimetype2ext(medium.get('type')),
2523                 'acodec': 'none',
2524                 'vcodec': 'none',
2525                 'width': int_or_none(medium.get('width')),
2526                 'height': int_or_none(medium.get('height')),
2527                 'format_note': 'SMIL storyboards',
2528             })
2529
2530         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2531         self._merge_subtitles(smil_subs, target=subtitles)
2532
2533         return formats, subtitles
2534
2535     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2536         urls = []
2537         subtitles = {}
2538         for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
2539             src = textstream.get('src')
2540             if not src or src in urls:
2541                 continue
2542             urls.append(src)
2543             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2544             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2545             subtitles.setdefault(lang, []).append({
2546                 'url': src,
2547                 'ext': ext,
2548             })
2549         return subtitles
2550
2551     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2552         res = self._download_xml_handle(
2553             xspf_url, playlist_id, 'Downloading xpsf playlist',
2554             'Unable to download xspf manifest', fatal=fatal)
2555         if res is False:
2556             return []
2557
2558         xspf, urlh = res
2559         xspf_url = urlh.url
2560
2561         return self._parse_xspf(
2562             xspf, playlist_id, xspf_url=xspf_url,
2563             xspf_base_url=base_url(xspf_url))
2564
2565     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2566         NS_MAP = {
2567             'xspf': 'http://xspf.org/ns/0/',
2568             's1': 'http://static.streamone.nl/player/ns/0',
2569         }
2570
2571         entries = []
2572         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2573             title = xpath_text(
2574                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2575             description = xpath_text(
2576                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2577             thumbnail = xpath_text(
2578                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2579             duration = float_or_none(
2580                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2581
2582             formats = []
2583             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2584                 format_url = urljoin(xspf_base_url, location.text)
2585                 if not format_url:
2586                     continue
2587                 formats.append({
2588                     'url': format_url,
2589                     'manifest_url': xspf_url,
2590                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2591                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2592                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2593                 })
2594
2595             entries.append({
2596                 'id': playlist_id,
2597                 'title': title,
2598                 'description': description,
2599                 'thumbnail': thumbnail,
2600                 'duration': duration,
2601                 'formats': formats,
2602             })
2603         return entries
2604
2605     def _extract_mpd_formats(self, *args, **kwargs):
2606         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2607         if subs:
2608             self._report_ignoring_subs('DASH')
2609         return fmts
2610
2611     def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2612         periods = self._extract_mpd_periods(*args, **kwargs)
2613         return self._merge_mpd_periods(periods)
2614
2615     def _extract_mpd_periods(
2616             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2617             fatal=True, data=None, headers={}, query={}):
2618
2619         if self.get_param('ignore_no_formats_error'):
2620             fatal = False
2621
2622         res = self._download_xml_handle(
2623             mpd_url, video_id,
2624             note='Downloading MPD manifest' if note is None else note,
2625             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2626             fatal=fatal, data=data, headers=headers, query=query)
2627         if res is False:
2628             return []
2629         mpd_doc, urlh = res
2630         if mpd_doc is None:
2631             return []
2632
2633         # We could have been redirected to a new url when we retrieved our mpd file.
2634         mpd_url = urlh.url
2635         mpd_base_url = base_url(mpd_url)
2636
2637         return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2638
2639     def _parse_mpd_formats(self, *args, **kwargs):
2640         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2641         if subs:
2642             self._report_ignoring_subs('DASH')
2643         return fmts
2644
2645     def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2646         periods = self._parse_mpd_periods(*args, **kwargs)
2647         return self._merge_mpd_periods(periods)
2648
2649     def _merge_mpd_periods(self, periods):
2650         """
2651         Combine all formats and subtitles from an MPD manifest into a single list,
2652         by concatenate streams with similar formats.
2653         """
2654         formats, subtitles = {}, {}
2655         for period in periods:
2656             for f in period['formats']:
2657                 assert 'is_dash_periods' not in f, 'format already processed'
2658                 f['is_dash_periods'] = True
2659                 format_key = tuple(v for k, v in f.items() if k not in (
2660                     ('format_id', 'fragments', 'manifest_stream_number')))
2661                 if format_key not in formats:
2662                     formats[format_key] = f
2663                 elif 'fragments' in f:
2664                     formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2665
2666             if subtitles and period['subtitles']:
2667                 self.report_warning(bug_reports_message(
2668                     'Found subtitles in multiple periods in the DASH manifest; '
2669                     'if part of the subtitles are missing,',
2670                 ), only_once=True)
2671
2672             for sub_lang, sub_info in period['subtitles'].items():
2673                 subtitles.setdefault(sub_lang, []).extend(sub_info)
2674
2675         return list(formats.values()), subtitles
2676
2677     def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2678         """
2679         Parse formats from MPD manifest.
2680         References:
2681          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2682             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2683          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2684         """
2685         if not self.get_param('dynamic_mpd', True):
2686             if mpd_doc.get('type') == 'dynamic':
2687                 return [], {}
2688
2689         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2690
2691         def _add_ns(path):
2692             return self._xpath_ns(path, namespace)
2693
2694         def is_drm_protected(element):
2695             return element.find(_add_ns('ContentProtection')) is not None
2696
2697         def extract_multisegment_info(element, ms_parent_info):
2698             ms_info = ms_parent_info.copy()
2699
2700             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2701             # common attributes and elements.  We will only extract relevant
2702             # for us.
2703             def extract_common(source):
2704                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2705                 if segment_timeline is not None:
2706                     s_e = segment_timeline.findall(_add_ns('S'))
2707                     if s_e:
2708                         ms_info['total_number'] = 0
2709                         ms_info['s'] = []
2710                         for s in s_e:
2711                             r = int(s.get('r', 0))
2712                             ms_info['total_number'] += 1 + r
2713                             ms_info['s'].append({
2714                                 't': int(s.get('t', 0)),
2715                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2716                                 'd': int(s.attrib['d']),
2717                                 'r': r,
2718                             })
2719                 start_number = source.get('startNumber')
2720                 if start_number:
2721                     ms_info['start_number'] = int(start_number)
2722                 timescale = source.get('timescale')
2723                 if timescale:
2724                     ms_info['timescale'] = int(timescale)
2725                 segment_duration = source.get('duration')
2726                 if segment_duration:
2727                     ms_info['segment_duration'] = float(segment_duration)
2728
2729             def extract_Initialization(source):
2730                 initialization = source.find(_add_ns('Initialization'))
2731                 if initialization is not None:
2732                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2733
2734             segment_list = element.find(_add_ns('SegmentList'))
2735             if segment_list is not None:
2736                 extract_common(segment_list)
2737                 extract_Initialization(segment_list)
2738                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2739                 if segment_urls_e:
2740                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2741             else:
2742                 segment_template = element.find(_add_ns('SegmentTemplate'))
2743                 if segment_template is not None:
2744                     extract_common(segment_template)
2745                     media = segment_template.get('media')
2746                     if media:
2747                         ms_info['media'] = media
2748                     initialization = segment_template.get('initialization')
2749                     if initialization:
2750                         ms_info['initialization'] = initialization
2751                     else:
2752                         extract_Initialization(segment_template)
2753             return ms_info
2754
2755         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2756         stream_numbers = collections.defaultdict(int)
2757         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2758             period_entry = {
2759                 'id': period.get('id', f'period-{period_idx}'),
2760                 'formats': [],
2761                 'subtitles': collections.defaultdict(list),
2762             }
2763             period_duration = parse_duration(period.get('duration')) or mpd_duration
2764             period_ms_info = extract_multisegment_info(period, {
2765                 'start_number': 1,
2766                 'timescale': 1,
2767             })
2768             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2769                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2770                 for representation in adaptation_set.findall(_add_ns('Representation')):
2771                     representation_attrib = adaptation_set.attrib.copy()
2772                     representation_attrib.update(representation.attrib)
2773                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2774                     mime_type = representation_attrib['mimeType']
2775                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2776
2777                     codec_str = representation_attrib.get('codecs', '')
2778                     # Some kind of binary subtitle found in some youtube livestreams
2779                     if mime_type == 'application/x-rawcc':
2780                         codecs = {'scodec': codec_str}
2781                     else:
2782                         codecs = parse_codecs(codec_str)
2783                     if content_type not in ('video', 'audio', 'text'):
2784                         if mime_type == 'image/jpeg':
2785                             content_type = mime_type
2786                         elif codecs.get('vcodec', 'none') != 'none':
2787                             content_type = 'video'
2788                         elif codecs.get('acodec', 'none') != 'none':
2789                             content_type = 'audio'
2790                         elif codecs.get('scodec', 'none') != 'none':
2791                             content_type = 'text'
2792                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2793                             content_type = 'text'
2794                         else:
2795                             self.report_warning(f'Unknown MIME type {mime_type} in DASH manifest')
2796                             continue
2797
2798                     base_url = ''
2799                     for element in (representation, adaptation_set, period, mpd_doc):
2800                         base_url_e = element.find(_add_ns('BaseURL'))
2801                         if try_call(lambda: base_url_e.text) is not None:
2802                             base_url = base_url_e.text + base_url
2803                             if re.match(r'^https?://', base_url):
2804                                 break
2805                     if mpd_base_url and base_url.startswith('/'):
2806                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2807                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2808                         if not mpd_base_url.endswith('/'):
2809                             mpd_base_url += '/'
2810                         base_url = mpd_base_url + base_url
2811                     representation_id = representation_attrib.get('id')
2812                     lang = representation_attrib.get('lang')
2813                     url_el = representation.find(_add_ns('BaseURL'))
2814                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2815                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2816                     if representation_id is not None:
2817                         format_id = representation_id
2818                     else:
2819                         format_id = content_type
2820                     if mpd_id:
2821                         format_id = mpd_id + '-' + format_id
2822                     if content_type in ('video', 'audio'):
2823                         f = {
2824                             'format_id': format_id,
2825                             'manifest_url': mpd_url,
2826                             'ext': mimetype2ext(mime_type),
2827                             'width': int_or_none(representation_attrib.get('width')),
2828                             'height': int_or_none(representation_attrib.get('height')),
2829                             'tbr': float_or_none(bandwidth, 1000),
2830                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2831                             'fps': int_or_none(representation_attrib.get('frameRate')),
2832                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2833                             'format_note': f'DASH {content_type}',
2834                             'filesize': filesize,
2835                             'container': mimetype2ext(mime_type) + '_dash',
2836                             **codecs,
2837                         }
2838                     elif content_type == 'text':
2839                         f = {
2840                             'ext': mimetype2ext(mime_type),
2841                             'manifest_url': mpd_url,
2842                             'filesize': filesize,
2843                         }
2844                     elif content_type == 'image/jpeg':
2845                         # See test case in VikiIE
2846                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2847                         f = {
2848                             'format_id': format_id,
2849                             'ext': 'mhtml',
2850                             'manifest_url': mpd_url,
2851                             'format_note': 'DASH storyboards (jpeg)',
2852                             'acodec': 'none',
2853                             'vcodec': 'none',
2854                         }
2855                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2856                         f['has_drm'] = True
2857                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2858
2859                     def prepare_template(template_name, identifiers):
2860                         tmpl = representation_ms_info[template_name]
2861                         if representation_id is not None:
2862                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2863                         # First of, % characters outside $...$ templates
2864                         # must be escaped by doubling for proper processing
2865                         # by % operator string formatting used further (see
2866                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2867                         t = ''
2868                         in_template = False
2869                         for c in tmpl:
2870                             t += c
2871                             if c == '$':
2872                                 in_template = not in_template
2873                             elif c == '%' and not in_template:
2874                                 t += c
2875                         # Next, $...$ templates are translated to their
2876                         # %(...) counterparts to be used with % operator
2877                         t = re.sub(r'\$({})\$'.format('|'.join(identifiers)), r'%(\1)d', t)
2878                         t = re.sub(r'\$({})%([^$]+)\$'.format('|'.join(identifiers)), r'%(\1)\2', t)
2879                         t.replace('$$', '$')
2880                         return t
2881
2882                     # @initialization is a regular template like @media one
2883                     # so it should be handled just the same way (see
2884                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2885                     if 'initialization' in representation_ms_info:
2886                         initialization_template = prepare_template(
2887                             'initialization',
2888                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2889                             # $Time$ shall not be included for @initialization thus
2890                             # only $Bandwidth$ remains
2891                             ('Bandwidth', ))
2892                         representation_ms_info['initialization_url'] = initialization_template % {
2893                             'Bandwidth': bandwidth,
2894                         }
2895
2896                     def location_key(location):
2897                         return 'url' if re.match(r'^https?://', location) else 'path'
2898
2899                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2900
2901                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2902                         media_location_key = location_key(media_template)
2903
2904                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2905                         # can't be used at the same time
2906                         if '%(Number' in media_template and 's' not in representation_ms_info:
2907                             segment_duration = None
2908                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2909                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2910                                 representation_ms_info['total_number'] = int(math.ceil(
2911                                     float_or_none(period_duration, segment_duration, default=0)))
2912                             representation_ms_info['fragments'] = [{
2913                                 media_location_key: media_template % {
2914                                     'Number': segment_number,
2915                                     'Bandwidth': bandwidth,
2916                                 },
2917                                 'duration': segment_duration,
2918                             } for segment_number in range(
2919                                 representation_ms_info['start_number'],
2920                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2921                         else:
2922                             # $Number*$ or $Time$ in media template with S list available
2923                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2924                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2925                             representation_ms_info['fragments'] = []
2926                             segment_time = 0
2927                             segment_d = None
2928                             segment_number = representation_ms_info['start_number']
2929
2930                             def add_segment_url():
2931                                 segment_url = media_template % {
2932                                     'Time': segment_time,
2933                                     'Bandwidth': bandwidth,
2934                                     'Number': segment_number,
2935                                 }
2936                                 representation_ms_info['fragments'].append({
2937                                     media_location_key: segment_url,
2938                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2939                                 })
2940
2941                             for s in representation_ms_info['s']:
2942                                 segment_time = s.get('t') or segment_time
2943                                 segment_d = s['d']
2944                                 add_segment_url()
2945                                 segment_number += 1
2946                                 for _ in range(s.get('r', 0)):
2947                                     segment_time += segment_d
2948                                     add_segment_url()
2949                                     segment_number += 1
2950                                 segment_time += segment_d
2951                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2952                         # No media template,
2953                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2954                         # or any YouTube dashsegments video
2955                         fragments = []
2956                         segment_index = 0
2957                         timescale = representation_ms_info['timescale']
2958                         for s in representation_ms_info['s']:
2959                             duration = float_or_none(s['d'], timescale)
2960                             for _ in range(s.get('r', 0) + 1):
2961                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2962                                 fragments.append({
2963                                     location_key(segment_uri): segment_uri,
2964                                     'duration': duration,
2965                                 })
2966                                 segment_index += 1
2967                         representation_ms_info['fragments'] = fragments
2968                     elif 'segment_urls' in representation_ms_info:
2969                         # Segment URLs with no SegmentTimeline
2970                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2971                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2972                         fragments = []
2973                         segment_duration = float_or_none(
2974                             representation_ms_info['segment_duration'],
2975                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2976                         for segment_url in representation_ms_info['segment_urls']:
2977                             fragment = {
2978                                 location_key(segment_url): segment_url,
2979                             }
2980                             if segment_duration:
2981                                 fragment['duration'] = segment_duration
2982                             fragments.append(fragment)
2983                         representation_ms_info['fragments'] = fragments
2984                     # If there is a fragments key available then we correctly recognized fragmented media.
2985                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2986                     # assumption is not necessarily correct since we may simply have no support for
2987                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2988                     if 'fragments' in representation_ms_info:
2989                         f.update({
2990                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2991                             'url': mpd_url or base_url,
2992                             'fragment_base_url': base_url,
2993                             'fragments': [],
2994                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2995                         })
2996                         if 'initialization_url' in representation_ms_info:
2997                             initialization_url = representation_ms_info['initialization_url']
2998                             if not f.get('url'):
2999                                 f['url'] = initialization_url
3000                             f['fragments'].append({location_key(initialization_url): initialization_url})
3001                         f['fragments'].extend(representation_ms_info['fragments'])
3002                         if not period_duration:
3003                             period_duration = try_get(
3004                                 representation_ms_info,
3005                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3006                     else:
3007                         # Assuming direct URL to unfragmented media.
3008                         f['url'] = base_url
3009                     if content_type in ('video', 'audio', 'image/jpeg'):
3010                         f['manifest_stream_number'] = stream_numbers[f['url']]
3011                         stream_numbers[f['url']] += 1
3012                         period_entry['formats'].append(f)
3013                     elif content_type == 'text':
3014                         period_entry['subtitles'][lang or 'und'].append(f)
3015             yield period_entry
3016
3017     def _extract_ism_formats(self, *args, **kwargs):
3018         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3019         if subs:
3020             self._report_ignoring_subs('ISM')
3021         return fmts
3022
3023     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3024         if self.get_param('ignore_no_formats_error'):
3025             fatal = False
3026
3027         res = self._download_xml_handle(
3028             ism_url, video_id,
3029             note='Downloading ISM manifest' if note is None else note,
3030             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3031             fatal=fatal, data=data, headers=headers, query=query)
3032         if res is False:
3033             return [], {}
3034         ism_doc, urlh = res
3035         if ism_doc is None:
3036             return [], {}
3037
3038         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
3039
3040     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3041         """
3042         Parse formats from ISM manifest.
3043         References:
3044          1. [MS-SSTR]: Smooth Streaming Protocol,
3045             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3046         """
3047         if ism_doc.get('IsLive') == 'TRUE':
3048             return [], {}
3049
3050         duration = int(ism_doc.attrib['Duration'])
3051         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3052
3053         formats = []
3054         subtitles = {}
3055         for stream in ism_doc.findall('StreamIndex'):
3056             stream_type = stream.get('Type')
3057             if stream_type not in ('video', 'audio', 'text'):
3058                 continue
3059             url_pattern = stream.attrib['Url']
3060             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3061             stream_name = stream.get('Name')
3062             stream_language = stream.get('Language', 'und')
3063             for track in stream.findall('QualityLevel'):
3064                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3065                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3066                 # TODO: add support for WVC1 and WMAP
3067                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3068                     self.report_warning(f'{fourcc} is not a supported codec')
3069                     continue
3070                 tbr = int(track.attrib['Bitrate']) // 1000
3071                 # [1] does not mention Width and Height attributes. However,
3072                 # they're often present while MaxWidth and MaxHeight are
3073                 # missing, so should be used as fallbacks
3074                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3075                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3076                 sampling_rate = int_or_none(track.get('SamplingRate'))
3077
3078                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3079                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3080
3081                 fragments = []
3082                 fragment_ctx = {
3083                     'time': 0,
3084                 }
3085                 stream_fragments = stream.findall('c')
3086                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3087                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3088                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3089                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3090                     if not fragment_ctx['duration']:
3091                         try:
3092                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3093                         except IndexError:
3094                             next_fragment_time = duration
3095                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3096                     for _ in range(fragment_repeat):
3097                         fragments.append({
3098                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3099                             'duration': fragment_ctx['duration'] / stream_timescale,
3100                         })
3101                         fragment_ctx['time'] += fragment_ctx['duration']
3102
3103                 if stream_type == 'text':
3104                     subtitles.setdefault(stream_language, []).append({
3105                         'ext': 'ismt',
3106                         'protocol': 'ism',
3107                         'url': ism_url,
3108                         'manifest_url': ism_url,
3109                         'fragments': fragments,
3110                         '_download_params': {
3111                             'stream_type': stream_type,
3112                             'duration': duration,
3113                             'timescale': stream_timescale,
3114                             'fourcc': fourcc,
3115                             'language': stream_language,
3116                             'codec_private_data': track.get('CodecPrivateData'),
3117                         },
3118                     })
3119                 elif stream_type in ('video', 'audio'):
3120                     formats.append({
3121                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3122                         'url': ism_url,
3123                         'manifest_url': ism_url,
3124                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3125                         'width': width,
3126                         'height': height,
3127                         'tbr': tbr,
3128                         'asr': sampling_rate,
3129                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3130                         'acodec': 'none' if stream_type == 'video' else fourcc,
3131                         'protocol': 'ism',
3132                         'fragments': fragments,
3133                         'has_drm': ism_doc.find('Protection') is not None,
3134                         'language': stream_language,
3135                         'audio_channels': int_or_none(track.get('Channels')),
3136                         '_download_params': {
3137                             'stream_type': stream_type,
3138                             'duration': duration,
3139                             'timescale': stream_timescale,
3140                             'width': width or 0,
3141                             'height': height or 0,
3142                             'fourcc': fourcc,
3143                             'language': stream_language,
3144                             'codec_private_data': track.get('CodecPrivateData'),
3145                             'sampling_rate': sampling_rate,
3146                             'channels': int_or_none(track.get('Channels', 2)),
3147                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3148                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3149                         },
3150                     })
3151         return formats, subtitles
3152
3153     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3154         def absolute_url(item_url):
3155             return urljoin(base_url, item_url)
3156
3157         def parse_content_type(content_type):
3158             if not content_type:
3159                 return {}
3160             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3161             if ctr:
3162                 mimetype, codecs = ctr.groups()
3163                 f = parse_codecs(codecs)
3164                 f['ext'] = mimetype2ext(mimetype)
3165                 return f
3166             return {}
3167
3168         def _media_formats(src, cur_media_type, type_info=None):
3169             type_info = type_info or {}
3170             full_url = absolute_url(src)
3171             ext = type_info.get('ext') or determine_ext(full_url)
3172             if ext == 'm3u8':
3173                 is_plain_url = False
3174                 formats = self._extract_m3u8_formats(
3175                     full_url, video_id, ext='mp4',
3176                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3177                     preference=preference, quality=quality, fatal=False)
3178             elif ext == 'mpd':
3179                 is_plain_url = False
3180                 formats = self._extract_mpd_formats(
3181                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3182             else:
3183                 is_plain_url = True
3184                 formats = [{
3185                     'url': full_url,
3186                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3187                     'ext': ext,
3188                 }]
3189             return is_plain_url, formats
3190
3191         entries = []
3192         # amp-video and amp-audio are very similar to their HTML5 counterparts
3193         # so we will include them right here (see
3194         # https://www.ampproject.org/docs/reference/components/amp-video)
3195         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3196         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3197         media_tags = [(media_tag, media_tag_name, media_type, '')
3198                       for media_tag, media_tag_name, media_type
3199                       in re.findall(rf'(?s)(<({_MEDIA_TAG_NAME_RE})[^>]*/>)', webpage)]
3200         media_tags.extend(re.findall(
3201             # We only allow video|audio followed by a whitespace or '>'.
3202             # Allowing more characters may end up in significant slow down (see
3203             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3204             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3205             rf'(?s)(<(?P<tag>{_MEDIA_TAG_NAME_RE})(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
3206         for media_tag, _, media_type, media_content in media_tags:
3207             media_info = {
3208                 'formats': [],
3209                 'subtitles': {},
3210             }
3211             media_attributes = extract_attributes(media_tag)
3212             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3213             if src:
3214                 f = parse_content_type(media_attributes.get('type'))
3215                 _, formats = _media_formats(src, media_type, f)
3216                 media_info['formats'].extend(formats)
3217             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3218             if media_content:
3219                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3220                     s_attr = extract_attributes(source_tag)
3221                     # data-video-src and data-src are non standard but seen
3222                     # several times in the wild
3223                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3224                     if not src:
3225                         continue
3226                     f = parse_content_type(s_attr.get('type'))
3227                     is_plain_url, formats = _media_formats(src, media_type, f)
3228                     if is_plain_url:
3229                         # width, height, res, label and title attributes are
3230                         # all not standard but seen several times in the wild
3231                         labels = [
3232                             s_attr.get(lbl)
3233                             for lbl in ('label', 'title')
3234                             if str_or_none(s_attr.get(lbl))
3235                         ]
3236                         width = int_or_none(s_attr.get('width'))
3237                         height = (int_or_none(s_attr.get('height'))
3238                                   or int_or_none(s_attr.get('res')))
3239                         if not width or not height:
3240                             for lbl in labels:
3241                                 resolution = parse_resolution(lbl)
3242                                 if not resolution:
3243                                     continue
3244                                 width = width or resolution.get('width')
3245                                 height = height or resolution.get('height')
3246                         for lbl in labels:
3247                             tbr = parse_bitrate(lbl)
3248                             if tbr:
3249                                 break
3250                         else:
3251                             tbr = None
3252                         f.update({
3253                             'width': width,
3254                             'height': height,
3255                             'tbr': tbr,
3256                             'format_id': s_attr.get('label') or s_attr.get('title'),
3257                         })
3258                         f.update(formats[0])
3259                         media_info['formats'].append(f)
3260                     else:
3261                         media_info['formats'].extend(formats)
3262                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3263                     track_attributes = extract_attributes(track_tag)
3264                     kind = track_attributes.get('kind')
3265                     if not kind or kind in ('subtitles', 'captions'):
3266                         src = strip_or_none(track_attributes.get('src'))
3267                         if not src:
3268                             continue
3269                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3270                         media_info['subtitles'].setdefault(lang, []).append({
3271                             'url': absolute_url(src),
3272                         })
3273             for f in media_info['formats']:
3274                 f.setdefault('http_headers', {})['Referer'] = base_url
3275             if media_info['formats'] or media_info['subtitles']:
3276                 entries.append(media_info)
3277         return entries
3278
3279     def _extract_akamai_formats(self, *args, **kwargs):
3280         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3281         if subs:
3282             self._report_ignoring_subs('akamai')
3283         return fmts
3284
3285     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3286         signed = 'hdnea=' in manifest_url
3287         if not signed:
3288             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3289             manifest_url = re.sub(
3290                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3291                 '', manifest_url).strip('?')
3292
3293         formats = []
3294         subtitles = {}
3295
3296         hdcore_sign = 'hdcore=3.7.0'
3297         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3298         hds_host = hosts.get('hds')
3299         if hds_host:
3300             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3301         if 'hdcore=' not in f4m_url:
3302             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3303         f4m_formats = self._extract_f4m_formats(
3304             f4m_url, video_id, f4m_id='hds', fatal=False)
3305         for entry in f4m_formats:
3306             entry.update({'extra_param_to_segment_url': hdcore_sign})
3307         formats.extend(f4m_formats)
3308
3309         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3310         hls_host = hosts.get('hls')
3311         if hls_host:
3312             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3313         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3314             m3u8_url, video_id, 'mp4', 'm3u8_native',
3315             m3u8_id='hls', fatal=False)
3316         formats.extend(m3u8_formats)
3317         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3318
3319         http_host = hosts.get('http')
3320         if http_host and m3u8_formats and not signed:
3321             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3322             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3323             qualities_length = len(qualities)
3324             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3325                 i = 0
3326                 for f in m3u8_formats:
3327                     if f['vcodec'] != 'none':
3328                         for protocol in ('http', 'https'):
3329                             http_f = f.copy()
3330                             del http_f['manifest_url']
3331                             http_url = re.sub(
3332                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3333                             http_f.update({
3334                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3335                                 'url': http_url,
3336                                 'protocol': protocol,
3337                             })
3338                             formats.append(http_f)
3339                         i += 1
3340
3341         return formats, subtitles
3342
3343     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3344         query = urllib.parse.urlparse(url).query
3345         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3346         mobj = re.search(
3347             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3348         url_base = mobj.group('url')
3349         http_base_url = '{}{}:{}'.format('http', mobj.group('s') or '', url_base)
3350         formats = []
3351
3352         def manifest_url(manifest):
3353             m_url = f'{http_base_url}/{manifest}'
3354             if query:
3355                 m_url += f'?{query}'
3356             return m_url
3357
3358         if 'm3u8' not in skip_protocols:
3359             formats.extend(self._extract_m3u8_formats(
3360                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3361                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3362         if 'f4m' not in skip_protocols:
3363             formats.extend(self._extract_f4m_formats(
3364                 manifest_url('manifest.f4m'),
3365                 video_id, f4m_id='hds', fatal=False))
3366         if 'dash' not in skip_protocols:
3367             formats.extend(self._extract_mpd_formats(
3368                 manifest_url('manifest.mpd'),
3369                 video_id, mpd_id='dash', fatal=False))
3370         if re.search(r'(?:/smil:|\.smil)', url_base):
3371             if 'smil' not in skip_protocols:
3372                 rtmp_formats = self._extract_smil_formats(
3373                     manifest_url('jwplayer.smil'),
3374                     video_id, fatal=False)
3375                 for rtmp_format in rtmp_formats:
3376                     rtsp_format = rtmp_format.copy()
3377                     rtsp_format['url'] = '{}/{}'.format(rtmp_format['url'], rtmp_format['play_path'])
3378                     del rtsp_format['play_path']
3379                     del rtsp_format['ext']
3380                     rtsp_format.update({
3381                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3382                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3383                         'protocol': 'rtsp',
3384                     })
3385                     formats.extend([rtmp_format, rtsp_format])
3386         else:
3387             for protocol in ('rtmp', 'rtsp'):
3388                 if protocol not in skip_protocols:
3389                     formats.append({
3390                         'url': f'{protocol}:{url_base}',
3391                         'format_id': protocol,
3392                         'protocol': protocol,
3393                     })
3394         return formats
3395
3396     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3397         return self._search_json(
3398             r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
3399             webpage, 'JWPlayer data', video_id,
3400             # must be a {...} or sequence, ending
3401             contains_pattern=r'\{(?s:.*)}(?(load)(?:\s*,\s*\{(?s:.*)})*)', end_pattern=r'(?(load)\]|\))',
3402             transform_source=transform_source, default=None)
3403
3404     def _extract_jwplayer_data(self, webpage, video_id, *args, transform_source=js_to_json, **kwargs):
3405         jwplayer_data = self._find_jwplayer_data(
3406             webpage, video_id, transform_source=transform_source)
3407         return self._parse_jwplayer_data(
3408             jwplayer_data, video_id, *args, **kwargs)
3409
3410     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3411                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3412         entries = []
3413         if not isinstance(jwplayer_data, dict):
3414             return entries
3415
3416         playlist_items = jwplayer_data.get('playlist')
3417         # JWPlayer backward compatibility: single playlist item/flattened playlists
3418         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3419         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3420         if not isinstance(playlist_items, list):
3421             playlist_items = (playlist_items or jwplayer_data, )
3422
3423         for video_data in playlist_items:
3424             if not isinstance(video_data, dict):
3425                 continue
3426             # JWPlayer backward compatibility: flattened sources
3427             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3428             if 'sources' not in video_data:
3429                 video_data['sources'] = [video_data]
3430
3431             this_video_id = video_id or video_data['mediaid']
3432
3433             formats = self._parse_jwplayer_formats(
3434                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3435                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3436
3437             subtitles = {}
3438             for track in traverse_obj(video_data, (
3439                     'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
3440                 track_url = urljoin(base_url, track.get('file'))
3441                 if not track_url:
3442                     continue
3443                 subtitles.setdefault(track.get('label') or 'en', []).append({
3444                     'url': self._proto_relative_url(track_url),
3445                 })
3446
3447             entry = {
3448                 'id': this_video_id,
3449                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3450                 'description': clean_html(video_data.get('description')),
3451                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3452                 'timestamp': int_or_none(video_data.get('pubdate')),
3453                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3454                 'subtitles': subtitles,
3455                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3456                 'genre': clean_html(video_data.get('genre')),
3457                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3458                 'season_number': int_or_none(video_data.get('season')),
3459                 'episode_number': int_or_none(video_data.get('episode')),
3460                 'release_year': int_or_none(video_data.get('releasedate')),
3461                 'age_limit': int_or_none(video_data.get('age_restriction')),
3462             }
3463             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3464             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3465                 entry.update({
3466                     '_type': 'url_transparent',
3467                     'url': formats[0]['url'],
3468                 })
3469             else:
3470                 entry['formats'] = formats
3471             entries.append(entry)
3472         if len(entries) == 1:
3473             return entries[0]
3474         else:
3475             return self.playlist_result(entries)
3476
3477     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3478                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3479         urls = set()
3480         formats = []
3481         for source in jwplayer_sources_data:
3482             if not isinstance(source, dict):
3483                 continue
3484             source_url = urljoin(
3485                 base_url, self._proto_relative_url(source.get('file')))
3486             if not source_url or source_url in urls:
3487                 continue
3488             urls.add(source_url)
3489             source_type = source.get('type') or ''
3490             ext = mimetype2ext(source_type) or determine_ext(source_url)
3491             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3492                 formats.extend(self._extract_m3u8_formats(
3493                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3494                     m3u8_id=m3u8_id, fatal=False))
3495             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3496                 formats.extend(self._extract_mpd_formats(
3497                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3498             elif ext == 'smil':
3499                 formats.extend(self._extract_smil_formats(
3500                     source_url, video_id, fatal=False))
3501             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3502             elif source_type.startswith('audio') or ext in (
3503                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3504                 formats.append({
3505                     'url': source_url,
3506                     'vcodec': 'none',
3507                     'ext': ext,
3508                 })
3509             else:
3510                 format_id = str_or_none(source.get('label'))
3511                 height = int_or_none(source.get('height'))
3512                 if height is None and format_id:
3513                     # Often no height is provided but there is a label in
3514                     # format like "1080p", "720p SD", or 1080.
3515                     height = parse_resolution(format_id).get('height')
3516                 a_format = {
3517                     'url': source_url,
3518                     'width': int_or_none(source.get('width')),
3519                     'height': height,
3520                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3521                     'filesize': int_or_none(source.get('filesize')),
3522                     'ext': ext,
3523                     'format_id': format_id,
3524                 }
3525                 if source_url.startswith('rtmp'):
3526                     a_format['ext'] = 'flv'
3527                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3528                     # of jwplayer.flash.swf
3529                     rtmp_url_parts = re.split(
3530                         r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3531                     if len(rtmp_url_parts) == 3:
3532                         rtmp_url, prefix, play_path = rtmp_url_parts
3533                         a_format.update({
3534                             'url': rtmp_url,
3535                             'play_path': prefix + play_path,
3536                         })
3537                     if rtmp_params:
3538                         a_format.update(rtmp_params)
3539                 formats.append(a_format)
3540         return formats
3541
3542     def _live_title(self, name):
3543         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3544         return name
3545
3546     def _int(self, v, name, fatal=False, **kwargs):
3547         res = int_or_none(v, **kwargs)
3548         if res is None:
3549             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3550             if fatal:
3551                 raise ExtractorError(msg)
3552             else:
3553                 self.report_warning(msg)
3554         return res
3555
3556     def _float(self, v, name, fatal=False, **kwargs):
3557         res = float_or_none(v, **kwargs)
3558         if res is None:
3559             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3560             if fatal:
3561                 raise ExtractorError(msg)
3562             else:
3563                 self.report_warning(msg)
3564         return res
3565
3566     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3567                     path='/', secure=False, discard=False, rest={}, **kwargs):
3568         cookie = http.cookiejar.Cookie(
3569             0, name, value, port, port is not None, domain, True,
3570             domain.startswith('.'), path, True, secure, expire_time,
3571             discard, None, None, rest)
3572         self.cookiejar.set_cookie(cookie)
3573
3574     def _get_cookies(self, url):
3575         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3576         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3577
3578     def _apply_first_set_cookie_header(self, url_handle, cookie):
3579         """
3580         Apply first Set-Cookie header instead of the last. Experimental.
3581
3582         Some sites (e.g. [1-3]) may serve two cookies under the same name
3583         in Set-Cookie header and expect the first (old) one to be set rather
3584         than second (new). However, as of RFC6265 the newer one cookie
3585         should be set into cookie store what actually happens.
3586         We will workaround this issue by resetting the cookie to
3587         the first one manually.
3588         1. https://new.vk.com/
3589         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3590         3. https://learning.oreilly.com/
3591         """
3592         for header, cookies in url_handle.headers.items():
3593             if header.lower() != 'set-cookie':
3594                 continue
3595             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3596             cookie_value = re.search(
3597                 rf'{cookie}=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)', cookies)
3598             if cookie_value:
3599                 value, domain = cookie_value.groups()
3600                 self._set_cookie(domain, cookie, value)
3601                 break
3602
3603     @classmethod
3604     def get_testcases(cls, include_onlymatching=False):
3605         # Do not look in super classes
3606         t = vars(cls).get('_TEST')
3607         if t:
3608             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3609             tests = [t]
3610         else:
3611             tests = vars(cls).get('_TESTS', [])
3612         for t in tests:
3613             if not include_onlymatching and t.get('only_matching', False):
3614                 continue
3615             t['name'] = cls.ie_key()
3616             yield t
3617         if getattr(cls, '__wrapped__', None):
3618             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3619
3620     @classmethod
3621     def get_webpage_testcases(cls):
3622         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3623         for t in tests:
3624             t['name'] = cls.ie_key()
3625             yield t
3626         if getattr(cls, '__wrapped__', None):
3627             yield from cls.__wrapped__.get_webpage_testcases()
3628
3629     @classproperty(cache=True)
3630     def age_limit(cls):
3631         """Get age limit from the testcases"""
3632         return max(traverse_obj(
3633             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3634             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3635
3636     @classproperty(cache=True)
3637     def _RETURN_TYPE(cls):
3638         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3639         tests = tuple(cls.get_testcases(include_onlymatching=False))
3640         if not tests:
3641             return None
3642         elif not any(k.startswith('playlist') for test in tests for k in test):
3643             return 'video'
3644         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3645             return 'playlist'
3646         return 'any'
3647
3648     @classmethod
3649     def is_single_video(cls, url):
3650         """Returns whether the URL is of a single video, None if unknown"""
3651         if cls.suitable(url):
3652             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3653
3654     @classmethod
3655     def is_suitable(cls, age_limit):
3656         """Test whether the extractor is generally suitable for the given age limit"""
3657         return not age_restricted(cls.age_limit, age_limit)
3658
3659     @classmethod
3660     def description(cls, *, markdown=True, search_examples=None):
3661         """Description of the extractor"""
3662         desc = ''
3663         if cls._NETRC_MACHINE:
3664             if markdown:
3665                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3666             else:
3667                 desc += f' [{cls._NETRC_MACHINE}]'
3668         if cls.IE_DESC is False:
3669             desc += ' [HIDDEN]'
3670         elif cls.IE_DESC:
3671             desc += f' {cls.IE_DESC}'
3672         if cls.SEARCH_KEY:
3673             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3674             if search_examples:
3675                 _COUNTS = ('', '5', '10', 'all')
3676                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3677         if not cls.working():
3678             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3679
3680         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3681         name = (' - **{}**'.format(re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME))) if markdown else cls.IE_NAME
3682         return f'{name}:{desc}' if desc else name
3683
3684     def extract_subtitles(self, *args, **kwargs):
3685         if (self.get_param('writesubtitles', False)
3686                 or self.get_param('listsubtitles')):
3687             return self._get_subtitles(*args, **kwargs)
3688         return {}
3689
3690     def _get_subtitles(self, *args, **kwargs):
3691         raise NotImplementedError('This method must be implemented by subclasses')
3692
3693     class CommentsDisabled(Exception):
3694         """Raise in _get_comments if comments are disabled for the video"""
3695
3696     def extract_comments(self, *args, **kwargs):
3697         if not self.get_param('getcomments'):
3698             return None
3699         generator = self._get_comments(*args, **kwargs)
3700
3701         def extractor():
3702             comments = []
3703             interrupted = True
3704             try:
3705                 while True:
3706                     comments.append(next(generator))
3707             except StopIteration:
3708                 interrupted = False
3709             except KeyboardInterrupt:
3710                 self.to_screen('Interrupted by user')
3711             except self.CommentsDisabled:
3712                 return {'comments': None, 'comment_count': None}
3713             except Exception as e:
3714                 if self.get_param('ignoreerrors') is not True:
3715                     raise
3716                 self._downloader.report_error(e)
3717             comment_count = len(comments)
3718             self.to_screen(f'Extracted {comment_count} comments')
3719             return {
3720                 'comments': comments,
3721                 'comment_count': None if interrupted else comment_count,
3722             }
3723         return extractor
3724
3725     def _get_comments(self, *args, **kwargs):
3726         raise NotImplementedError('This method must be implemented by subclasses')
3727
3728     @staticmethod
3729     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3730         """ Merge subtitle items for one language. Items with duplicated URLs/data
3731         will be dropped. """
3732         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3733         ret = list(subtitle_list1)
3734         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3735         return ret
3736
3737     @classmethod
3738     def _merge_subtitles(cls, *dicts, target=None):
3739         """ Merge subtitle dictionaries, language by language. """
3740         if target is None:
3741             target = {}
3742         for d in dicts:
3743             for lang, subs in d.items():
3744                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3745         return target
3746
3747     def extract_automatic_captions(self, *args, **kwargs):
3748         if (self.get_param('writeautomaticsub', False)
3749                 or self.get_param('listsubtitles')):
3750             return self._get_automatic_captions(*args, **kwargs)
3751         return {}
3752
3753     def _get_automatic_captions(self, *args, **kwargs):
3754         raise NotImplementedError('This method must be implemented by subclasses')
3755
3756     @functools.cached_property
3757     def _cookies_passed(self):
3758         """Whether cookies have been passed to YoutubeDL"""
3759         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3760
3761     def mark_watched(self, *args, **kwargs):
3762         if not self.get_param('mark_watched', False):
3763             return
3764         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3765             self._mark_watched(*args, **kwargs)
3766
3767     def _mark_watched(self, *args, **kwargs):
3768         raise NotImplementedError('This method must be implemented by subclasses')
3769
3770     def geo_verification_headers(self):
3771         headers = {}
3772         geo_verification_proxy = self.get_param('geo_verification_proxy')
3773         if geo_verification_proxy:
3774             headers['Ytdl-request-proxy'] = geo_verification_proxy
3775         return headers
3776
3777     @staticmethod
3778     def _generic_id(url):
3779         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3780
3781     def _generic_title(self, url='', webpage='', *, default=None):
3782         return (self._og_search_title(webpage, default=None)
3783                 or self._html_extract_title(webpage, default=None)
3784                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3785                 or default)
3786
3787     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3788         if not duration:
3789             return
3790         chapter_list = [{
3791             'start_time': start_function(chapter),
3792             'title': title_function(chapter),
3793         } for chapter in chapter_list or []]
3794         if strict:
3795             warn = self.report_warning
3796         else:
3797             warn = self.write_debug
3798             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3799
3800         chapters = [{'start_time': 0}]
3801         for idx, chapter in enumerate(chapter_list):
3802             if chapter['start_time'] is None:
3803                 warn(f'Incomplete chapter {idx}')
3804             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3805                 chapters.append(chapter)
3806             elif chapter not in chapters:
3807                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3808                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3809                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3810         return chapters[1:]
3811
3812     def _extract_chapters_from_description(self, description, duration):
3813         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3814         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3815         return self._extract_chapters_helper(
3816             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3817             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3818             duration=duration, strict=False) or self._extract_chapters_helper(
3819             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3820             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3821             duration=duration, strict=False)
3822
3823     @staticmethod
3824     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3825         all_known = all(
3826             x is not None for x in
3827             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
3828         return (
3829             'private' if is_private
3830             else 'premium_only' if needs_premium
3831             else 'subscriber_only' if needs_subscription
3832             else 'needs_auth' if needs_auth
3833             else 'unlisted' if is_unlisted
3834             else 'public' if all_known
3835             else None)
3836
3837     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3838         '''
3839         @returns            A list of values for the extractor argument given by "key"
3840                             or "default" if no such key is present
3841         @param default      The default value to return when the key is not present (default: [])
3842         @param casesense    When false, the values are converted to lower case
3843         '''
3844         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3845         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3846         if val is None:
3847             return [] if default is NO_DEFAULT else default
3848         return list(val) if casesense else [x.lower() for x in val]
3849
3850     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3851         if not playlist_id or not video_id:
3852             return not video_id
3853
3854         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3855         if no_playlist is not None:
3856             return not no_playlist
3857
3858         video_id = '' if video_id is True else f' {video_id}'
3859         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3860         if self.get_param('noplaylist'):
3861             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3862             return False
3863         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3864         return True
3865
3866     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3867         RetryManager.report_retry(
3868             err, _count or int(fatal), _retries,
3869             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3870             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3871
3872     def RetryManager(self, **kwargs):
3873         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3874
3875     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3876         display_id = traverse_obj(info_dict, 'display_id', 'id')
3877         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3878         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3879             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3880
3881     @classmethod
3882     def extract_from_webpage(cls, ydl, url, webpage):
3883         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3884               else ydl.get_info_extractor(cls.ie_key()))
3885         for info in ie._extract_from_webpage(url, webpage) or []:
3886             # url = None since we do not want to set (webpage/original)_url
3887             ydl.add_default_extra_info(info, ie, None)
3888             yield info
3889
3890     @classmethod
3891     def _extract_from_webpage(cls, url, webpage):
3892         for embed_url in orderedSet(
3893                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3894             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3895
3896     @classmethod
3897     def _extract_embed_urls(cls, url, webpage):
3898         """@returns all the embed urls on the webpage"""
3899         if '_EMBED_URL_RE' not in cls.__dict__:
3900             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3901             for idx, regex in enumerate(cls._EMBED_REGEX):
3902                 assert regex.count('(?P<url>') == 1, \
3903                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3904             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3905
3906         for regex in cls._EMBED_URL_RE:
3907             for mobj in regex.finditer(webpage):
3908                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3909                 if cls._VALID_URL is False or cls.suitable(embed_url):
3910                     yield embed_url
3911
3912     class StopExtraction(Exception):
3913         pass
3914
3915     @classmethod
3916     def _extract_url(cls, webpage):  # TODO: Remove
3917         """Only for compatibility with some older extractors"""
3918         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3919
3920     @classmethod
3921     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3922         if plugin_name:
3923             mro = inspect.getmro(cls)
3924             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3925             cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3926             cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3927             while getattr(super_class, '__wrapped__', None):
3928                 super_class = super_class.__wrapped__
3929             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3930             _PLUGIN_OVERRIDES[super_class].append(cls)
3931
3932         return super().__init_subclass__(**kwargs)
3933
3934
3935 class SearchInfoExtractor(InfoExtractor):
3936     """
3937     Base class for paged search queries extractors.
3938     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3939     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3940     """
3941
3942     _MAX_RESULTS = float('inf')
3943     _RETURN_TYPE = 'playlist'
3944
3945     @classproperty
3946     def _VALID_URL(cls):
3947         return rf'{cls._SEARCH_KEY}(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
3948
3949     def _real_extract(self, query):
3950         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3951         if prefix == '':
3952             return self._get_n_results(query, 1)
3953         elif prefix == 'all':
3954             return self._get_n_results(query, self._MAX_RESULTS)
3955         else:
3956             n = int(prefix)
3957             if n <= 0:
3958                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3959             elif n > self._MAX_RESULTS:
3960                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3961                 n = self._MAX_RESULTS
3962             return self._get_n_results(query, n)
3963
3964     def _get_n_results(self, query, n):
3965         """Get a specified number of results for a query.
3966         Either this function or _search_results must be overridden by subclasses """
3967         return self.playlist_result(
3968             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3969             query, query)
3970
3971     def _search_results(self, query):
3972         """Returns an iterator of search results"""
3973         raise NotImplementedError('This method must be implemented by subclasses')
3974
3975     @classproperty
3976     def SEARCH_KEY(cls):
3977         return cls._SEARCH_KEY
3978
3979
3980 class UnsupportedURLIE(InfoExtractor):
3981     _VALID_URL = '.*'
3982     _ENABLED = False
3983     IE_DESC = False
3984
3985     def _real_extract(self, url):
3986         raise UnsupportedError(url)
3987
3988
3989 _PLUGIN_OVERRIDES = collections.defaultdict(list)