yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import functools
   4 import getpass
   5 import hashlib
   6 import http.client
   7 import http.cookiejar
   8 import http.cookies
   9 import inspect
  10 import itertools
  11 import json
  12 import math
  13 import netrc
  14 import os
  15 import random
  16 import re
  17 import subprocess
  18 import sys
  19 import time
  20 import types
  21 import urllib.parse
  22 import urllib.request
  23 import xml.etree.ElementTree
  24
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..networking.impersonate import ImpersonateTarget
  41 from ..utils import (
  42     IDENTITY,
  43     JSON_LD_RE,
  44     NO_DEFAULT,
  45     ExtractorError,
  46     FormatSorter,
  47     GeoRestrictedError,
  48     GeoUtils,
  49     LenientJSONDecoder,
  50     Popen,
  51     RegexNotFoundError,
  52     RetryManager,
  53     UnsupportedError,
  54     age_restricted,
  55     base_url,
  56     bug_reports_message,
  57     classproperty,
  58     clean_html,
  59     deprecation_warning,
  60     determine_ext,
  61     dict_get,
  62     encode_data_uri,
  63     extract_attributes,
  64     filter_dict,
  65     fix_xml_ampersands,
  66     float_or_none,
  67     format_field,
  68     int_or_none,
  69     join_nonempty,
  70     js_to_json,
  71     mimetype2ext,
  72     netrc_from_content,
  73     orderedSet,
  74     parse_bitrate,
  75     parse_codecs,
  76     parse_duration,
  77     parse_iso8601,
  78     parse_m3u8_attributes,
  79     parse_resolution,
  80     sanitize_filename,
  81     sanitize_url,
  82     smuggle_url,
  83     str_or_none,
  84     str_to_int,
  85     strip_or_none,
  86     traverse_obj,
  87     truncate_string,
  88     try_call,
  89     try_get,
  90     unescapeHTML,
  91     unified_strdate,
  92     unified_timestamp,
  93     url_basename,
  94     url_or_none,
  95     urlhandle_detect_ext,
  96     urljoin,
  97     variadic,
  98     xpath_element,
  99     xpath_text,
 100     xpath_with_ns,
 101 )
 102
 103
 104 class InfoExtractor:
 105     """Information Extractor class.
 106
 107     Information extractors are the classes that, given a URL, extract
 108     information about the video (or videos) the URL refers to. This
 109     information includes the real video URL, the video title, author and
 110     others. The information is stored in a dictionary which is then
 111     passed to the YoutubeDL. The YoutubeDL processes this
 112     information possibly downloading the video to the file system, among
 113     other possible outcomes.
 114
 115     The type field determines the type of the result.
 116     By far the most common value (and the default if _type is missing) is
 117     "video", which indicates a single video.
 118
 119     For a video, the dictionaries must include the following fields:
 120
 121     id:             Video identifier.
 122     title:          Video title, unescaped. Set to an empty string if video has
 123                     no title as opposed to "None" which signifies that the
 124                     extractor failed to obtain a title
 125
 126     Additionally, it must contain either a formats entry or a url one:
 127
 128     formats:        A list of dictionaries for each format available, ordered
 129                     from worst to best quality.
 130
 131                     Potential fields:
 132                     * url        The mandatory URL representing the media:
 133                                    for plain file media - HTTP URL of this file,
 134                                    for RTMP - RTMP URL,
 135                                    for HLS - URL of the M3U8 media playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH
 138                                      - HTTP URL to plain file media (in case of
 139                                        unfragmented media)
 140                                      - URL of the MPD manifest or base URL
 141                                        representing the media if MPD manifest
 142                                        is parsed from a string (in case of
 143                                        fragmented media)
 144                                    for MSS - URL of the ISM manifest.
 145                     * request_data  Data to send in POST request to the URL
 146                     * manifest_url
 147                                  The URL of the manifest file in case of
 148                                  fragmented media:
 149                                    for HLS - URL of the M3U8 master playlist,
 150                                    for HDS - URL of the F4M manifest,
 151                                    for DASH - URL of the MPD manifest,
 152                                    for MSS - URL of the ISM manifest.
 153                     * manifest_stream_number  (For internal use only)
 154                                  The index of the stream in the manifest file
 155                     * ext        Will be calculated from URL if missing
 156                     * format     A human-readable description of the format
 157                                  ("mp4 container with h264/opus").
 158                                  Calculated from the format_id, width, height,
 159                                  and format_note fields if missing.
 160                     * format_id  A short description of the format
 161                                  ("mp4_h264_opus" or "19").
 162                                 Technically optional, but strongly recommended.
 163                     * format_note Additional info about the format
 164                                  ("3D" or "DASH video")
 165                     * width      Width of the video, if known
 166                     * height     Height of the video, if known
 167                     * aspect_ratio  Aspect ratio of the video, if known
 168                                  Automatically calculated from width and height
 169                     * resolution Textual description of width and height
 170                                  Automatically calculated from width and height
 171                     * dynamic_range The dynamic range of the video. One of:
 172                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 173                     * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
 174                     * abr        Average audio bitrate in kbps (1000 bits/sec)
 175                     * acodec     Name of the audio codec in use
 176                     * asr        Audio sampling rate in Hertz
 177                     * audio_channels  Number of audio channels
 178                     * vbr        Average video bitrate in kbps (1000 bits/sec)
 179                     * fps        Frame rate
 180                     * vcodec     Name of the video codec in use
 181                     * container  Name of the container format
 182                     * filesize   The number of bytes, if known in advance
 183                     * filesize_approx  An estimate for the number of bytes
 184                     * player_url SWF Player URL (used for rtmpdump).
 185                     * protocol   The protocol that will be used for the actual
 186                                  download, lower-case. One of "http", "https" or
 187                                  one of the protocols defined in downloader.PROTOCOL_MAP
 188                     * fragment_base_url
 189                                  Base URL for fragments. Each fragment's path
 190                                  value (if present) will be relative to
 191                                  this URL.
 192                     * fragments  A list of fragments of a fragmented media.
 193                                  Each fragment entry must contain either an url
 194                                  or a path. If an url is present it should be
 195                                  considered by a client. Otherwise both path and
 196                                  fragment_base_url must be present. Here is
 197                                  the list of all potential fields:
 198                                  * "url" - fragment's URL
 199                                  * "path" - fragment's path relative to
 200                                             fragment_base_url
 201                                  * "duration" (optional, int or float)
 202                                  * "filesize" (optional, int)
 203                     * is_from_start  Is a live format that can be downloaded
 204                                 from the start. Boolean
 205                     * preference Order number of this format. If this field is
 206                                  present and not None, the formats get sorted
 207                                  by this field, regardless of all other values.
 208                                  -1 for default (order by other properties),
 209                                  -2 or smaller for less than default.
 210                                  < -1000 to hide the format (if there is
 211                                     another one which is strictly better)
 212                     * language   Language code, e.g. "de" or "en-US".
 213                     * language_preference  Is this in the language mentioned in
 214                                  the URL?
 215                                  10 if it's what the URL is about,
 216                                  -1 for default (don't know),
 217                                  -10 otherwise, other values reserved for now.
 218                     * quality    Order number of the video quality of this
 219                                  format, irrespective of the file format.
 220                                  -1 for default (order by other properties),
 221                                  -2 or smaller for less than default.
 222                     * source_preference  Order number for this video source
 223                                   (quality takes higher priority)
 224                                  -1 for default (order by other properties),
 225                                  -2 or smaller for less than default.
 226                     * http_headers  A dictionary of additional HTTP headers
 227                                  to add to the request.
 228                     * stretched_ratio  If given and not 1, indicates that the
 229                                  video's pixels are not square.
 230                                  width : height ratio as float.
 231                     * no_resume  The server does not support resuming the
 232                                  (HTTP or RTMP) download. Boolean.
 233                     * has_drm    True if the format has DRM and cannot be downloaded.
 234                                  'maybe' if the format may have DRM and has to be tested before download.
 235                     * extra_param_to_segment_url  A query string to append to each
 236                                  fragment's URL, or to update each existing query string
 237                                  with. Only applied by the native HLS/DASH downloaders.
 238                     * hls_aes    A dictionary of HLS AES-128 decryption information
 239                                  used by the native HLS downloader to override the
 240                                  values in the media playlist when an '#EXT-X-KEY' tag
 241                                  is present in the playlist:
 242                                  * uri  The URI from which the key will be downloaded
 243                                  * key  The key (as hex) used to decrypt fragments.
 244                                         If `key` is given, any key URI will be ignored
 245                                  * iv   The IV (as hex) used to decrypt fragments
 246                     * downloader_options  A dictionary of downloader options
 247                                  (For internal use only)
 248                                  * http_chunk_size Chunk size for HTTP downloads
 249                                  * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
 250                                  * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
 251                     * is_dash_periods  Whether the format is a result of merging
 252                                  multiple DASH periods.
 253                     RTMP formats can also have the additional fields: page_url,
 254                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 255                     rtmp_protocol, rtmp_real_time
 256
 257     url:            Final video URL.
 258     ext:            Video filename extension.
 259     format:         The video format, defaults to ext (used for --get-format)
 260     player_url:     SWF Player URL (used for rtmpdump).
 261
 262     The following fields are optional:
 263
 264     direct:         True if a direct video file was given (must only be set by GenericIE)
 265     alt_title:      A secondary title of the video.
 266     display_id:     An alternative identifier for the video, not necessarily
 267                     unique, but available before title. Typically, id is
 268                     something like "4234987", title "Dancing naked mole rats",
 269                     and display_id "dancing-naked-mole-rats"
 270     thumbnails:     A list of dictionaries, with the following entries:
 271                         * "id" (optional, string) - Thumbnail format ID
 272                         * "url"
 273                         * "preference" (optional, int) - quality of the image
 274                         * "width" (optional, int)
 275                         * "height" (optional, int)
 276                         * "resolution" (optional, string "{width}x{height}",
 277                                         deprecated)
 278                         * "filesize" (optional, int)
 279                         * "http_headers" (dict) - HTTP headers for the request
 280     thumbnail:      Full URL to a video thumbnail image.
 281     description:    Full video description.
 282     uploader:       Full name of the video uploader.
 283     license:        License name the video is licensed under.
 284     creators:       List of creators of the video.
 285     timestamp:      UNIX timestamp of the moment the video was uploaded
 286     upload_date:    Video upload date in UTC (YYYYMMDD).
 287                     If not explicitly set, calculated from timestamp
 288     release_timestamp: UNIX timestamp of the moment the video was released.
 289                     If it is not clear whether to use timestamp or this, use the former
 290     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 291                     If not explicitly set, calculated from release_timestamp
 292     release_year:   Year (YYYY) as integer when the video or album was released.
 293                     To be used if no exact release date is known.
 294                     If not explicitly set, calculated from release_date.
 295     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 296     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 297                     If not explicitly set, calculated from modified_timestamp
 298     uploader_id:    Nickname or id of the video uploader.
 299     uploader_url:   Full URL to a personal webpage of the video uploader.
 300     channel:        Full name of the channel the video is uploaded on.
 301                     Note that channel fields may or may not repeat uploader
 302                     fields. This depends on a particular extractor.
 303     channel_id:     Id of the channel.
 304     channel_url:    Full URL to a channel webpage.
 305     channel_follower_count: Number of followers of the channel.
 306     channel_is_verified: Whether the channel is verified on the platform.
 307     location:       Physical location where the video was filmed.
 308     subtitles:      The available subtitles as a dictionary in the format
 309                     {tag: subformats}. "tag" is usually a language code, and
 310                     "subformats" is a list sorted from lower to higher
 311                     preference, each element is a dictionary with the "ext"
 312                     entry and one of:
 313                         * "data": The subtitles file contents
 314                         * "url": A URL pointing to the subtitles file
 315                     It can optionally also have:
 316                         * "name": Name or description of the subtitles
 317                         * "http_headers": A dictionary of additional HTTP headers
 318                                   to add to the request.
 319                     "ext" will be calculated from URL if missing
 320     automatic_captions: Like 'subtitles'; contains automatically generated
 321                     captions instead of normal subtitles
 322     duration:       Length of the video in seconds, as an integer or float.
 323     view_count:     How many users have watched the video on the platform.
 324     concurrent_view_count: How many users are currently watching the video on the platform.
 325     like_count:     Number of positive ratings of the video
 326     dislike_count:  Number of negative ratings of the video
 327     repost_count:   Number of reposts of the video
 328     average_rating: Average rating given by users, the scale used depends on the webpage
 329     comment_count:  Number of comments on the video
 330     comments:       A list of comments, each with one or more of the following
 331                     properties (all but one of text or html optional):
 332                         * "author" - human-readable name of the comment author
 333                         * "author_id" - user ID of the comment author
 334                         * "author_thumbnail" - The thumbnail of the comment author
 335                         * "author_url" - The url to the comment author's page
 336                         * "author_is_verified" - Whether the author is verified
 337                                                  on the platform
 338                         * "author_is_uploader" - Whether the comment is made by
 339                                                  the video uploader
 340                         * "id" - Comment ID
 341                         * "html" - Comment as HTML
 342                         * "text" - Plain text of the comment
 343                         * "timestamp" - UNIX timestamp of comment
 344                         * "parent" - ID of the comment this one is replying to.
 345                                      Set to "root" to indicate that this is a
 346                                      comment to the original video.
 347                         * "like_count" - Number of positive ratings of the comment
 348                         * "dislike_count" - Number of negative ratings of the comment
 349                         * "is_favorited" - Whether the comment is marked as
 350                                            favorite by the video uploader
 351                         * "is_pinned" - Whether the comment is pinned to
 352                                         the top of the comments
 353     age_limit:      Age restriction for the video, as an integer (years)
 354     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 355                     should allow to get the same result again. (It will be set
 356                     by YoutubeDL if it's missing)
 357     categories:     A list of categories that the video falls in, for example
 358                     ["Sports", "Berlin"]
 359     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 360     cast:           A list of the video cast
 361     is_live:        True, False, or None (=unknown). Whether this video is a
 362                     live stream that goes on instead of a fixed-length video.
 363     was_live:       True, False, or None (=unknown). Whether this video was
 364                     originally a live stream.
 365     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 366                     or 'post_live' (was live, but VOD is not yet processed)
 367                     If absent, automatically set from is_live, was_live
 368     start_time:     Time in seconds where the reproduction should start, as
 369                     specified in the URL.
 370     end_time:       Time in seconds where the reproduction should end, as
 371                     specified in the URL.
 372     chapters:       A list of dictionaries, with the following entries:
 373                         * "start_time" - The start time of the chapter in seconds
 374                         * "end_time" - The end time of the chapter in seconds
 375                         * "title" (optional, string)
 376     heatmap:        A list of dictionaries, with the following entries:
 377                         * "start_time" - The start time of the data point in seconds
 378                         * "end_time" - The end time of the data point in seconds
 379                         * "value" - The normalized value of the data point (float between 0 and 1)
 380     playable_in_embed: Whether this video is allowed to play in embedded
 381                     players on other sites. Can be True (=always allowed),
 382                     False (=never allowed), None (=unknown), or a string
 383                     specifying the criteria for embedability; e.g. 'whitelist'
 384     availability:   Under what condition the video is available. One of
 385                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 386                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 387                     to set it
 388     media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
 389     _old_archive_ids: A list of old archive ids needed for backward compatibility
 390     _format_sort_fields: A list of fields to use for sorting formats
 391     __post_extractor: A function to be called just before the metadata is
 392                     written to either disk, logger or console. The function
 393                     must return a dict which will be added to the info_dict.
 394                     This is useful for additional information that is
 395                     time-consuming to extract. Note that the fields thus
 396                     extracted will not be available to output template and
 397                     match_filter. So, only "comments" and "comment_count" are
 398                     currently allowed to be extracted via this method.
 399
 400     The following fields should only be used when the video belongs to some logical
 401     chapter or section:
 402
 403     chapter:        Name or title of the chapter the video belongs to.
 404     chapter_number: Number of the chapter the video belongs to, as an integer.
 405     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 406
 407     The following fields should only be used when the video is an episode of some
 408     series, programme or podcast:
 409
 410     series:         Title of the series or programme the video episode belongs to.
 411     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 412     season:         Title of the season the video episode belongs to.
 413     season_number:  Number of the season the video episode belongs to, as an integer.
 414     season_id:      Id of the season the video episode belongs to, as a unicode string.
 415     episode:        Title of the video episode. Unlike mandatory video title field,
 416                     this field should denote the exact title of the video episode
 417                     without any kind of decoration.
 418     episode_number: Number of the video episode within a season, as an integer.
 419     episode_id:     Id of the video episode, as a unicode string.
 420
 421     The following fields should only be used when the media is a track or a part of
 422     a music album:
 423
 424     track:          Title of the track.
 425     track_number:   Number of the track within an album or a disc, as an integer.
 426     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 427                     as a unicode string.
 428     artists:        List of artists of the track.
 429     composers:      List of composers of the piece.
 430     genres:         List of genres of the track.
 431     album:          Title of the album the track belongs to.
 432     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 433     album_artists:  List of all artists appeared on the album.
 434                     E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
 435                     Useful for splits and compilations.
 436     disc_number:    Number of the disc or other physical medium the track belongs to,
 437                     as an integer.
 438
 439     The following fields should only be set for clips that should be cut from the original video:
 440
 441     section_start:  Start time of the section in seconds
 442     section_end:    End time of the section in seconds
 443
 444     The following fields should only be set for storyboards:
 445     rows:           Number of rows in each storyboard fragment, as an integer
 446     columns:        Number of columns in each storyboard fragment, as an integer
 447
 448     The following fields are deprecated and should not be set by new code:
 449     composer:       Use "composers" instead.
 450                     Composer(s) of the piece, comma-separated.
 451     artist:         Use "artists" instead.
 452                     Artist(s) of the track, comma-separated.
 453     genre:          Use "genres" instead.
 454                     Genre(s) of the track, comma-separated.
 455     album_artist:   Use "album_artists" instead.
 456                     All artists appeared on the album, comma-separated.
 457     creator:        Use "creators" instead.
 458                     The creator of the video.
 459
 460     Unless mentioned otherwise, the fields should be Unicode strings.
 461
 462     Unless mentioned otherwise, None is equivalent to absence of information.
 463
 464
 465     _type "playlist" indicates multiple videos.
 466     There must be a key "entries", which is a list, an iterable, or a PagedList
 467     object, each element of which is a valid dictionary by this specification.
 468
 469     Additionally, playlists can have "id", "title", and any other relevant
 470     attributes with the same semantics as videos (see above).
 471
 472     It can also have the following optional fields:
 473
 474     playlist_count: The total number of videos in a playlist. If not given,
 475                     YoutubeDL tries to calculate it from "entries"
 476
 477
 478     _type "multi_video" indicates that there are multiple videos that
 479     form a single show, for example multiple acts of an opera or TV episode.
 480     It must have an entries key like a playlist and contain all the keys
 481     required for a video at the same time.
 482
 483
 484     _type "url" indicates that the video must be extracted from another
 485     location, possibly by a different extractor. Its only required key is:
 486     "url" - the next URL to extract.
 487     The key "ie_key" can be set to the class name (minus the trailing "IE",
 488     e.g. "Youtube") if the extractor class is known in advance.
 489     Additionally, the dictionary may have any properties of the resolved entity
 490     known in advance, for example "title" if the title of the referred video is
 491     known ahead of time.
 492
 493
 494     _type "url_transparent" entities have the same specification as "url", but
 495     indicate that the given additional information is more precise than the one
 496     associated with the resolved URL.
 497     This is useful when a site employs a video service that hosts the video and
 498     its technical metadata, but that video service does not embed a useful
 499     title, description etc.
 500
 501
 502     Subclasses of this should also be added to the list of extractors and
 503     should define _VALID_URL as a regexp or a Sequence of regexps, and
 504     re-define the _real_extract() and (optionally) _real_initialize() methods.
 505
 506     Subclasses may also override suitable() if necessary, but ensure the function
 507     signature is preserved and that this function imports everything it needs
 508     (except other extractors), so that lazy_extractors works correctly.
 509
 510     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 511     the HTML of Generic webpages. It may also override _extract_embed_urls
 512     or _extract_from_webpage as necessary. While these are normally classmethods,
 513     _extract_from_webpage is allowed to be an instance method.
 514
 515     _extract_from_webpage may raise self.StopExtraction() to stop further
 516     processing of the webpage and obtain exclusive rights to it. This is useful
 517     when the extractor cannot reliably be matched using just the URL,
 518     e.g. invidious/peertube instances
 519
 520     Embed-only extractors can be defined by setting _VALID_URL = False.
 521
 522     To support username + password (or netrc) login, the extractor must define a
 523     _NETRC_MACHINE and re-define _perform_login(username, password) and
 524     (optionally) _initialize_pre_login() methods. The _perform_login method will
 525     be called between _initialize_pre_login and _real_initialize if credentials
 526     are passed by the user. In cases where it is necessary to have the login
 527     process as part of the extraction rather than initialization, _perform_login
 528     can be left undefined.
 529
 530     _GEO_BYPASS attribute may be set to False in order to disable
 531     geo restriction bypass mechanisms for a particular extractor.
 532     Though it won't disable explicit geo restriction bypass based on
 533     country code provided with geo_bypass_country.
 534
 535     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 536     countries for this extractor. One of these countries will be used by
 537     geo restriction bypass mechanism right away in order to bypass
 538     geo restriction, of course, if the mechanism is not disabled.
 539
 540     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 541     IP blocks in CIDR notation for this extractor. One of these IP blocks
 542     will be used by geo restriction bypass mechanism similarly
 543     to _GEO_COUNTRIES.
 544
 545     The _ENABLED attribute should be set to False for IEs that
 546     are disabled by default and must be explicitly enabled.
 547
 548     The _WORKING attribute should be set to False for broken IEs
 549     in order to warn the users and skip the tests.
 550     """
 551
    # Set to True by initialize() once the one-time login/initialization steps have run
    _ready = False
    # Downloader this extractor reports to; assigned through set_downloader()
    _downloader = None
    # Fake IP sent as the X-Forwarded-For header once geo bypass has picked one
    _x_forwarded_for_ip = None
    # Geo bypass configuration (see the class docstring)
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # False marks a broken extractor (see the class docstring)
    _WORKING = True
    # False marks an extractor that is disabled by default (see the class docstring)
    _ENABLED = True
    # A truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    # Pattern(s) compiled by _match_valid_url(); False disables URL matching
    _VALID_URL = None
    _EMBED_REGEX = []
 565
 566     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 567         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 568         return {
 569             None: '',
 570             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 571             'password': f'Use {password_hint}',
 572             'cookies': (
 573                 'Use --cookies-from-browser or --cookies for the authentication. '
 574                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 575         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 576
 577     def __init__(self, downloader=None):
 578         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 579         If a downloader is not passed during initialization,
 580         it must be set using "set_downloader()" before "extract()" is called"""
 581         self._ready = False
 582         self._x_forwarded_for_ip = None
 583         self._printed_messages = set()
 584         self.set_downloader(downloader)
 585
 586     @classmethod
 587     def _match_valid_url(cls, url):
 588         if cls._VALID_URL is False:
 589             return None
 590         # This does not use has/getattr intentionally - we want to know whether
 591         # we have cached the regexp for *this* class, whereas getattr would also
 592         # match the superclass
 593         if '_VALID_URL_RE' not in cls.__dict__:
 594             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 595         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 596
 597     @classmethod
 598     def suitable(cls, url):
 599         """Receives a URL and returns True if suitable for this IE."""
 600         # This function must import everything it needs (except other extractors),
 601         # so that lazy_extractors works correctly
 602         return cls._match_valid_url(url) is not None
 603
 604     @classmethod
 605     def _match_id(cls, url):
 606         return cls._match_valid_url(url).group('id')
 607
 608     @classmethod
 609     def get_temp_id(cls, url):
 610         try:
 611             return cls._match_id(url)
 612         except (IndexError, AttributeError):
 613             return None
 614
 615     @classmethod
 616     def working(cls):
 617         """Getter method for _WORKING."""
 618         return cls._WORKING
 619
 620     @classmethod
 621     def supports_login(cls):
 622         return bool(cls._NETRC_MACHINE)
 623
 624     def initialize(self):
 625         """Initializes an instance (authentication, etc)."""
 626         self._printed_messages = set()
 627         self._initialize_geo_bypass({
 628             'countries': self._GEO_COUNTRIES,
 629             'ip_blocks': self._GEO_IP_BLOCKS,
 630         })
 631         if not self._ready:
 632             self._initialize_pre_login()
 633             if self.supports_login():
 634                 username, password = self._get_login_info()
 635                 if username:
 636                     self._perform_login(username, password)
 637             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 638                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 639             self._real_initialize()
 640             self._ready = True
 641
 642     def _initialize_geo_bypass(self, geo_bypass_context):
 643         """
 644         Initialize geo restriction bypass mechanism.
 645
 646         This method is used to initialize geo bypass mechanism based on faking
 647         X-Forwarded-For HTTP header. A random country from provided country list
 648         is selected and a random IP belonging to this country is generated. This
 649         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 650         HTTP requests.
 651
 652         This method will be used for initial geo bypass mechanism initialization
 653         during the instance initialization with _GEO_COUNTRIES and
 654         _GEO_IP_BLOCKS.
 655
 656         You may also manually call it from extractor's code if geo bypass
 657         information is not available beforehand (e.g. obtained during
 658         extraction) or due to some other reason. In this case you should pass
 659         this information in geo bypass context passed as first argument. It may
 660         contain following fields:
 661
 662         countries:  List of geo unrestricted countries (similar
 663                     to _GEO_COUNTRIES)
 664         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 665                     (similar to _GEO_IP_BLOCKS)
 666
 667         """
 668         if not self._x_forwarded_for_ip:
 669
 670             # Geo bypass mechanism is explicitly disabled by user
 671             if not self.get_param('geo_bypass', True):
 672                 return
 673
 674             if not geo_bypass_context:
 675                 geo_bypass_context = {}
 676
 677             # Backward compatibility: previously _initialize_geo_bypass
 678             # expected a list of countries, some 3rd party code may still use
 679             # it this way
 680             if isinstance(geo_bypass_context, (list, tuple)):
 681                 geo_bypass_context = {
 682                     'countries': geo_bypass_context,
 683                 }
 684
 685             # The whole point of geo bypass mechanism is to fake IP
 686             # as X-Forwarded-For HTTP header based on some IP block or
 687             # country code.
 688
 689             # Path 1: bypassing based on IP block in CIDR notation
 690
 691             # Explicit IP block specified by user, use it right away
 692             # regardless of whether extractor is geo bypassable or not
 693             ip_block = self.get_param('geo_bypass_ip_block', None)
 694
 695             # Otherwise use random IP block from geo bypass context but only
 696             # if extractor is known as geo bypassable
 697             if not ip_block:
 698                 ip_blocks = geo_bypass_context.get('ip_blocks')
 699                 if self._GEO_BYPASS and ip_blocks:
 700                     ip_block = random.choice(ip_blocks)
 701
 702             if ip_block:
 703                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 704                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 705                 return
 706
 707             # Path 2: bypassing based on country code
 708
 709             # Explicit country code specified by user, use it right away
 710             # regardless of whether extractor is geo bypassable or not
 711             country = self.get_param('geo_bypass_country', None)
 712
 713             # Otherwise use random country code from geo bypass context but
 714             # only if extractor is known as geo bypassable
 715             if not country:
 716                 countries = geo_bypass_context.get('countries')
 717                 if self._GEO_BYPASS and countries:
 718                     country = random.choice(countries)
 719
 720             if country:
 721                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 722                 self._downloader.write_debug(
 723                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 724
 725     def extract(self, url):
 726         """Extracts URL information and returns it in list of dicts."""
 727         try:
 728             for _ in range(2):
 729                 try:
 730                     self.initialize()
 731                     self.to_screen('Extracting URL: %s' % (
 732                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 733                     ie_result = self._real_extract(url)
 734                     if ie_result is None:
 735                         return None
 736                     if self._x_forwarded_for_ip:
 737                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 738                     subtitles = ie_result.get('subtitles') or {}
 739                     if 'no-live-chat' in self.get_param('compat_opts'):
 740                         for lang in ('live_chat', 'comments', 'danmaku'):
 741                             subtitles.pop(lang, None)
 742                     return ie_result
 743                 except GeoRestrictedError as e:
 744                     if self.__maybe_fake_ip_and_retry(e.countries):
 745                         continue
 746                     raise
 747         except UnsupportedError:
 748             raise
 749         except ExtractorError as e:
 750             e.video_id = e.video_id or self.get_temp_id(url)
 751             e.ie = e.ie or self.IE_NAME
 752             e.traceback = e.traceback or sys.exc_info()[2]
 753             raise
 754         except IncompleteRead as e:
 755             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 756         except (KeyError, StopIteration) as e:
 757             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 758
 759     def __maybe_fake_ip_and_retry(self, countries):
 760         if (not self.get_param('geo_bypass_country', None)
 761                 and self._GEO_BYPASS
 762                 and self.get_param('geo_bypass', True)
 763                 and not self._x_forwarded_for_ip
 764                 and countries):
 765             country_code = random.choice(countries)
 766             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 767             if self._x_forwarded_for_ip:
 768                 self.report_warning(
 769                     'Video is geo restricted. Retrying extraction with fake IP '
 770                     f'{self._x_forwarded_for_ip} ({country_code.upper()}) as X-Forwarded-For.')
 771                 return True
 772         return False
 773
 774     def set_downloader(self, downloader):
 775         """Sets a YoutubeDL instance as the downloader for this IE."""
 776         self._downloader = downloader
 777
 778     @property
 779     def cache(self):
 780         return self._downloader.cache
 781
 782     @property
 783     def cookiejar(self):
 784         return self._downloader.cookiejar
 785
 786     def _initialize_pre_login(self):
 787         """ Initialization before login. Redefine in subclasses."""
 788         pass
 789
 790     def _perform_login(self, username, password):
 791         """ Login with username and password. Redefine in subclasses."""
 792         pass
 793
 794     def _real_initialize(self):
 795         """Real initialization process. Redefine in subclasses."""
 796         pass
 797
 798     def _real_extract(self, url):
 799         """Real extraction process. Redefine in subclasses."""
 800         raise NotImplementedError('This method must be implemented by subclasses')
 801
 802     @classmethod
 803     def ie_key(cls):
 804         """A string for getting the InfoExtractor with get_info_extractor"""
 805         return cls.__name__[:-2]
 806
 807     @classproperty
 808     def IE_NAME(cls):
 809         return cls.__name__[:-2]
 810
 811     @staticmethod
 812     def __can_accept_status_code(err, expected_status):
 813         assert isinstance(err, HTTPError)
 814         if expected_status is None:
 815             return False
 816         elif callable(expected_status):
 817             return expected_status(err.status) is True
 818         else:
 819             return err.status in variadic(expected_status)
 820
 821     def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
 822         if isinstance(url_or_request, urllib.request.Request):
 823             self._downloader.deprecation_warning(
 824                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 825                 'Use yt_dlp.networking.common.Request instead.')
 826             url_or_request = urllib_req_to_req(url_or_request)
 827         elif not isinstance(url_or_request, Request):
 828             url_or_request = Request(url_or_request)
 829
 830         url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
 831         return url_or_request
 832
 833     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
 834                          headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
 835         """
 836         Return the response handle.
 837
 838         See _download_webpage docstring for arguments specification.
 839         """
 840         if not self._downloader._first_webpage_request:
 841             sleep_interval = self.get_param('sleep_interval_requests') or 0
 842             if sleep_interval > 0:
 843                 self.to_screen(f'Sleeping {sleep_interval} seconds ...')
 844                 time.sleep(sleep_interval)
 845         else:
 846             self._downloader._first_webpage_request = False
 847
 848         if note is None:
 849             self.report_download_webpage(video_id)
 850         elif note is not False:
 851             if video_id is None:
 852                 self.to_screen(str(note))
 853             else:
 854                 self.to_screen(f'{video_id}: {note}')
 855
 856         # Some sites check X-Forwarded-For HTTP header in order to figure out
 857         # the origin of the client behind proxy. This allows bypassing geo
 858         # restriction by faking this header's value to IP that belongs to some
 859         # geo unrestricted country. We will do so once we encounter any
 860         # geo restriction error.
 861         if self._x_forwarded_for_ip:
 862             headers = (headers or {}).copy()
 863             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 864
 865         extensions = {}
 866
 867         if impersonate in (True, ''):
 868             impersonate = ImpersonateTarget()
 869         requested_targets = [
 870             t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
 871             for t in variadic(impersonate)
 872         ] if impersonate else []
 873
 874         available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
 875         if available_target:
 876             extensions['impersonate'] = available_target
 877         elif requested_targets:
 878             message = 'The extractor is attempting impersonation, but '
 879             message += (
 880                 'no impersonate target is available' if not str(impersonate)
 881                 else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
 882             info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
 883                         'for information on installing the required dependencies')
 884             if require_impersonation:
 885                 raise ExtractorError(f'{message}; {info_msg}', expected=True)
 886             self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
 887
 888         try:
 889             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
 890         except network_exceptions as err:
 891             if isinstance(err, HTTPError):
 892                 if self.__can_accept_status_code(err, expected_status):
 893                     return err.response
 894
 895             if errnote is False:
 896                 return False
 897             if errnote is None:
 898                 errnote = 'Unable to download webpage'
 899
 900             errmsg = f'{errnote}: {err}'
 901             if fatal:
 902                 raise ExtractorError(errmsg, cause=err)
 903             else:
 904                 self.report_warning(errmsg)
 905                 return False
 906
 907     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 908                                  encoding=None, data=None, headers={}, query={}, expected_status=None,
 909                                  impersonate=None, require_impersonation=False):
 910         """
 911         Return a tuple (page content as string, URL handle).
 912
 913         Arguments:
 914         url_or_request -- plain text URL as a string or
 915             a yt_dlp.networking.Request object
 916         video_id -- Video/playlist/item identifier (string)
 917
 918         Keyword arguments:
 919         note -- note printed before downloading (string)
 920         errnote -- note printed in case of an error (string)
 921         fatal -- flag denoting whether error should be considered fatal,
 922             i.e. whether it should cause ExtractionError to be raised,
 923             otherwise a warning will be reported and extraction continued
 924         encoding -- encoding for a page content decoding, guessed automatically
 925             when not explicitly specified
 926         data -- POST data (bytes)
 927         headers -- HTTP headers (dict)
 928         query -- URL query (dict)
 929         expected_status -- allows to accept failed HTTP requests (non 2xx
 930             status code) by explicitly specifying a set of accepted status
 931             codes. Can be any of the following entities:
 932                 - an integer type specifying an exact failed status code to
 933                   accept
 934                 - a list or a tuple of integer types specifying a list of
 935                   failed status codes to accept
 936                 - a callable accepting an actual failed status code and
 937                   returning True if it should be accepted
 938             Note that this argument does not affect success status codes (2xx)
 939             which are always accepted.
 940         impersonate -- the impersonate target. Can be any of the following entities:
 941                 - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
 942                 - a string in the format of CLIENT[:OS]
 943                 - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
 944                 - a boolean value; True means any impersonate target is sufficient
 945         require_impersonation -- flag to toggle whether the request should raise an error
 946             if impersonation is not possible (bool, default: False)
 947         """
 948
 949         # Strip hashes from the URL (#1038)
 950         if isinstance(url_or_request, str):
 951             url_or_request = url_or_request.partition('#')[0]
 952
 953         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
 954                                      headers=headers, query=query, expected_status=expected_status,
 955                                      impersonate=impersonate, require_impersonation=require_impersonation)
 956         if urlh is False:
 957             assert not fatal
 958             return False
 959         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
 960                                              encoding=encoding, data=data)
 961         return (content, urlh)
 962
 963     @staticmethod
 964     def _guess_encoding_from_content(content_type, webpage_bytes):
 965         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 966         if m:
 967             encoding = m.group(1)
 968         else:
 969             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 970                           webpage_bytes[:1024])
 971             if m:
 972                 encoding = m.group(1).decode('ascii')
 973             elif webpage_bytes.startswith(b'\xff\xfe'):
 974                 encoding = 'utf-16'
 975             else:
 976                 encoding = 'utf-8'
 977
 978         return encoding
 979
 980     def __check_blocked(self, content):
 981         first_block = content[:512]
 982         if ('<title>Access to this site is blocked</title>' in content
 983                 and 'Websense' in first_block):
 984             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 985             blocked_iframe = self._html_search_regex(
 986                 r'<iframe src="([^"]+)"', content,
 987                 'Websense information URL', default=None)
 988             if blocked_iframe:
 989                 msg += f' Visit {blocked_iframe} for more details'
 990             raise ExtractorError(msg, expected=True)
 991         if '<title>The URL you requested has been blocked</title>' in first_block:
 992             msg = (
 993                 'Access to this webpage has been blocked by Indian censorship. '
 994                 'Use a VPN or proxy server (with --proxy) to route around it.')
 995             block_msg = self._html_search_regex(
 996                 r'</h1><p>(.*?)</p>',
 997                 content, 'block message', default=None)
 998             if block_msg:
 999                 msg += ' (Message: "{}")'.format(block_msg.replace('\n', ' '))
1000             raise ExtractorError(msg, expected=True)
1001         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
1002                 and 'blocklist.rkn.gov.ru' in content):
1003             raise ExtractorError(
1004                 'Access to this webpage has been blocked by decision of the Russian government. '
1005                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
1006                 expected=True)
1007
1008     def _request_dump_filename(self, url, video_id, data=None):
1009         if data is not None:
1010             data = hashlib.md5(data).hexdigest()
1011         basen = join_nonempty(video_id, data, url, delim='_')
1012         trim_length = self.get_param('trim_file_name') or 240
1013         if len(basen) > trim_length:
1014             h = '___' + hashlib.md5(basen.encode()).hexdigest()
1015             basen = basen[:trim_length - len(h)] + h
1016         filename = sanitize_filename(f'{basen}.dump', restricted=True)
1017         # Working around MAX_PATH limitation on Windows (see
1018         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
1019         if compat_os_name == 'nt':
1020             absfilepath = os.path.abspath(filename)
1021             if len(absfilepath) > 259:
1022                 filename = fR'\\?\{absfilepath}'
1023         return filename
1024
1025     def __decode_webpage(self, webpage_bytes, encoding, headers):
1026         if not encoding:
1027             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
1028         try:
1029             return webpage_bytes.decode(encoding, 'replace')
1030         except LookupError:
1031             return webpage_bytes.decode('utf-8', 'replace')
1032
1033     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
1034                               prefix=None, encoding=None, data=None):
1035         webpage_bytes = urlh.read()
1036         if prefix is not None:
1037             webpage_bytes = prefix + webpage_bytes
1038         if self.get_param('dump_intermediate_pages', False):
1039             self.to_screen('Dumping request to ' + urlh.url)
1040             dump = base64.b64encode(webpage_bytes).decode('ascii')
1041             self._downloader.to_screen(dump)
1042         if self.get_param('write_pages'):
1043             if isinstance(url_or_request, Request):
1044                 data = self._create_request(url_or_request, data).data
1045             filename = self._request_dump_filename(urlh.url, video_id, data)
1046             self.to_screen(f'Saving request to {filename}')
1047             with open(filename, 'wb') as outf:
1048                 outf.write(webpage_bytes)
1049
1050         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
1051         self.__check_blocked(content)
1052
1053         return content
1054
1055     def __print_error(self, errnote, fatal, video_id, err):
1056         if fatal:
1057             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1058         elif errnote:
1059             self.report_warning(f'{video_id}: {errnote}: {err}')
1060
1061     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1062         if transform_source:
1063             xml_string = transform_source(xml_string)
1064         try:
1065             return compat_etree_fromstring(xml_string.encode())
1066         except xml.etree.ElementTree.ParseError as ve:
1067             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1068
1069     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1070         try:
1071             return json.loads(
1072                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1073         except ValueError as ve:
1074             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1075
1076     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1077         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1078
    def __create_download_methods(name, parser, note, errnote, return_value):
        """Build the pair of methods _download_<name>_handle and _download_<name>.

        This is a plain function called while the class body executes; the
        functions it returns are meant to be assigned as methods.

        @param name          Used to build the names of the generated methods
        @param parser        Name of the extractor method that parses the downloaded
                             content, or None to return the content as is
        @param note          Default note of the generated methods
        @param errnote       Default errnote of the generated methods
        @param return_value  Description of the result, inserted into the
                             generated docstrings
        @returns             (download_handle, download_content)
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            """Run content through the parser method of ie, if there is a parser."""
            if parser is None:
                return content
            # errnote is handed to the parser only when it is False; any other
            # value leaves the parser's own default in effect
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Downloads through _download_webpage_handle and returns
        # (parsed content, URL handle), or False if the download returned False
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                            impersonate=None, require_impersonation=False):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status,
                impersonate=impersonate, require_impersonation=require_impersonation)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Returns only the parsed content (or False), without the URL handle
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                             impersonate=None, require_impersonation=False):
            # With the 'load_pages' param set, the content is first looked for
            # in the dump file for this request
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # Not fatal: execution continues with the regular download below
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
                'impersonate': impersonate,
                'require_impersonation': require_impersonation,
            }
            # Without a parser the handle method looked up below is the plain
            # _download_webpage_handle, which has no transform_source parameter
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Gives func its final method name, qualified name and docstring
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1147
    # Download helpers generated by __create_download_methods:
    # each call yields a (_download_<name>_handle, _download_<name>) pair
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # Only the content-returning function is kept for plain webpages ([1]);
    # _download_webpage_handle is written out by hand in this class
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1155
1156     def _download_webpage(
1157             self, url_or_request, video_id, note=None, errnote=None,
1158             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1159         """
1160         Return the data of the page as a string.
1161
1162         Keyword arguments:
1163         tries -- number of tries
1164         timeout -- sleep interval between tries
1165
1166         See _download_webpage_handle docstring for other arguments specification.
1167         """
1168
1169         R''' # NB: These are unused; should they be deprecated?
1170         if tries != 1:
1171             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1172         if timeout is NO_DEFAULT:
1173             timeout = 5
1174         else:
1175             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1176         '''
1177
1178         try_count = 0
1179         while True:
1180             try:
1181                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1182             except IncompleteRead as e:
1183                 try_count += 1
1184                 if try_count >= tries:
1185                     raise e
1186                 self._sleep(timeout, video_id)
1187
1188     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1189         idstr = format_field(video_id, None, '%s: ')
1190         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1191         if only_once:
1192             if f'WARNING: {msg}' in self._printed_messages:
1193                 return
1194             self._printed_messages.add(f'WARNING: {msg}')
1195         self._downloader.report_warning(msg, *args, **kwargs)
1196
1197     def to_screen(self, msg, *args, **kwargs):
1198         """Print msg to screen, prefixing it with '[ie_name]'"""
1199         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1200
1201     def write_debug(self, msg, *args, **kwargs):
1202         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1203
1204     def get_param(self, name, default=None, *args, **kwargs):
1205         if self._downloader:
1206             return self._downloader.params.get(name, default, *args, **kwargs)
1207         return default
1208
1209     def report_drm(self, video_id, partial=NO_DEFAULT):
1210         if partial is not NO_DEFAULT:
1211             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1212         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1213
1214     def report_extraction(self, id_or_name):
1215         """Report information extraction."""
1216         self.to_screen(f'{id_or_name}: Extracting information')
1217
1218     def report_download_webpage(self, video_id):
1219         """Report webpage download."""
1220         self.to_screen(f'{video_id}: Downloading webpage')
1221
1222     def report_age_confirmation(self):
1223         """Report attempt to confirm age."""
1224         self.to_screen('Confirming age')
1225
1226     def report_login(self):
1227         """Report attempt to log in."""
1228         self.to_screen('Logging in')
1229
1230     def raise_login_required(
1231             self, msg='This video is only available for registered users',
1232             metadata_available=False, method=NO_DEFAULT):
1233         if metadata_available and (
1234                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1235             self.report_warning(msg)
1236             return
1237         msg += format_field(self._login_hint(method), None, '. %s')
1238         raise ExtractorError(msg, expected=True)
1239
1240     def raise_geo_restricted(
1241             self, msg='This video is not available from your location due to geo restriction',
1242             countries=None, metadata_available=False):
1243         if metadata_available and (
1244                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1245             self.report_warning(msg)
1246         else:
1247             raise GeoRestrictedError(msg, countries=countries)
1248
1249     def raise_no_formats(self, msg, expected=False, video_id=None):
1250         if expected and (
1251                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1252             self.report_warning(msg, video_id)
1253         elif isinstance(msg, ExtractorError):
1254             raise msg
1255         else:
1256             raise ExtractorError(msg, expected=expected, video_id=video_id)
1257
1258     # Methods for following #608
1259     @staticmethod
1260     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1261         """Returns a URL that points to a page that should be processed"""
1262         if ie is not None:
1263             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1264         if video_id is not None:
1265             kwargs['id'] = video_id
1266         if video_title is not None:
1267             kwargs['title'] = video_title
1268         return {
1269             **kwargs,
1270             '_type': 'url_transparent' if url_transparent else 'url',
1271             'url': url,
1272         }
1273
1274     @classmethod
1275     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1276                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1277         return cls.playlist_result(
1278             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1279             playlist_id, playlist_title, **kwargs)
1280
1281     @staticmethod
1282     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1283         """Returns a playlist"""
1284         if playlist_id:
1285             kwargs['id'] = playlist_id
1286         if playlist_title:
1287             kwargs['title'] = playlist_title
1288         if playlist_description is not None:
1289             kwargs['description'] = playlist_description
1290         return {
1291             **kwargs,
1292             '_type': 'multi_video' if multi_video else 'playlist',
1293             'entries': entries,
1294         }
1295
1296     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1297         """
1298         Perform a regex search on the given string, using a single or a list of
1299         patterns returning the first matching group.
1300         In case of failure return a default value or raise a WARNING or a
1301         RegexNotFoundError, depending on fatal, specifying the field name.
1302         """
1303         if string is None:
1304             mobj = None
1305         elif isinstance(pattern, (str, re.Pattern)):
1306             mobj = re.search(pattern, string, flags)
1307         else:
1308             for p in pattern:
1309                 mobj = re.search(p, string, flags)
1310                 if mobj:
1311                     break
1312
1313         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1314
1315         if mobj:
1316             if group is None:
1317                 # return the first matching group
1318                 return next(g for g in mobj.groups() if g is not None)
1319             elif isinstance(group, (list, tuple)):
1320                 return tuple(mobj.group(g) for g in group)
1321             else:
1322                 return mobj.group(group)
1323         elif default is not NO_DEFAULT:
1324             return default
1325         elif fatal:
1326             raise RegexNotFoundError(f'Unable to extract {_name}')
1327         else:
1328             self.report_warning(f'unable to extract {_name}' + bug_reports_message())
1329             return None
1330
1331     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1332                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1333         """Searches string for the JSON object specified by start_pattern"""
1334         # NB: end_pattern is only used to reduce the size of the initial match
1335         if default is NO_DEFAULT:
1336             default, has_default = {}, False
1337         else:
1338             fatal, has_default = False, True
1339
1340         json_string = self._search_regex(
1341             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1342             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1343         if not json_string:
1344             return default
1345
1346         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1347         try:
1348             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1349         except ExtractorError as e:
1350             if fatal:
1351                 raise ExtractorError(
1352                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1353             elif not has_default:
1354                 self.report_warning(
1355                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1356         return default
1357
1358     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1359         """
1360         Like _search_regex, but strips HTML tags and unescapes entities.
1361         """
1362         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1363         if isinstance(res, tuple):
1364             return tuple(map(clean_html, res))
1365         return clean_html(res)
1366
1367     def _get_netrc_login_info(self, netrc_machine=None):
1368         netrc_machine = netrc_machine or self._NETRC_MACHINE
1369
1370         cmd = self.get_param('netrc_cmd')
1371         if cmd:
1372             cmd = cmd.replace('{}', netrc_machine)
1373             self.to_screen(f'Executing command: {cmd}')
1374             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1375             if ret != 0:
1376                 raise OSError(f'Command returned error code {ret}')
1377             info = netrc_from_content(stdout).authenticators(netrc_machine)
1378
1379         elif self.get_param('usenetrc', False):
1380             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1381             if os.path.isdir(netrc_file):
1382                 netrc_file = os.path.join(netrc_file, '.netrc')
1383             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1384
1385         else:
1386             return None, None
1387         if not info:
1388             self.to_screen(f'No authenticators for {netrc_machine}')
1389             return None, None
1390
1391         self.write_debug(f'Using netrc for {netrc_machine} authentication')
1392         return info[0], info[2]
1393
1394     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1395         """
1396         Get the login info as (username, password)
1397         First look for the manually specified credentials using username_option
1398         and password_option as keys in params dictionary. If no such credentials
1399         are available try the netrc_cmd if it is defined or look in the
1400         netrc file using the netrc_machine or _NETRC_MACHINE value.
1401         If there's no info available, return (None, None)
1402         """
1403
1404         username = self.get_param(username_option)
1405         if username is not None:
1406             password = self.get_param(password_option)
1407         else:
1408             try:
1409                 username, password = self._get_netrc_login_info(netrc_machine)
1410             except (OSError, netrc.NetrcParseError) as err:
1411                 self.report_warning(f'Failed to parse .netrc: {err}')
1412                 return None, None
1413         return username, password
1414
1415     def _get_tfa_info(self, note='two-factor verification code'):
1416         """
1417         Get the two-factor authentication info
1418         TODO - asking the user will be required for sms/phone verify
1419         currently just uses the command line option
1420         If there's no info available, return None
1421         """
1422
1423         tfa = self.get_param('twofactor')
1424         if tfa is not None:
1425             return tfa
1426
1427         return getpass.getpass(f'Type {note} and press [Return]: ')
1428
1429     # Helper functions for extracting OpenGraph info
1430     @staticmethod
1431     def _og_regexes(prop):
1432         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1433         property_re = r'(?:name|property)=(?:\'og{sep}{prop}\'|"og{sep}{prop}"|\s*og{sep}{prop}\b)'.format(
1434             prop=re.escape(prop), sep='(?:&#x3A;|[:-])')
1435         template = r'<meta[^>]+?%s[^>]+?%s'
1436         return [
1437             template % (property_re, content_re),
1438             template % (content_re, property_re),
1439         ]
1440
1441     @staticmethod
1442     def _meta_regex(prop):
1443         return rf'''(?isx)<meta
1444                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?){re.escape(prop)}\1)
1445                     [^>]+?content=(["\'])(?P<content>.*?)\2'''
1446
1447     def _og_search_property(self, prop, html, name=None, **kargs):
1448         prop = variadic(prop)
1449         if name is None:
1450             name = f'OpenGraph {prop[0]}'
1451         og_regexes = []
1452         for p in prop:
1453             og_regexes.extend(self._og_regexes(p))
1454         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1455         if escaped is None:
1456             return None
1457         return unescapeHTML(escaped)
1458
1459     def _og_search_thumbnail(self, html, **kargs):
1460         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1461
1462     def _og_search_description(self, html, **kargs):
1463         return self._og_search_property('description', html, fatal=False, **kargs)
1464
1465     def _og_search_title(self, html, *, fatal=False, **kargs):
1466         return self._og_search_property('title', html, fatal=fatal, **kargs)
1467
1468     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1469         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1470         if secure:
1471             regexes = self._og_regexes('video:secure_url') + regexes
1472         return self._html_search_regex(regexes, html, name, **kargs)
1473
1474     def _og_search_url(self, html, **kargs):
1475         return self._og_search_property('url', html, **kargs)
1476
1477     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1478         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1479
1480     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1481         name = variadic(name)
1482         if display_name is None:
1483             display_name = name[0]
1484         return self._html_search_regex(
1485             [self._meta_regex(n) for n in name],
1486             html, display_name, fatal=fatal, group='content', **kwargs)
1487
1488     def _dc_search_uploader(self, html):
1489         return self._html_search_meta('dc.creator', html, 'uploader')
1490
1491     @staticmethod
1492     def _rta_search(html):
1493         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1494         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1495                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1496                      html):
1497             return 18
1498
1499         # And then there are the jokers who advertise that they use RTA, but actually don't.
1500         AGE_LIMIT_MARKERS = [
1501             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1502             r'>[^<]*you acknowledge you are at least (\d+) years old',
1503             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1504         ]
1505
1506         age_limit = 0
1507         for marker in AGE_LIMIT_MARKERS:
1508             mobj = re.search(marker, html)
1509             if mobj:
1510                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1511         return age_limit
1512
1513     def _media_rating_search(self, html):
1514         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1515         rating = self._html_search_meta('rating', html)
1516
1517         if not rating:
1518             return None
1519
1520         RATING_TABLE = {
1521             'safe for kids': 0,
1522             'general': 8,
1523             '14 years': 14,
1524             'mature': 17,
1525             'restricted': 19,
1526         }
1527         return RATING_TABLE.get(rating.lower())
1528
1529     def _family_friendly_search(self, html):
1530         # See http://schema.org/VideoObject
1531         family_friendly = self._html_search_meta(
1532             'isFamilyFriendly', html, default=None)
1533
1534         if not family_friendly:
1535             return None
1536
1537         RATING_TABLE = {
1538             '1': 0,
1539             'true': 0,
1540             '0': 18,
1541             'false': 18,
1542         }
1543         return RATING_TABLE.get(family_friendly.lower())
1544
1545     def _twitter_search_player(self, html):
1546         return self._html_search_meta('twitter:player', html,
1547                                       'twitter card player')
1548
1549     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1550         """Yield all json ld objects in the html"""
1551         if default is not NO_DEFAULT:
1552             fatal = False
1553         for mobj in re.finditer(JSON_LD_RE, html):
1554             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1555             for json_ld in variadic(json_ld_item):
1556                 if isinstance(json_ld, dict):
1557                     yield json_ld
1558
1559     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1560         """Search for a video in any json ld in the html"""
1561         if default is not NO_DEFAULT:
1562             fatal = False
1563         info = self._json_ld(
1564             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1565             video_id, fatal=fatal, expected_type=expected_type)
1566         if info:
1567             return info
1568         if default is not NO_DEFAULT:
1569             return default
1570         elif fatal:
1571             raise RegexNotFoundError('Unable to extract JSON-LD')
1572         else:
1573             self.report_warning(f'unable to extract JSON-LD {bug_reports_message()}')
1574             return {}
1575
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        @param json_ld          JSON-LD as a string (parsed with _parse_json) or as already parsed data.
                                Only dict entries are considered
        @param video_id         Video id, passed on to _parse_json
        @param fatal            Passed on to _parse_json when json_ld is a string
        @param expected_type    If given, entries whose '@type' does not match are skipped
                                and the traversal stops after the first matching entry
        @returns                The collected info, passed through filter_dict
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared result dict; the nested helpers below all write into it
        info = {}

        # Last path component of the interaction type -> prefix of the '<kind>_count' info field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # '@type' may hold a single value or several; variadic() normalizes that
            type_ = variadic(traverse_obj(e, '@type'))
            return any(x in type_ for x in expected_types)

        def extract_interaction_type(e):
            # 'interactionType' is either the type itself or a dict carrying it as '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill the '<kind>_count' fields of info from the 'interactionStatistic' counters
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' of the type is looked up
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = f'{count_kind}_count'
                # The first value found for a count wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Chapters are the 'hasPart' entries of type 'Clip'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Iterate (previous, current, next) triples to fill in missing start/end times from
            # the neighbouring chapters. The last chapter has no successor and is handled below
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}, *chapters], chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # Relies on info['duration'] having been set by extract_video_object beforehand
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copy the media fields of a VideoObject/AudioObject entry into info
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                # Audio only: mark that there is no video and report the bitrate as audio bitrate too
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            # Both of these write into info; the chapters need the duration set above
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            # Walk the entries, dispatching on '@type'
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level entries must carry an '@context'
                if at_top_level and '@context' not in e:
                    continue
                # A pure '@graph' container: process its members instead (they need no '@context')
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title when no title was found yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first element of 'video' (or else of 'subjectOf') is considered here
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type keep going through the remaining entries,
                    # otherwise stop at this first matching entry
                    if expected_type is None:
                        continue
                    else:
                        break
                # Entries of any type other than Video/AudioObject may embed a VideoObject as 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1745
1746     def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
1747         if default == '{}':
1748             self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
1749             default = {}
1750         if default is not NO_DEFAULT:
1751             fatal = False
1752
1753         return self._search_json(
1754             r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
1755             video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
1756
1757     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1758         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1759         rectx = re.escape(context_name)
1760         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1761         js, arg_keys, arg_vals = self._search_regex(
1762             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1763             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1764             default=NO_DEFAULT if fatal else (None, None, None))
1765         if js is None:
1766             return {}
1767
1768         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1769             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1770
1771         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1772         return traverse_obj(ret, traverse) or {}
1773
1774     @staticmethod
1775     def _hidden_inputs(html):
1776         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1777         hidden_inputs = {}
1778         for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
1779             attrs = extract_attributes(input_el)
1780             if not input_el:
1781                 continue
1782             if attrs.get('type') not in ('hidden', 'submit'):
1783                 continue
1784             name = attrs.get('name') or attrs.get('id')
1785             value = attrs.get('value')
1786             if name and value is not None:
1787                 hidden_inputs[name] = value
1788         return hidden_inputs
1789
1790     def _form_hidden_inputs(self, form_id, html):
1791         form = self._search_regex(
1792             rf'(?is)<form[^>]+?id=(["\']){form_id}\1[^>]*>(?P<form>.+?)</form>',
1793             html, f'{form_id} form', group='form')
1794         return self._hidden_inputs(form)
1795
    @classproperty(cache=True)
    def FormatSort(cls):
        # Deprecated accessor: builds a FormatSorter subclass, emits a deprecation warning and returns the class.
        # NOTE(review): cache=True presumably memoizes the result so this runs once per class - confirm in classproperty
        class FormatSort(FormatSorter):
            # `ie` plays the role of `self` here: it is the instance being initialized,
            # and its `_downloader` attribute is passed on as the first FormatSorter argument
            def __init__(ie, *args, **kwargs):
                super().__init__(ie._downloader, *args, **kwargs)

        deprecation_warning(
            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
            'Use yt_dlp.utils.FormatSorter instead')
        return FormatSort
1806
1807     def _sort_formats(self, formats, field_preference=[]):
1808         if not field_preference:
1809             self._downloader.deprecation_warning(
1810                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1811             return
1812         self._downloader.deprecation_warning(
1813             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1814             'Return _format_sort_fields in the info_dict instead')
1815         if formats:
1816             formats[0]['__sort_fields'] = field_preference
1817
1818     def _check_formats(self, formats, video_id):
1819         if formats:
1820             formats[:] = filter(
1821                 lambda f: self._is_valid_url(
1822                     f['url'], video_id,
1823                     item='{} video format'.format(f.get('format_id')) if f.get('format_id') else 'video'),
1824                 formats)
1825
1826     @staticmethod
1827     def _remove_duplicate_formats(formats):
1828         format_urls = set()
1829         unique_formats = []
1830         for f in formats:
1831             if f['url'] not in format_urls:
1832                 format_urls.add(f['url'])
1833                 unique_formats.append(f)
1834         formats[:] = unique_formats
1835
1836     def _is_valid_url(self, url, video_id, item='video', headers={}):
1837         url = self._proto_relative_url(url, scheme='http:')
1838         # For now assume non HTTP(S) URLs always valid
1839         if not url.startswith(('http://', 'https://')):
1840             return True
1841         try:
1842             self._request_webpage(url, video_id, f'Checking {item} URL', headers=headers)
1843             return True
1844         except ExtractorError as e:
1845             self.to_screen(
1846                 f'{video_id}: {item} URL is invalid, skipping: {e.cause!s}')
1847             return False
1848
1849     def http_scheme(self):
1850         """ Either "http:" or "https:", depending on the user's preferences """
1851         return (
1852             'http:'
1853             if self.get_param('prefer_insecure', False)
1854             else 'https:')
1855
1856     def _proto_relative_url(self, url, scheme=None):
1857         scheme = scheme or self.http_scheme()
1858         assert scheme.endswith(':')
1859         return sanitize_url(url, scheme=scheme[:-1])
1860
1861     def _sleep(self, timeout, video_id, msg_template=None):
1862         if msg_template is None:
1863             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1864         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1865         self.to_screen(msg)
1866         time.sleep(timeout)
1867
1868     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1869                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1870                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1871         if self.get_param('ignore_no_formats_error'):
1872             fatal = False
1873
1874         res = self._download_xml_handle(
1875             manifest_url, video_id, 'Downloading f4m manifest',
1876             'Unable to download f4m manifest',
1877             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1878             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1879             transform_source=transform_source,
1880             fatal=fatal, data=data, headers=headers, query=query)
1881         if res is False:
1882             return []
1883
1884         manifest, urlh = res
1885         manifest_url = urlh.url
1886
1887         return self._parse_f4m_formats(
1888             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1889             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1890
1891     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1892                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1893                            fatal=True, m3u8_id=None):
1894         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1895             return []
1896
1897         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1898         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1899         if akamai_pv is not None and ';' in akamai_pv.text:
1900             player_verification_challenge = akamai_pv.text.split(';')[0]
1901             if player_verification_challenge.strip() != '':
1902                 return []
1903
1904         formats = []
1905         manifest_version = '1.0'
1906         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1907         if not media_nodes:
1908             manifest_version = '2.0'
1909             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1910         # Remove unsupported DRM protected media from final formats
1911         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1912         media_nodes = remove_encrypted_media(media_nodes)
1913         if not media_nodes:
1914             return formats
1915
1916         manifest_base_url = get_base_url(manifest)
1917
1918         bootstrap_info = xpath_element(
1919             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1920             'bootstrap info', default=None)
1921
1922         vcodec = None
1923         mime_type = xpath_text(
1924             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1925             'base URL', default=None)
1926         if mime_type and mime_type.startswith('audio/'):
1927             vcodec = 'none'
1928
1929         for i, media_el in enumerate(media_nodes):
1930             tbr = int_or_none(media_el.attrib.get('bitrate'))
1931             width = int_or_none(media_el.attrib.get('width'))
1932             height = int_or_none(media_el.attrib.get('height'))
1933             format_id = join_nonempty(f4m_id, tbr or i)
1934             # If <bootstrapInfo> is present, the specified f4m is a
1935             # stream-level manifest, and only set-level manifests may refer to
1936             # external resources.  See section 11.4 and section 4 of F4M spec
1937             if bootstrap_info is None:
1938                 media_url = None
1939                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1940                 if manifest_version == '2.0':
1941                     media_url = media_el.attrib.get('href')
1942                 if media_url is None:
1943                     media_url = media_el.attrib.get('url')
1944                 if not media_url:
1945                     continue
1946                 manifest_url = (
1947                     media_url if media_url.startswith(('http://', 'https://'))
1948                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1949                 # If media_url is itself a f4m manifest do the recursive extraction
1950                 # since bitrates in parent manifest (this one) and media_url manifest
1951                 # may differ leading to inability to resolve the format by requested
1952                 # bitrate in f4m downloader
1953                 ext = determine_ext(manifest_url)
1954                 if ext == 'f4m':
1955                     f4m_formats = self._extract_f4m_formats(
1956                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1957                         transform_source=transform_source, fatal=fatal)
1958                     # Sometimes stream-level manifest contains single media entry that
1959                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1960                     # At the same time parent's media entry in set-level manifest may
1961                     # contain it. We will copy it from parent in such cases.
1962                     if len(f4m_formats) == 1:
1963                         f = f4m_formats[0]
1964                         f.update({
1965                             'tbr': f.get('tbr') or tbr,
1966                             'width': f.get('width') or width,
1967                             'height': f.get('height') or height,
1968                             'format_id': f.get('format_id') if not tbr else format_id,
1969                             'vcodec': vcodec,
1970                         })
1971                     formats.extend(f4m_formats)
1972                     continue
1973                 elif ext == 'm3u8':
1974                     formats.extend(self._extract_m3u8_formats(
1975                         manifest_url, video_id, 'mp4', preference=preference,
1976                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1977                     continue
1978             formats.append({
1979                 'format_id': format_id,
1980                 'url': manifest_url,
1981                 'manifest_url': manifest_url,
1982                 'ext': 'flv' if bootstrap_info is not None else None,
1983                 'protocol': 'f4m',
1984                 'tbr': tbr,
1985                 'width': width,
1986                 'height': height,
1987                 'vcodec': vcodec,
1988                 'preference': preference,
1989                 'quality': quality,
1990             })
1991         return formats
1992
1993     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1994         return {
1995             'format_id': join_nonempty(m3u8_id, 'meta'),
1996             'url': m3u8_url,
1997             'ext': ext,
1998             'protocol': 'm3u8',
1999             'preference': preference - 100 if preference else -100,
2000             'quality': quality,
2001             'resolution': 'multiple',
2002             'format_note': 'Quality selection URL',
2003         }
2004
2005     def _report_ignoring_subs(self, name):
2006         self.report_warning(bug_reports_message(
2007             f'Ignoring subtitle tracks found in the {name} manifest; '
2008             'if any subtitle tracks are missing,',
2009         ), only_once=True)
2010
2011     def _extract_m3u8_formats(self, *args, **kwargs):
2012         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2013         if subs:
2014             self._report_ignoring_subs('HLS')
2015         return fmts
2016
2017     def _extract_m3u8_formats_and_subtitles(
2018             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2019             preference=None, quality=None, m3u8_id=None, note=None,
2020             errnote=None, fatal=True, live=False, data=None, headers={},
2021             query={}):
2022
2023         if self.get_param('ignore_no_formats_error'):
2024             fatal = False
2025
2026         if not m3u8_url:
2027             if errnote is not False:
2028                 errnote = errnote or 'Failed to obtain m3u8 URL'
2029                 if fatal:
2030                     raise ExtractorError(errnote, video_id=video_id)
2031                 self.report_warning(f'{errnote}{bug_reports_message()}')
2032             return [], {}
2033
2034         res = self._download_webpage_handle(
2035             m3u8_url, video_id,
2036             note='Downloading m3u8 information' if note is None else note,
2037             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2038             fatal=fatal, data=data, headers=headers, query=query)
2039
2040         if res is False:
2041             return [], {}
2042
2043         m3u8_doc, urlh = res
2044         m3u8_url = urlh.url
2045
2046         return self._parse_m3u8_formats_and_subtitles(
2047             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2048             preference=preference, quality=quality, m3u8_id=m3u8_id,
2049             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2050             headers=headers, query=query, video_id=video_id)
2051
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse the text of an m3u8 playlist into (formats, subtitles).

        m3u8_doc:       playlist text
        m3u8_url:       URL of the playlist; relative URIs are joined against
                        it and it is stored as 'manifest_url' of the formats.
                        For a media playlist without URL, the document itself
                        is used as the format URL, encoded as a data URI
        ext:            extension assigned to the formats; when empty, variant
                        streams get 'm4a' (vcodec 'none') or 'mp4'
        entry_protocol: value of the 'protocol' field of the formats
        preference, quality: copied into every format
        m3u8_id:        prefix of the generated format ids
        live:           when true, neither stream name nor bitrate is put
                        into the format ids of variant streams
        fatal, data, headers, video_id: only used for the additional playlist
                        downloads made when the 'hls_split_discontinuity'
                        param is set
        note, errnote, query: accepted but not used in this method

        Returns a tuple of the list of format dicts and a dict mapping
        language codes to lists of subtitle dicts.
        """
        formats, subtitles = [], {}
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            # Absolute HTTP(S) URLs are kept, anything else is joined to the playlist URL
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        # Pick how many formats each playlist produces: with the
        # 'hls_split_discontinuity' param there is one index per section
        # delimited by #EXT-X-DISCONTINUITY lines, otherwise a single None index
        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # Rendition groups keyed by GROUP-ID, filled from the EXT-X-MEDIA tags
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset after the
        # URI line that follows it has been processed
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Registers the rendition in `groups` and adds a subtitle entry
            # and/or formats for it when it carries a URI
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        # Second pass: every non-comment, non-blank line is a variant stream
        # URI described by the EXT-X-STREAM-INF attributes seen before it
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    # A PROGRESSIVE-URI attribute yields an extra plain HTTP
                    # copy of the format without a manifest URL
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2270
2271     def _extract_m3u8_vod_duration(
2272             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2273
2274         m3u8_vod = self._download_webpage(
2275             m3u8_vod_url, video_id,
2276             note='Downloading m3u8 VOD manifest' if note is None else note,
2277             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2278             fatal=False, data=data, headers=headers, query=query)
2279
2280         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2281
2282     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2283         if '#EXT-X-ENDLIST' not in m3u8_vod:
2284             return None
2285
2286         return int(sum(
2287             float(line[len('#EXTINF:'):].split(',')[0])
2288             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2289
2290     def _extract_mpd_vod_duration(
2291             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2292
2293         mpd_doc = self._download_xml(
2294             mpd_url, video_id,
2295             note='Downloading MPD VOD manifest' if note is None else note,
2296             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2297             fatal=False, data=data, headers=headers, query=query)
2298         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2299             return None
2300         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2301
2302     @staticmethod
2303     def _xpath_ns(path, namespace=None):
2304         if not namespace:
2305             return path
2306         out = []
2307         for c in path.split('/'):
2308             if not c or c == '.':
2309                 out.append(c)
2310             else:
2311                 out.append(f'{{{namespace}}}{c}')
2312         return '/'.join(out)
2313
2314     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2315         if self.get_param('ignore_no_formats_error'):
2316             fatal = False
2317
2318         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2319         if res is False:
2320             assert not fatal
2321             return [], {}
2322         smil, urlh = res
2323
2324         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2325                                                       namespace=self._parse_smil_namespace(smil))
2326
2327     def _extract_smil_formats(self, *args, **kwargs):
2328         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2329         if subs:
2330             self._report_ignoring_subs('SMIL')
2331         return fmts
2332
2333     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2334         res = self._download_smil(smil_url, video_id, fatal=fatal)
2335         if res is False:
2336             return {}
2337
2338         smil, urlh = res
2339         smil_url = urlh.url
2340
2341         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2342
2343     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2344         return self._download_xml_handle(
2345             smil_url, video_id, 'Downloading SMIL file',
2346             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2347
2348     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2349         namespace = self._parse_smil_namespace(smil)
2350
2351         formats, subtitles = self._parse_smil_formats_and_subtitles(
2352             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2353
2354         video_id = os.path.splitext(url_basename(smil_url))[0]
2355         title = None
2356         description = None
2357         upload_date = None
2358         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2359             name = meta.attrib.get('name')
2360             content = meta.attrib.get('content')
2361             if not name or not content:
2362                 continue
2363             if not title and name == 'title':
2364                 title = content
2365             elif not description and name in ('description', 'abstract'):
2366                 description = content
2367             elif not upload_date and name == 'date':
2368                 upload_date = unified_strdate(content)
2369
2370         thumbnails = [{
2371             'id': image.get('type'),
2372             'url': image.get('src'),
2373             'width': int_or_none(image.get('width')),
2374             'height': int_or_none(image.get('height')),
2375         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2376
2377         return {
2378             'id': video_id,
2379             'title': title or video_id,
2380             'description': description,
2381             'upload_date': upload_date,
2382             'thumbnails': thumbnails,
2383             'formats': formats,
2384             'subtitles': subtitles,
2385         }
2386
2387     def _parse_smil_namespace(self, smil):
2388         return self._search_regex(
2389             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2390
2391     def _parse_smil_formats(self, *args, **kwargs):
2392         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2393         if subs:
2394             self._report_ignoring_subs('SMIL')
2395         return fmts
2396
2397     def _parse_smil_formats_and_subtitles(
2398             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2399         base = smil_url
2400         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2401             b = meta.get('base') or meta.get('httpBase')
2402             if b:
2403                 base = b
2404                 break
2405
2406         formats, subtitles = [], {}
2407         rtmp_count = 0
2408         http_count = 0
2409         m3u8_count = 0
2410         imgs_count = 0
2411
2412         srcs = set()
2413         media = itertools.chain.from_iterable(
2414             smil.findall(self._xpath_ns(arg, namespace))
2415             for arg in ['.//video', './/audio', './/media'])
2416         for medium in media:
2417             src = medium.get('src')
2418             if not src or src in srcs:
2419                 continue
2420             srcs.add(src)
2421
2422             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2423             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2424             width = int_or_none(medium.get('width'))
2425             height = int_or_none(medium.get('height'))
2426             proto = medium.get('proto')
2427             ext = medium.get('ext')
2428             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2429                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2430             streamer = medium.get('streamer') or base
2431
2432             if proto == 'rtmp' or streamer.startswith('rtmp'):
2433                 rtmp_count += 1
2434                 formats.append({
2435                     'url': streamer,
2436                     'play_path': src,
2437                     'ext': 'flv',
2438                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2439                     'tbr': bitrate,
2440                     'filesize': filesize,
2441                     'width': width,
2442                     'height': height,
2443                 })
2444                 if transform_rtmp_url:
2445                     streamer, src = transform_rtmp_url(streamer, src)
2446                     formats[-1].update({
2447                         'url': streamer,
2448                         'play_path': src,
2449                     })
2450                 continue
2451
2452             src_url = src if src.startswith('http') else urllib.parse.urljoin(f'{base}/', src)
2453             src_url = src_url.strip()
2454
2455             if proto == 'm3u8' or src_ext == 'm3u8':
2456                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2457                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2458                 self._merge_subtitles(m3u8_subs, target=subtitles)
2459                 if len(m3u8_formats) == 1:
2460                     m3u8_count += 1
2461                     m3u8_formats[0].update({
2462                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2463                         'tbr': bitrate,
2464                         'width': width,
2465                         'height': height,
2466                     })
2467                 formats.extend(m3u8_formats)
2468             elif src_ext == 'f4m':
2469                 f4m_url = src_url
2470                 if not f4m_params:
2471                     f4m_params = {
2472                         'hdcore': '3.2.0',
2473                         'plugin': 'flowplayer-3.2.0.1',
2474                     }
2475                 f4m_url += '&' if '?' in f4m_url else '?'
2476                 f4m_url += urllib.parse.urlencode(f4m_params)
2477                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2478             elif src_ext == 'mpd':
2479                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2480                     src_url, video_id, mpd_id='dash', fatal=False)
2481                 formats.extend(mpd_formats)
2482                 self._merge_subtitles(mpd_subs, target=subtitles)
2483             elif re.search(r'\.ism/[Mm]anifest', src_url):
2484                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2485                     src_url, video_id, ism_id='mss', fatal=False)
2486                 formats.extend(ism_formats)
2487                 self._merge_subtitles(ism_subs, target=subtitles)
2488             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2489                 http_count += 1
2490                 formats.append({
2491                     'url': src_url,
2492                     'ext': ext or src_ext or 'flv',
2493                     'format_id': 'http-%d' % (bitrate or http_count),
2494                     'tbr': bitrate,
2495                     'filesize': filesize,
2496                     'width': width,
2497                     'height': height,
2498                 })
2499
2500         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2501             src = medium.get('src')
2502             if not src or src in srcs:
2503                 continue
2504             srcs.add(src)
2505
2506             imgs_count += 1
2507             formats.append({
2508                 'format_id': f'imagestream-{imgs_count}',
2509                 'url': src,
2510                 'ext': mimetype2ext(medium.get('type')),
2511                 'acodec': 'none',
2512                 'vcodec': 'none',
2513                 'width': int_or_none(medium.get('width')),
2514                 'height': int_or_none(medium.get('height')),
2515                 'format_note': 'SMIL storyboards',
2516             })
2517
2518         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2519         self._merge_subtitles(smil_subs, target=subtitles)
2520
2521         return formats, subtitles
2522
2523     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2524         urls = []
2525         subtitles = {}
2526         for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
2527             src = textstream.get('src')
2528             if not src or src in urls:
2529                 continue
2530             urls.append(src)
2531             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2532             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2533             subtitles.setdefault(lang, []).append({
2534                 'url': src,
2535                 'ext': ext,
2536             })
2537         return subtitles
2538
2539     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2540         res = self._download_xml_handle(
2541             xspf_url, playlist_id, 'Downloading xpsf playlist',
2542             'Unable to download xspf manifest', fatal=fatal)
2543         if res is False:
2544             return []
2545
2546         xspf, urlh = res
2547         xspf_url = urlh.url
2548
2549         return self._parse_xspf(
2550             xspf, playlist_id, xspf_url=xspf_url,
2551             xspf_base_url=base_url(xspf_url))
2552
2553     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2554         NS_MAP = {
2555             'xspf': 'http://xspf.org/ns/0/',
2556             's1': 'http://static.streamone.nl/player/ns/0',
2557         }
2558
2559         entries = []
2560         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2561             title = xpath_text(
2562                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2563             description = xpath_text(
2564                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2565             thumbnail = xpath_text(
2566                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2567             duration = float_or_none(
2568                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2569
2570             formats = []
2571             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2572                 format_url = urljoin(xspf_base_url, location.text)
2573                 if not format_url:
2574                     continue
2575                 formats.append({
2576                     'url': format_url,
2577                     'manifest_url': xspf_url,
2578                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2579                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2580                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2581                 })
2582
2583             entries.append({
2584                 'id': playlist_id,
2585                 'title': title,
2586                 'description': description,
2587                 'thumbnail': thumbnail,
2588                 'duration': duration,
2589                 'formats': formats,
2590             })
2591         return entries
2592
2593     def _extract_mpd_formats(self, *args, **kwargs):
2594         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2595         if subs:
2596             self._report_ignoring_subs('DASH')
2597         return fmts
2598
2599     def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2600         periods = self._extract_mpd_periods(*args, **kwargs)
2601         return self._merge_mpd_periods(periods)
2602
2603     def _extract_mpd_periods(
2604             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2605             fatal=True, data=None, headers={}, query={}):
2606
2607         if self.get_param('ignore_no_formats_error'):
2608             fatal = False
2609
2610         res = self._download_xml_handle(
2611             mpd_url, video_id,
2612             note='Downloading MPD manifest' if note is None else note,
2613             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2614             fatal=fatal, data=data, headers=headers, query=query)
2615         if res is False:
2616             return []
2617         mpd_doc, urlh = res
2618         if mpd_doc is None:
2619             return []
2620
2621         # We could have been redirected to a new url when we retrieved our mpd file.
2622         mpd_url = urlh.url
2623         mpd_base_url = base_url(mpd_url)
2624
2625         return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2626
2627     def _parse_mpd_formats(self, *args, **kwargs):
2628         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2629         if subs:
2630             self._report_ignoring_subs('DASH')
2631         return fmts
2632
2633     def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2634         periods = self._parse_mpd_periods(*args, **kwargs)
2635         return self._merge_mpd_periods(periods)
2636
2637     def _merge_mpd_periods(self, periods):
2638         """
2639         Combine all formats and subtitles from an MPD manifest into a single list,
2640         by concatenate streams with similar formats.
2641         """
2642         formats, subtitles = {}, {}
2643         for period in periods:
2644             for f in period['formats']:
2645                 assert 'is_dash_periods' not in f, 'format already processed'
2646                 f['is_dash_periods'] = True
2647                 format_key = tuple(v for k, v in f.items() if k not in (
2648                     ('format_id', 'fragments', 'manifest_stream_number')))
2649                 if format_key not in formats:
2650                     formats[format_key] = f
2651                 elif 'fragments' in f:
2652                     formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2653
2654             if subtitles and period['subtitles']:
2655                 self.report_warning(bug_reports_message(
2656                     'Found subtitles in multiple periods in the DASH manifest; '
2657                     'if part of the subtitles are missing,',
2658                 ), only_once=True)
2659
2660             for sub_lang, sub_info in period['subtitles'].items():
2661                 subtitles.setdefault(sub_lang, []).extend(sub_info)
2662
2663         return list(formats.values()), subtitles
2664
2665     def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2666         """
2667         Parse formats from MPD manifest.
2668         References:
2669          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2670             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2671          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2672         """
2673         if not self.get_param('dynamic_mpd', True):
2674             if mpd_doc.get('type') == 'dynamic':
2675                 return [], {}
2676
2677         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2678
2679         def _add_ns(path):
2680             return self._xpath_ns(path, namespace)
2681
2682         def is_drm_protected(element):
2683             return element.find(_add_ns('ContentProtection')) is not None
2684
2685         def extract_multisegment_info(element, ms_parent_info):
2686             ms_info = ms_parent_info.copy()
2687
2688             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2689             # common attributes and elements.  We will only extract relevant
2690             # for us.
2691             def extract_common(source):
2692                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2693                 if segment_timeline is not None:
2694                     s_e = segment_timeline.findall(_add_ns('S'))
2695                     if s_e:
2696                         ms_info['total_number'] = 0
2697                         ms_info['s'] = []
2698                         for s in s_e:
2699                             r = int(s.get('r', 0))
2700                             ms_info['total_number'] += 1 + r
2701                             ms_info['s'].append({
2702                                 't': int(s.get('t', 0)),
2703                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2704                                 'd': int(s.attrib['d']),
2705                                 'r': r,
2706                             })
2707                 start_number = source.get('startNumber')
2708                 if start_number:
2709                     ms_info['start_number'] = int(start_number)
2710                 timescale = source.get('timescale')
2711                 if timescale:
2712                     ms_info['timescale'] = int(timescale)
2713                 segment_duration = source.get('duration')
2714                 if segment_duration:
2715                     ms_info['segment_duration'] = float(segment_duration)
2716
2717             def extract_Initialization(source):
2718                 initialization = source.find(_add_ns('Initialization'))
2719                 if initialization is not None:
2720                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2721
2722             segment_list = element.find(_add_ns('SegmentList'))
2723             if segment_list is not None:
2724                 extract_common(segment_list)
2725                 extract_Initialization(segment_list)
2726                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2727                 if segment_urls_e:
2728                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2729             else:
2730                 segment_template = element.find(_add_ns('SegmentTemplate'))
2731                 if segment_template is not None:
2732                     extract_common(segment_template)
2733                     media = segment_template.get('media')
2734                     if media:
2735                         ms_info['media'] = media
2736                     initialization = segment_template.get('initialization')
2737                     if initialization:
2738                         ms_info['initialization'] = initialization
2739                     else:
2740                         extract_Initialization(segment_template)
2741             return ms_info
2742
2743         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2744         stream_numbers = collections.defaultdict(int)
2745         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2746             period_entry = {
2747                 'id': period.get('id', f'period-{period_idx}'),
2748                 'formats': [],
2749                 'subtitles': collections.defaultdict(list),
2750             }
2751             period_duration = parse_duration(period.get('duration')) or mpd_duration
2752             period_ms_info = extract_multisegment_info(period, {
2753                 'start_number': 1,
2754                 'timescale': 1,
2755             })
2756             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2757                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2758                 for representation in adaptation_set.findall(_add_ns('Representation')):
2759                     representation_attrib = adaptation_set.attrib.copy()
2760                     representation_attrib.update(representation.attrib)
2761                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2762                     mime_type = representation_attrib['mimeType']
2763                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2764
2765                     codec_str = representation_attrib.get('codecs', '')
2766                     # Some kind of binary subtitle found in some youtube livestreams
2767                     if mime_type == 'application/x-rawcc':
2768                         codecs = {'scodec': codec_str}
2769                     else:
2770                         codecs = parse_codecs(codec_str)
2771                     if content_type not in ('video', 'audio', 'text'):
2772                         if mime_type == 'image/jpeg':
2773                             content_type = mime_type
2774                         elif codecs.get('vcodec', 'none') != 'none':
2775                             content_type = 'video'
2776                         elif codecs.get('acodec', 'none') != 'none':
2777                             content_type = 'audio'
2778                         elif codecs.get('scodec', 'none') != 'none':
2779                             content_type = 'text'
2780                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2781                             content_type = 'text'
2782                         else:
2783                             self.report_warning(f'Unknown MIME type {mime_type} in DASH manifest')
2784                             continue
2785
2786                     base_url = ''
2787                     for element in (representation, adaptation_set, period, mpd_doc):
2788                         base_url_e = element.find(_add_ns('BaseURL'))
2789                         if try_call(lambda: base_url_e.text) is not None:
2790                             base_url = base_url_e.text + base_url
2791                             if re.match(r'^https?://', base_url):
2792                                 break
2793                     if mpd_base_url and base_url.startswith('/'):
2794                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2795                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2796                         if not mpd_base_url.endswith('/'):
2797                             mpd_base_url += '/'
2798                         base_url = mpd_base_url + base_url
2799                     representation_id = representation_attrib.get('id')
2800                     lang = representation_attrib.get('lang')
2801                     url_el = representation.find(_add_ns('BaseURL'))
2802                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2803                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2804                     if representation_id is not None:
2805                         format_id = representation_id
2806                     else:
2807                         format_id = content_type
2808                     if mpd_id:
2809                         format_id = mpd_id + '-' + format_id
2810                     if content_type in ('video', 'audio'):
2811                         f = {
2812                             'format_id': format_id,
2813                             'manifest_url': mpd_url,
2814                             'ext': mimetype2ext(mime_type),
2815                             'width': int_or_none(representation_attrib.get('width')),
2816                             'height': int_or_none(representation_attrib.get('height')),
2817                             'tbr': float_or_none(bandwidth, 1000),
2818                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2819                             'fps': int_or_none(representation_attrib.get('frameRate')),
2820                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2821                             'format_note': f'DASH {content_type}',
2822                             'filesize': filesize,
2823                             'container': mimetype2ext(mime_type) + '_dash',
2824                             **codecs,
2825                         }
2826                     elif content_type == 'text':
2827                         f = {
2828                             'ext': mimetype2ext(mime_type),
2829                             'manifest_url': mpd_url,
2830                             'filesize': filesize,
2831                         }
2832                     elif content_type == 'image/jpeg':
2833                         # See test case in VikiIE
2834                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2835                         f = {
2836                             'format_id': format_id,
2837                             'ext': 'mhtml',
2838                             'manifest_url': mpd_url,
2839                             'format_note': 'DASH storyboards (jpeg)',
2840                             'acodec': 'none',
2841                             'vcodec': 'none',
2842                         }
2843                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2844                         f['has_drm'] = True
2845                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2846
2847                     def prepare_template(template_name, identifiers):
2848                         tmpl = representation_ms_info[template_name]
2849                         if representation_id is not None:
2850                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2851                         # First of, % characters outside $...$ templates
2852                         # must be escaped by doubling for proper processing
2853                         # by % operator string formatting used further (see
2854                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2855                         t = ''
2856                         in_template = False
2857                         for c in tmpl:
2858                             t += c
2859                             if c == '$':
2860                                 in_template = not in_template
2861                             elif c == '%' and not in_template:
2862                                 t += c
2863                         # Next, $...$ templates are translated to their
2864                         # %(...) counterparts to be used with % operator
2865                         t = re.sub(r'\$({})\$'.format('|'.join(identifiers)), r'%(\1)d', t)
2866                         t = re.sub(r'\$({})%([^$]+)\$'.format('|'.join(identifiers)), r'%(\1)\2', t)
2867                         t.replace('$$', '$')
2868                         return t
2869
2870                     # @initialization is a regular template like @media one
2871                     # so it should be handled just the same way (see
2872                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2873                     if 'initialization' in representation_ms_info:
2874                         initialization_template = prepare_template(
2875                             'initialization',
2876                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2877                             # $Time$ shall not be included for @initialization thus
2878                             # only $Bandwidth$ remains
2879                             ('Bandwidth', ))
2880                         representation_ms_info['initialization_url'] = initialization_template % {
2881                             'Bandwidth': bandwidth,
2882                         }
2883
2884                     def location_key(location):
2885                         return 'url' if re.match(r'^https?://', location) else 'path'
2886
2887                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2888
2889                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2890                         media_location_key = location_key(media_template)
2891
2892                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2893                         # can't be used at the same time
2894                         if '%(Number' in media_template and 's' not in representation_ms_info:
2895                             segment_duration = None
2896                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2897                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2898                                 representation_ms_info['total_number'] = int(math.ceil(
2899                                     float_or_none(period_duration, segment_duration, default=0)))
2900                             representation_ms_info['fragments'] = [{
2901                                 media_location_key: media_template % {
2902                                     'Number': segment_number,
2903                                     'Bandwidth': bandwidth,
2904                                 },
2905                                 'duration': segment_duration,
2906                             } for segment_number in range(
2907                                 representation_ms_info['start_number'],
2908                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2909                         else:
2910                             # $Number*$ or $Time$ in media template with S list available
2911                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2912                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2913                             representation_ms_info['fragments'] = []
2914                             segment_time = 0
2915                             segment_d = None
2916                             segment_number = representation_ms_info['start_number']
2917
2918                             def add_segment_url():
2919                                 segment_url = media_template % {
2920                                     'Time': segment_time,
2921                                     'Bandwidth': bandwidth,
2922                                     'Number': segment_number,
2923                                 }
2924                                 representation_ms_info['fragments'].append({
2925                                     media_location_key: segment_url,
2926                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2927                                 })
2928
2929                             for s in representation_ms_info['s']:
2930                                 segment_time = s.get('t') or segment_time
2931                                 segment_d = s['d']
2932                                 add_segment_url()
2933                                 segment_number += 1
2934                                 for _ in range(s.get('r', 0)):
2935                                     segment_time += segment_d
2936                                     add_segment_url()
2937                                     segment_number += 1
2938                                 segment_time += segment_d
2939                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2940                         # No media template,
2941                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2942                         # or any YouTube dashsegments video
2943                         fragments = []
2944                         segment_index = 0
2945                         timescale = representation_ms_info['timescale']
2946                         for s in representation_ms_info['s']:
2947                             duration = float_or_none(s['d'], timescale)
2948                             for _ in range(s.get('r', 0) + 1):
2949                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2950                                 fragments.append({
2951                                     location_key(segment_uri): segment_uri,
2952                                     'duration': duration,
2953                                 })
2954                                 segment_index += 1
2955                         representation_ms_info['fragments'] = fragments
2956                     elif 'segment_urls' in representation_ms_info:
2957                         # Segment URLs with no SegmentTimeline
2958                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2959                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2960                         fragments = []
2961                         segment_duration = float_or_none(
2962                             representation_ms_info['segment_duration'],
2963                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2964                         for segment_url in representation_ms_info['segment_urls']:
2965                             fragment = {
2966                                 location_key(segment_url): segment_url,
2967                             }
2968                             if segment_duration:
2969                                 fragment['duration'] = segment_duration
2970                             fragments.append(fragment)
2971                         representation_ms_info['fragments'] = fragments
2972                     # If there is a fragments key available then we correctly recognized fragmented media.
2973                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2974                     # assumption is not necessarily correct since we may simply have no support for
2975                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2976                     if 'fragments' in representation_ms_info:
2977                         f.update({
2978                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2979                             'url': mpd_url or base_url,
2980                             'fragment_base_url': base_url,
2981                             'fragments': [],
2982                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2983                         })
2984                         if 'initialization_url' in representation_ms_info:
2985                             initialization_url = representation_ms_info['initialization_url']
2986                             if not f.get('url'):
2987                                 f['url'] = initialization_url
2988                             f['fragments'].append({location_key(initialization_url): initialization_url})
2989                         f['fragments'].extend(representation_ms_info['fragments'])
2990                         if not period_duration:
2991                             period_duration = try_get(
2992                                 representation_ms_info,
2993                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2994                     else:
2995                         # Assuming direct URL to unfragmented media.
2996                         f['url'] = base_url
2997                     if content_type in ('video', 'audio', 'image/jpeg'):
2998                         f['manifest_stream_number'] = stream_numbers[f['url']]
2999                         stream_numbers[f['url']] += 1
3000                         period_entry['formats'].append(f)
3001                     elif content_type == 'text':
3002                         period_entry['subtitles'][lang or 'und'].append(f)
3003             yield period_entry
3004
3005     def _extract_ism_formats(self, *args, **kwargs):
3006         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3007         if subs:
3008             self._report_ignoring_subs('ISM')
3009         return fmts
3010
3011     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3012         if self.get_param('ignore_no_formats_error'):
3013             fatal = False
3014
3015         res = self._download_xml_handle(
3016             ism_url, video_id,
3017             note='Downloading ISM manifest' if note is None else note,
3018             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3019             fatal=fatal, data=data, headers=headers, query=query)
3020         if res is False:
3021             return [], {}
3022         ism_doc, urlh = res
3023         if ism_doc is None:
3024             return [], {}
3025
3026         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
3027
3028     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3029         """
3030         Parse formats from ISM manifest.
3031         References:
3032          1. [MS-SSTR]: Smooth Streaming Protocol,
3033             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3034         """
3035         if ism_doc.get('IsLive') == 'TRUE':
3036             return [], {}
3037
3038         duration = int(ism_doc.attrib['Duration'])
3039         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3040
3041         formats = []
3042         subtitles = {}
3043         for stream in ism_doc.findall('StreamIndex'):
3044             stream_type = stream.get('Type')
3045             if stream_type not in ('video', 'audio', 'text'):
3046                 continue
3047             url_pattern = stream.attrib['Url']
3048             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3049             stream_name = stream.get('Name')
3050             stream_language = stream.get('Language', 'und')
3051             for track in stream.findall('QualityLevel'):
3052                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3053                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3054                 # TODO: add support for WVC1 and WMAP
3055                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3056                     self.report_warning(f'{fourcc} is not a supported codec')
3057                     continue
3058                 tbr = int(track.attrib['Bitrate']) // 1000
3059                 # [1] does not mention Width and Height attributes. However,
3060                 # they're often present while MaxWidth and MaxHeight are
3061                 # missing, so should be used as fallbacks
3062                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3063                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3064                 sampling_rate = int_or_none(track.get('SamplingRate'))
3065
3066                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3067                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3068
3069                 fragments = []
3070                 fragment_ctx = {
3071                     'time': 0,
3072                 }
3073                 stream_fragments = stream.findall('c')
3074                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3075                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3076                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3077                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3078                     if not fragment_ctx['duration']:
3079                         try:
3080                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3081                         except IndexError:
3082                             next_fragment_time = duration
3083                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3084                     for _ in range(fragment_repeat):
3085                         fragments.append({
3086                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3087                             'duration': fragment_ctx['duration'] / stream_timescale,
3088                         })
3089                         fragment_ctx['time'] += fragment_ctx['duration']
3090
3091                 if stream_type == 'text':
3092                     subtitles.setdefault(stream_language, []).append({
3093                         'ext': 'ismt',
3094                         'protocol': 'ism',
3095                         'url': ism_url,
3096                         'manifest_url': ism_url,
3097                         'fragments': fragments,
3098                         '_download_params': {
3099                             'stream_type': stream_type,
3100                             'duration': duration,
3101                             'timescale': stream_timescale,
3102                             'fourcc': fourcc,
3103                             'language': stream_language,
3104                             'codec_private_data': track.get('CodecPrivateData'),
3105                         },
3106                     })
3107                 elif stream_type in ('video', 'audio'):
3108                     formats.append({
3109                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3110                         'url': ism_url,
3111                         'manifest_url': ism_url,
3112                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3113                         'width': width,
3114                         'height': height,
3115                         'tbr': tbr,
3116                         'asr': sampling_rate,
3117                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3118                         'acodec': 'none' if stream_type == 'video' else fourcc,
3119                         'protocol': 'ism',
3120                         'fragments': fragments,
3121                         'has_drm': ism_doc.find('Protection') is not None,
3122                         'language': stream_language,
3123                         'audio_channels': int_or_none(track.get('Channels')),
3124                         '_download_params': {
3125                             'stream_type': stream_type,
3126                             'duration': duration,
3127                             'timescale': stream_timescale,
3128                             'width': width or 0,
3129                             'height': height or 0,
3130                             'fourcc': fourcc,
3131                             'language': stream_language,
3132                             'codec_private_data': track.get('CodecPrivateData'),
3133                             'sampling_rate': sampling_rate,
3134                             'channels': int_or_none(track.get('Channels', 2)),
3135                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3136                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3137                         },
3138                     })
3139         return formats, subtitles
3140
3141     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3142         def absolute_url(item_url):
3143             return urljoin(base_url, item_url)
3144
3145         def parse_content_type(content_type):
3146             if not content_type:
3147                 return {}
3148             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3149             if ctr:
3150                 mimetype, codecs = ctr.groups()
3151                 f = parse_codecs(codecs)
3152                 f['ext'] = mimetype2ext(mimetype)
3153                 return f
3154             return {}
3155
3156         def _media_formats(src, cur_media_type, type_info=None):
3157             type_info = type_info or {}
3158             full_url = absolute_url(src)
3159             ext = type_info.get('ext') or determine_ext(full_url)
3160             if ext == 'm3u8':
3161                 is_plain_url = False
3162                 formats = self._extract_m3u8_formats(
3163                     full_url, video_id, ext='mp4',
3164                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3165                     preference=preference, quality=quality, fatal=False)
3166             elif ext == 'mpd':
3167                 is_plain_url = False
3168                 formats = self._extract_mpd_formats(
3169                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3170             else:
3171                 is_plain_url = True
3172                 formats = [{
3173                     'url': full_url,
3174                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3175                     'ext': ext,
3176                 }]
3177             return is_plain_url, formats
3178
3179         entries = []
3180         # amp-video and amp-audio are very similar to their HTML5 counterparts
3181         # so we will include them right here (see
3182         # https://www.ampproject.org/docs/reference/components/amp-video)
3183         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3184         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3185         media_tags = [(media_tag, media_tag_name, media_type, '')
3186                       for media_tag, media_tag_name, media_type
3187                       in re.findall(rf'(?s)(<({_MEDIA_TAG_NAME_RE})[^>]*/>)', webpage)]
3188         media_tags.extend(re.findall(
3189             # We only allow video|audio followed by a whitespace or '>'.
3190             # Allowing more characters may end up in significant slow down (see
3191             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3192             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3193             rf'(?s)(<(?P<tag>{_MEDIA_TAG_NAME_RE})(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
3194         for media_tag, _, media_type, media_content in media_tags:
3195             media_info = {
3196                 'formats': [],
3197                 'subtitles': {},
3198             }
3199             media_attributes = extract_attributes(media_tag)
3200             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3201             if src:
3202                 f = parse_content_type(media_attributes.get('type'))
3203                 _, formats = _media_formats(src, media_type, f)
3204                 media_info['formats'].extend(formats)
3205             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3206             if media_content:
3207                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3208                     s_attr = extract_attributes(source_tag)
3209                     # data-video-src and data-src are non standard but seen
3210                     # several times in the wild
3211                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3212                     if not src:
3213                         continue
3214                     f = parse_content_type(s_attr.get('type'))
3215                     is_plain_url, formats = _media_formats(src, media_type, f)
3216                     if is_plain_url:
3217                         # width, height, res, label and title attributes are
3218                         # all not standard but seen several times in the wild
3219                         labels = [
3220                             s_attr.get(lbl)
3221                             for lbl in ('label', 'title')
3222                             if str_or_none(s_attr.get(lbl))
3223                         ]
3224                         width = int_or_none(s_attr.get('width'))
3225                         height = (int_or_none(s_attr.get('height'))
3226                                   or int_or_none(s_attr.get('res')))
3227                         if not width or not height:
3228                             for lbl in labels:
3229                                 resolution = parse_resolution(lbl)
3230                                 if not resolution:
3231                                     continue
3232                                 width = width or resolution.get('width')
3233                                 height = height or resolution.get('height')
3234                         for lbl in labels:
3235                             tbr = parse_bitrate(lbl)
3236                             if tbr:
3237                                 break
3238                         else:
3239                             tbr = None
3240                         f.update({
3241                             'width': width,
3242                             'height': height,
3243                             'tbr': tbr,
3244                             'format_id': s_attr.get('label') or s_attr.get('title'),
3245                         })
3246                         f.update(formats[0])
3247                         media_info['formats'].append(f)
3248                     else:
3249                         media_info['formats'].extend(formats)
3250                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3251                     track_attributes = extract_attributes(track_tag)
3252                     kind = track_attributes.get('kind')
3253                     if not kind or kind in ('subtitles', 'captions'):
3254                         src = strip_or_none(track_attributes.get('src'))
3255                         if not src:
3256                             continue
3257                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3258                         media_info['subtitles'].setdefault(lang, []).append({
3259                             'url': absolute_url(src),
3260                         })
3261             for f in media_info['formats']:
3262                 f.setdefault('http_headers', {})['Referer'] = base_url
3263             if media_info['formats'] or media_info['subtitles']:
3264                 entries.append(media_info)
3265         return entries
3266
3267     def _extract_akamai_formats(self, *args, **kwargs):
3268         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3269         if subs:
3270             self._report_ignoring_subs('akamai')
3271         return fmts
3272
3273     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3274         signed = 'hdnea=' in manifest_url
3275         if not signed:
3276             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3277             manifest_url = re.sub(
3278                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3279                 '', manifest_url).strip('?')
3280
3281         formats = []
3282         subtitles = {}
3283
3284         hdcore_sign = 'hdcore=3.7.0'
3285         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3286         hds_host = hosts.get('hds')
3287         if hds_host:
3288             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3289         if 'hdcore=' not in f4m_url:
3290             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3291         f4m_formats = self._extract_f4m_formats(
3292             f4m_url, video_id, f4m_id='hds', fatal=False)
3293         for entry in f4m_formats:
3294             entry.update({'extra_param_to_segment_url': hdcore_sign})
3295         formats.extend(f4m_formats)
3296
3297         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3298         hls_host = hosts.get('hls')
3299         if hls_host:
3300             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3301         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3302             m3u8_url, video_id, 'mp4', 'm3u8_native',
3303             m3u8_id='hls', fatal=False)
3304         formats.extend(m3u8_formats)
3305         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3306
3307         http_host = hosts.get('http')
3308         if http_host and m3u8_formats and not signed:
3309             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3310             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3311             qualities_length = len(qualities)
3312             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3313                 i = 0
3314                 for f in m3u8_formats:
3315                     if f['vcodec'] != 'none':
3316                         for protocol in ('http', 'https'):
3317                             http_f = f.copy()
3318                             del http_f['manifest_url']
3319                             http_url = re.sub(
3320                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3321                             http_f.update({
3322                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3323                                 'url': http_url,
3324                                 'protocol': protocol,
3325                             })
3326                             formats.append(http_f)
3327                         i += 1
3328
3329         return formats, subtitles
3330
3331     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3332         query = urllib.parse.urlparse(url).query
3333         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3334         mobj = re.search(
3335             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3336         url_base = mobj.group('url')
3337         http_base_url = '{}{}:{}'.format('http', mobj.group('s') or '', url_base)
3338         formats = []
3339
3340         def manifest_url(manifest):
3341             m_url = f'{http_base_url}/{manifest}'
3342             if query:
3343                 m_url += f'?{query}'
3344             return m_url
3345
3346         if 'm3u8' not in skip_protocols:
3347             formats.extend(self._extract_m3u8_formats(
3348                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3349                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3350         if 'f4m' not in skip_protocols:
3351             formats.extend(self._extract_f4m_formats(
3352                 manifest_url('manifest.f4m'),
3353                 video_id, f4m_id='hds', fatal=False))
3354         if 'dash' not in skip_protocols:
3355             formats.extend(self._extract_mpd_formats(
3356                 manifest_url('manifest.mpd'),
3357                 video_id, mpd_id='dash', fatal=False))
3358         if re.search(r'(?:/smil:|\.smil)', url_base):
3359             if 'smil' not in skip_protocols:
3360                 rtmp_formats = self._extract_smil_formats(
3361                     manifest_url('jwplayer.smil'),
3362                     video_id, fatal=False)
3363                 for rtmp_format in rtmp_formats:
3364                     rtsp_format = rtmp_format.copy()
3365                     rtsp_format['url'] = '{}/{}'.format(rtmp_format['url'], rtmp_format['play_path'])
3366                     del rtsp_format['play_path']
3367                     del rtsp_format['ext']
3368                     rtsp_format.update({
3369                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3370                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3371                         'protocol': 'rtsp',
3372                     })
3373                     formats.extend([rtmp_format, rtsp_format])
3374         else:
3375             for protocol in ('rtmp', 'rtsp'):
3376                 if protocol not in skip_protocols:
3377                     formats.append({
3378                         'url': f'{protocol}:{url_base}',
3379                         'format_id': protocol,
3380                         'protocol': protocol,
3381                     })
3382         return formats
3383
3384     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3385         return self._search_json(
3386             r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
3387             webpage, 'JWPlayer data', video_id,
3388             # must be a {...} or sequence, ending
3389             contains_pattern=r'\{(?s:.*)}(?(load)(?:\s*,\s*\{(?s:.*)})*)', end_pattern=r'(?(load)\]|\))',
3390             transform_source=transform_source, default=None)
3391
3392     def _extract_jwplayer_data(self, webpage, video_id, *args, transform_source=js_to_json, **kwargs):
3393         jwplayer_data = self._find_jwplayer_data(
3394             webpage, video_id, transform_source=transform_source)
3395         return self._parse_jwplayer_data(
3396             jwplayer_data, video_id, *args, **kwargs)
3397
3398     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3399                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3400         entries = []
3401         if not isinstance(jwplayer_data, dict):
3402             return entries
3403
3404         playlist_items = jwplayer_data.get('playlist')
3405         # JWPlayer backward compatibility: single playlist item/flattened playlists
3406         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3407         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3408         if not isinstance(playlist_items, list):
3409             playlist_items = (playlist_items or jwplayer_data, )
3410
3411         for video_data in playlist_items:
3412             if not isinstance(video_data, dict):
3413                 continue
3414             # JWPlayer backward compatibility: flattened sources
3415             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3416             if 'sources' not in video_data:
3417                 video_data['sources'] = [video_data]
3418
3419             this_video_id = video_id or video_data['mediaid']
3420
3421             formats = self._parse_jwplayer_formats(
3422                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3423                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3424
3425             subtitles = {}
3426             for track in traverse_obj(video_data, (
3427                     'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
3428                 track_url = urljoin(base_url, track.get('file'))
3429                 if not track_url:
3430                     continue
3431                 subtitles.setdefault(track.get('label') or 'en', []).append({
3432                     'url': self._proto_relative_url(track_url),
3433                 })
3434
3435             entry = {
3436                 'id': this_video_id,
3437                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3438                 'description': clean_html(video_data.get('description')),
3439                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3440                 'timestamp': int_or_none(video_data.get('pubdate')),
3441                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3442                 'subtitles': subtitles,
3443                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3444                 'genre': clean_html(video_data.get('genre')),
3445                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3446                 'season_number': int_or_none(video_data.get('season')),
3447                 'episode_number': int_or_none(video_data.get('episode')),
3448                 'release_year': int_or_none(video_data.get('releasedate')),
3449                 'age_limit': int_or_none(video_data.get('age_restriction')),
3450             }
3451             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3452             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3453                 entry.update({
3454                     '_type': 'url_transparent',
3455                     'url': formats[0]['url'],
3456                 })
3457             else:
3458                 entry['formats'] = formats
3459             entries.append(entry)
3460         if len(entries) == 1:
3461             return entries[0]
3462         else:
3463             return self.playlist_result(entries)
3464
3465     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3466                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3467         urls = set()
3468         formats = []
3469         for source in jwplayer_sources_data:
3470             if not isinstance(source, dict):
3471                 continue
3472             source_url = urljoin(
3473                 base_url, self._proto_relative_url(source.get('file')))
3474             if not source_url or source_url in urls:
3475                 continue
3476             urls.add(source_url)
3477             source_type = source.get('type') or ''
3478             ext = mimetype2ext(source_type) or determine_ext(source_url)
3479             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3480                 formats.extend(self._extract_m3u8_formats(
3481                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3482                     m3u8_id=m3u8_id, fatal=False))
3483             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3484                 formats.extend(self._extract_mpd_formats(
3485                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3486             elif ext == 'smil':
3487                 formats.extend(self._extract_smil_formats(
3488                     source_url, video_id, fatal=False))
3489             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3490             elif source_type.startswith('audio') or ext in (
3491                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3492                 formats.append({
3493                     'url': source_url,
3494                     'vcodec': 'none',
3495                     'ext': ext,
3496                 })
3497             else:
3498                 format_id = str_or_none(source.get('label'))
3499                 height = int_or_none(source.get('height'))
3500                 if height is None and format_id:
3501                     # Often no height is provided but there is a label in
3502                     # format like "1080p", "720p SD", or 1080.
3503                     height = parse_resolution(format_id).get('height')
3504                 a_format = {
3505                     'url': source_url,
3506                     'width': int_or_none(source.get('width')),
3507                     'height': height,
3508                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3509                     'filesize': int_or_none(source.get('filesize')),
3510                     'ext': ext,
3511                     'format_id': format_id,
3512                 }
3513                 if source_url.startswith('rtmp'):
3514                     a_format['ext'] = 'flv'
3515                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3516                     # of jwplayer.flash.swf
3517                     rtmp_url_parts = re.split(
3518                         r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3519                     if len(rtmp_url_parts) == 3:
3520                         rtmp_url, prefix, play_path = rtmp_url_parts
3521                         a_format.update({
3522                             'url': rtmp_url,
3523                             'play_path': prefix + play_path,
3524                         })
3525                     if rtmp_params:
3526                         a_format.update(rtmp_params)
3527                 formats.append(a_format)
3528         return formats
3529
3530     def _live_title(self, name):
3531         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3532         return name
3533
3534     def _int(self, v, name, fatal=False, **kwargs):
3535         res = int_or_none(v, **kwargs)
3536         if res is None:
3537             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3538             if fatal:
3539                 raise ExtractorError(msg)
3540             else:
3541                 self.report_warning(msg)
3542         return res
3543
3544     def _float(self, v, name, fatal=False, **kwargs):
3545         res = float_or_none(v, **kwargs)
3546         if res is None:
3547             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3548             if fatal:
3549                 raise ExtractorError(msg)
3550             else:
3551                 self.report_warning(msg)
3552         return res
3553
3554     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3555                     path='/', secure=False, discard=False, rest={}, **kwargs):
3556         cookie = http.cookiejar.Cookie(
3557             0, name, value, port, port is not None, domain, True,
3558             domain.startswith('.'), path, True, secure, expire_time,
3559             discard, None, None, rest)
3560         self.cookiejar.set_cookie(cookie)
3561
3562     def _get_cookies(self, url):
3563         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3564         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3565
3566     def _apply_first_set_cookie_header(self, url_handle, cookie):
3567         """
3568         Apply first Set-Cookie header instead of the last. Experimental.
3569
3570         Some sites (e.g. [1-3]) may serve two cookies under the same name
3571         in Set-Cookie header and expect the first (old) one to be set rather
3572         than second (new). However, as of RFC6265 the newer one cookie
3573         should be set into cookie store what actually happens.
3574         We will workaround this issue by resetting the cookie to
3575         the first one manually.
3576         1. https://new.vk.com/
3577         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3578         3. https://learning.oreilly.com/
3579         """
3580         for header, cookies in url_handle.headers.items():
3581             if header.lower() != 'set-cookie':
3582                 continue
3583             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3584             cookie_value = re.search(
3585                 rf'{cookie}=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)', cookies)
3586             if cookie_value:
3587                 value, domain = cookie_value.groups()
3588                 self._set_cookie(domain, cookie, value)
3589                 break
3590
3591     @classmethod
3592     def get_testcases(cls, include_onlymatching=False):
3593         # Do not look in super classes
3594         t = vars(cls).get('_TEST')
3595         if t:
3596             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3597             tests = [t]
3598         else:
3599             tests = vars(cls).get('_TESTS', [])
3600         for t in tests:
3601             if not include_onlymatching and t.get('only_matching', False):
3602                 continue
3603             t['name'] = cls.ie_key()
3604             yield t
3605         if getattr(cls, '__wrapped__', None):
3606             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3607
3608     @classmethod
3609     def get_webpage_testcases(cls):
3610         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3611         for t in tests:
3612             t['name'] = cls.ie_key()
3613             yield t
3614         if getattr(cls, '__wrapped__', None):
3615             yield from cls.__wrapped__.get_webpage_testcases()
3616
3617     @classproperty(cache=True)
3618     def age_limit(cls):
3619         """Get age limit from the testcases"""
3620         return max(traverse_obj(
3621             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3622             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3623
3624     @classproperty(cache=True)
3625     def _RETURN_TYPE(cls):
3626         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3627         tests = tuple(cls.get_testcases(include_onlymatching=False))
3628         if not tests:
3629             return None
3630         elif not any(k.startswith('playlist') for test in tests for k in test):
3631             return 'video'
3632         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3633             return 'playlist'
3634         return 'any'
3635
3636     @classmethod
3637     def is_single_video(cls, url):
3638         """Returns whether the URL is of a single video, None if unknown"""
3639         if cls.suitable(url):
3640             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3641
3642     @classmethod
3643     def is_suitable(cls, age_limit):
3644         """Test whether the extractor is generally suitable for the given age limit"""
3645         return not age_restricted(cls.age_limit, age_limit)
3646
3647     @classmethod
3648     def description(cls, *, markdown=True, search_examples=None):
3649         """Description of the extractor"""
3650         desc = ''
3651         if cls._NETRC_MACHINE:
3652             if markdown:
3653                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3654             else:
3655                 desc += f' [{cls._NETRC_MACHINE}]'
3656         if cls.IE_DESC is False:
3657             desc += ' [HIDDEN]'
3658         elif cls.IE_DESC:
3659             desc += f' {cls.IE_DESC}'
3660         if cls.SEARCH_KEY:
3661             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3662             if search_examples:
3663                 _COUNTS = ('', '5', '10', 'all')
3664                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3665         if not cls.working():
3666             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3667
3668         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3669         name = (' - **{}**'.format(re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME))) if markdown else cls.IE_NAME
3670         return f'{name}:{desc}' if desc else name
3671
3672     def extract_subtitles(self, *args, **kwargs):
3673         if (self.get_param('writesubtitles', False)
3674                 or self.get_param('listsubtitles')):
3675             return self._get_subtitles(*args, **kwargs)
3676         return {}
3677
    def _get_subtitles(self, *args, **kwargs):
        """
        Hook called by extract_subtitles with its own arguments.
        Subclasses must override it; this default always raises NotImplementedError.
        """
        raise NotImplementedError('This method must be implemented by subclasses')
3680
    class CommentsDisabled(Exception):
        """
        Raise in _get_comments if comments are disabled for the video.
        extract_comments reacts by reporting both "comments" and "comment_count" as None
        """
3683
    def extract_comments(self, *args, **kwargs):
        """
        Prepare deferred comment extraction.

        @returns    None when the "getcomments" param is not set; otherwise a
                    zero-argument callable that drains self._get_comments(...)
                    and returns a dict with "comments" and "comment_count"
        """
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            # Stays True unless the generator runs to exhaustion
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                # Keep the comments collected so far
                self.to_screen('Interrupted by user')
            except self.CommentsDisabled:
                return {'comments': None, 'comment_count': None}
            except Exception as e:
                # Errors are only reported (not raised) when "ignoreerrors" is exactly True
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                # The count is reported as None when extraction did not run to completion
                'comment_count': None if interrupted else comment_count,
            }
        return extractor
3712
    def _get_comments(self, *args, **kwargs):
        """
        Hook called by extract_comments with its own arguments; the result is
        consumed with next(), one comment at a time.
        Subclasses must override it; this default always raises NotImplementedError.
        """
        raise NotImplementedError('This method must be implemented by subclasses')
3715
3716     @staticmethod
3717     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3718         """ Merge subtitle items for one language. Items with duplicated URLs/data
3719         will be dropped. """
3720         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3721         ret = list(subtitle_list1)
3722         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3723         return ret
3724
3725     @classmethod
3726     def _merge_subtitles(cls, *dicts, target=None):
3727         """ Merge subtitle dictionaries, language by language. """
3728         if target is None:
3729             target = {}
3730         for d in dicts:
3731             for lang, subs in d.items():
3732                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3733         return target
3734
3735     def extract_automatic_captions(self, *args, **kwargs):
3736         if (self.get_param('writeautomaticsub', False)
3737                 or self.get_param('listsubtitles')):
3738             return self._get_automatic_captions(*args, **kwargs)
3739         return {}
3740
    def _get_automatic_captions(self, *args, **kwargs):
        """
        Hook called by extract_automatic_captions with its own arguments.
        Subclasses must override it; this default always raises NotImplementedError.
        """
        raise NotImplementedError('This method must be implemented by subclasses')
3743
3744     @functools.cached_property
3745     def _cookies_passed(self):
3746         """Whether cookies have been passed to YoutubeDL"""
3747         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3748
3749     def mark_watched(self, *args, **kwargs):
3750         if not self.get_param('mark_watched', False):
3751             return
3752         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3753             self._mark_watched(*args, **kwargs)
3754
    def _mark_watched(self, *args, **kwargs):
        """
        Hook called by mark_watched with its own arguments.
        Subclasses must override it; this default always raises NotImplementedError.
        """
        raise NotImplementedError('This method must be implemented by subclasses')
3757
3758     def geo_verification_headers(self):
3759         headers = {}
3760         geo_verification_proxy = self.get_param('geo_verification_proxy')
3761         if geo_verification_proxy:
3762             headers['Ytdl-request-proxy'] = geo_verification_proxy
3763         return headers
3764
3765     @staticmethod
3766     def _generic_id(url):
3767         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3768
3769     def _generic_title(self, url='', webpage='', *, default=None):
3770         return (self._og_search_title(webpage, default=None)
3771                 or self._html_extract_title(webpage, default=None)
3772                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3773                 or default)
3774
3775     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3776         if not duration:
3777             return
3778         chapter_list = [{
3779             'start_time': start_function(chapter),
3780             'title': title_function(chapter),
3781         } for chapter in chapter_list or []]
3782         if strict:
3783             warn = self.report_warning
3784         else:
3785             warn = self.write_debug
3786             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3787
3788         chapters = [{'start_time': 0}]
3789         for idx, chapter in enumerate(chapter_list):
3790             if chapter['start_time'] is None:
3791                 warn(f'Incomplete chapter {idx}')
3792             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3793                 chapters.append(chapter)
3794             elif chapter not in chapters:
3795                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3796                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3797                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3798         return chapters[1:]
3799
3800     def _extract_chapters_from_description(self, description, duration):
3801         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3802         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3803         return self._extract_chapters_helper(
3804             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3805             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3806             duration=duration, strict=False) or self._extract_chapters_helper(
3807             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3808             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3809             duration=duration, strict=False)
3810
3811     @staticmethod
3812     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3813         all_known = all(
3814             x is not None for x in
3815             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
3816         return (
3817             'private' if is_private
3818             else 'premium_only' if needs_premium
3819             else 'subscriber_only' if needs_subscription
3820             else 'needs_auth' if needs_auth
3821             else 'unlisted' if is_unlisted
3822             else 'public' if all_known
3823             else None)
3824
3825     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3826         '''
3827         @returns            A list of values for the extractor argument given by "key"
3828                             or "default" if no such key is present
3829         @param default      The default value to return when the key is not present (default: [])
3830         @param casesense    When false, the values are converted to lower case
3831         '''
3832         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3833         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3834         if val is None:
3835             return [] if default is NO_DEFAULT else default
3836         return list(val) if casesense else [x.lower() for x in val]
3837
3838     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3839         if not playlist_id or not video_id:
3840             return not video_id
3841
3842         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3843         if no_playlist is not None:
3844             return not no_playlist
3845
3846         video_id = '' if video_id is True else f' {video_id}'
3847         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3848         if self.get_param('noplaylist'):
3849             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3850             return False
3851         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3852         return True
3853
3854     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3855         RetryManager.report_retry(
3856             err, _count or int(fatal), _retries,
3857             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3858             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3859
3860     def RetryManager(self, **kwargs):
3861         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3862
3863     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3864         display_id = traverse_obj(info_dict, 'display_id', 'id')
3865         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3866         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3867             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3868
3869     @classmethod
3870     def extract_from_webpage(cls, ydl, url, webpage):
3871         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3872               else ydl.get_info_extractor(cls.ie_key()))
3873         for info in ie._extract_from_webpage(url, webpage) or []:
3874             # url = None since we do not want to set (webpage/original)_url
3875             ydl.add_default_extra_info(info, ie, None)
3876             yield info
3877
3878     @classmethod
3879     def _extract_from_webpage(cls, url, webpage):
3880         for embed_url in orderedSet(
3881                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3882             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3883
3884     @classmethod
3885     def _extract_embed_urls(cls, url, webpage):
3886         """@returns all the embed urls on the webpage"""
3887         if '_EMBED_URL_RE' not in cls.__dict__:
3888             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3889             for idx, regex in enumerate(cls._EMBED_REGEX):
3890                 assert regex.count('(?P<url>') == 1, \
3891                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3892             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3893
3894         for regex in cls._EMBED_URL_RE:
3895             for mobj in regex.finditer(webpage):
3896                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3897                 if cls._VALID_URL is False or cls.suitable(embed_url):
3898                     yield embed_url
3899
    class StopExtraction(Exception):
        # NOTE(review): neither raised nor caught in this part of the file;
        # presumably a signal for ending an extraction early - confirm against its users
        pass
3902
3903     @classmethod
3904     def _extract_url(cls, webpage):  # TODO: Remove
3905         """Only for compatibility with some older extractors"""
3906         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3907
    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """
        Register subclasses that are declared with a plugin_name as overrides.

        Such a class takes over ie_key from the class it directly derives from,
        gets "+<plugin_name>" appended to that class' IE_NAME, replaces the
        original class in the module that defined it and is recorded in
        _PLUGIN_OVERRIDES. Subclasses without plugin_name are left untouched.
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class following cls in the MRO is the one being overridden
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the __wrapped__ chain down to the class that is not an override itself
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind the original class' name in its defining module to this class
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)
3921
3922
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for _SEARCH_KEY"""
        return cls._SEARCH_KEY

    @classproperty
    def _VALID_URL(cls):
        """Regex for "<search key><prefix>:<query>"; the prefix is empty, a positive number or "all" """
        prefix_re = r'(?P<prefix>|[1-9][0-9]*|all)'
        query_re = r'(?P<query>[\s\S]+)'
        return f'{cls._SEARCH_KEY}{prefix_re}:{query_re}'

    def _real_extract(self, query):
        """Translate the prefix into a number of results and fetch that many"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == 'all':
            limit = self._MAX_RESULTS
        elif prefix == '':
            # Without a prefix only the first result is wanted
            limit = 1
        else:
            limit = int(prefix)
            if limit <= 0:
                raise ExtractorError(f'invalid download number {limit} for query "{query}"')
            if limit > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, limit))
                limit = self._MAX_RESULTS
        return self._get_n_results(query, limit)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice needs None (rather than infinity) to mean "no upper bound"
        stop = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, stop)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')
3966
3967
class UnsupportedURLIE(InfoExtractor):
    """Extractor that accepts every URL and always fails with UnsupportedError"""

    # Regex that matches any URL
    _VALID_URL = '.*'
    # NOTE(review): presumably keeps this extractor out of the ones enabled by default -
    # confirm where _ENABLED is evaluated
    _ENABLED = False
    # IE_DESC = False makes description() label the extractor as " [HIDDEN]"
    IE_DESC = False

    def _real_extract(self, url):
        """Unconditionally raise UnsupportedError for the url"""
        raise UnsupportedError(url)
3975
3976
# Maps an overridden extractor class to the list of plugin classes overriding it;
# entries are appended by __init_subclass__ for classes declared with a plugin_name
_PLUGIN_OVERRIDES = collections.defaultdict(list)