yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.parse
  21 import urllib.request
  22 import xml.etree.ElementTree
  23
  24 from ..compat import functools  # isort: split
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..networking.impersonate import ImpersonateTarget
  41 from ..utils import (
  42     IDENTITY,
  43     JSON_LD_RE,
  44     NO_DEFAULT,
  45     ExtractorError,
  46     FormatSorter,
  47     GeoRestrictedError,
  48     GeoUtils,
  49     LenientJSONDecoder,
  50     Popen,
  51     RegexNotFoundError,
  52     RetryManager,
  53     UnsupportedError,
  54     age_restricted,
  55     base_url,
  56     bug_reports_message,
  57     classproperty,
  58     clean_html,
  59     deprecation_warning,
  60     determine_ext,
  61     dict_get,
  62     encode_data_uri,
  63     error_to_compat_str,
  64     extract_attributes,
  65     filter_dict,
  66     fix_xml_ampersands,
  67     float_or_none,
  68     format_field,
  69     int_or_none,
  70     join_nonempty,
  71     js_to_json,
  72     mimetype2ext,
  73     netrc_from_content,
  74     orderedSet,
  75     parse_bitrate,
  76     parse_codecs,
  77     parse_duration,
  78     parse_iso8601,
  79     parse_m3u8_attributes,
  80     parse_resolution,
  81     sanitize_filename,
  82     sanitize_url,
  83     smuggle_url,
  84     str_or_none,
  85     str_to_int,
  86     strip_or_none,
  87     traverse_obj,
  88     truncate_string,
  89     try_call,
  90     try_get,
  91     unescapeHTML,
  92     unified_strdate,
  93     unified_timestamp,
  94     url_basename,
  95     url_or_none,
  96     urlhandle_detect_ext,
  97     urljoin,
  98     variadic,
  99     xpath_element,
 100     xpath_text,
 101     xpath_with_ns,
 102 )
 103
 104
 105 class InfoExtractor:
 106     """Information Extractor class.
 107
 108     Information extractors are the classes that, given a URL, extract
 109     information about the video (or videos) the URL refers to. This
 110     information includes the real video URL, the video title, author and
 111     others. The information is stored in a dictionary which is then
 112     passed to the YoutubeDL. The YoutubeDL processes this
 113     information possibly downloading the video to the file system, among
 114     other possible outcomes.
 115
 116     The type field determines the type of the result.
 117     By far the most common value (and the default if _type is missing) is
 118     "video", which indicates a single video.
 119
 120     For a video, the dictionaries must include the following fields:
 121
 122     id:             Video identifier.
 123     title:          Video title, unescaped. Set to an empty string if video has
 124                     no title as opposed to "None" which signifies that the
 125                     extractor failed to obtain a title
 126
 127     Additionally, it must contain either a formats entry or a url one:
 128
 129     formats:        A list of dictionaries for each format available, ordered
 130                     from worst to best quality.
 131
 132                     Potential fields:
 133                     * url        The mandatory URL representing the media:
 134                                    for plain file media - HTTP URL of this file,
 135                                    for RTMP - RTMP URL,
 136                                    for HLS - URL of the M3U8 media playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH
 139                                      - HTTP URL to plain file media (in case of
 140                                        unfragmented media)
 141                                      - URL of the MPD manifest or base URL
 142                                        representing the media if MPD manifest
 143                                        is parsed from a string (in case of
 144                                        fragmented media)
 145                                    for MSS - URL of the ISM manifest.
 146                     * request_data  Data to send in POST request to the URL
 147                     * manifest_url
 148                                  The URL of the manifest file in case of
 149                                  fragmented media:
 150                                    for HLS - URL of the M3U8 master playlist,
 151                                    for HDS - URL of the F4M manifest,
 152                                    for DASH - URL of the MPD manifest,
 153                                    for MSS - URL of the ISM manifest.
 154                     * manifest_stream_number  (For internal use only)
 155                                  The index of the stream in the manifest file
 156                     * ext        Will be calculated from URL if missing
 157                     * format     A human-readable description of the format
 158                                  ("mp4 container with h264/opus").
  159                                  Calculated from the format_id, width, height,
 160                                  and format_note fields if missing.
 161                     * format_id  A short description of the format
 162                                  ("mp4_h264_opus" or "19").
 163                                 Technically optional, but strongly recommended.
 164                     * format_note Additional info about the format
 165                                  ("3D" or "DASH video")
 166                     * width      Width of the video, if known
 167                     * height     Height of the video, if known
 168                     * aspect_ratio  Aspect ratio of the video, if known
 169                                  Automatically calculated from width and height
 170                     * resolution Textual description of width and height
 171                                  Automatically calculated from width and height
 172                     * dynamic_range The dynamic range of the video. One of:
  173                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 174                     * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
 175                     * abr        Average audio bitrate in kbps (1000 bits/sec)
 176                     * acodec     Name of the audio codec in use
 177                     * asr        Audio sampling rate in Hertz
 178                     * audio_channels  Number of audio channels
 179                     * vbr        Average video bitrate in kbps (1000 bits/sec)
 180                     * fps        Frame rate
 181                     * vcodec     Name of the video codec in use
 182                     * container  Name of the container format
 183                     * filesize   The number of bytes, if known in advance
 184                     * filesize_approx  An estimate for the number of bytes
 185                     * player_url SWF Player URL (used for rtmpdump).
 186                     * protocol   The protocol that will be used for the actual
 187                                  download, lower-case. One of "http", "https" or
 188                                  one of the protocols defined in downloader.PROTOCOL_MAP
 189                     * fragment_base_url
 190                                  Base URL for fragments. Each fragment's path
 191                                  value (if present) will be relative to
 192                                  this URL.
 193                     * fragments  A list of fragments of a fragmented media.
 194                                  Each fragment entry must contain either an url
 195                                  or a path. If an url is present it should be
 196                                  considered by a client. Otherwise both path and
 197                                  fragment_base_url must be present. Here is
 198                                  the list of all potential fields:
 199                                  * "url" - fragment's URL
 200                                  * "path" - fragment's path relative to
 201                                             fragment_base_url
 202                                  * "duration" (optional, int or float)
 203                                  * "filesize" (optional, int)
 204                     * is_from_start  Is a live format that can be downloaded
 205                                 from the start. Boolean
 206                     * preference Order number of this format. If this field is
 207                                  present and not None, the formats get sorted
 208                                  by this field, regardless of all other values.
 209                                  -1 for default (order by other properties),
 210                                  -2 or smaller for less than default.
 211                                  < -1000 to hide the format (if there is
 212                                     another one which is strictly better)
 213                     * language   Language code, e.g. "de" or "en-US".
 214                     * language_preference  Is this in the language mentioned in
 215                                  the URL?
 216                                  10 if it's what the URL is about,
 217                                  -1 for default (don't know),
 218                                  -10 otherwise, other values reserved for now.
 219                     * quality    Order number of the video quality of this
 220                                  format, irrespective of the file format.
 221                                  -1 for default (order by other properties),
 222                                  -2 or smaller for less than default.
 223                     * source_preference  Order number for this video source
 224                                   (quality takes higher priority)
 225                                  -1 for default (order by other properties),
 226                                  -2 or smaller for less than default.
 227                     * http_headers  A dictionary of additional HTTP headers
 228                                  to add to the request.
 229                     * stretched_ratio  If given and not 1, indicates that the
 230                                  video's pixels are not square.
 231                                  width : height ratio as float.
 232                     * no_resume  The server does not support resuming the
 233                                  (HTTP or RTMP) download. Boolean.
 234                     * has_drm    True if the format has DRM and cannot be downloaded.
 235                                  'maybe' if the format may have DRM and has to be tested before download.
 236                     * extra_param_to_segment_url  A query string to append to each
 237                                  fragment's URL, or to update each existing query string
 238                                  with. Only applied by the native HLS/DASH downloaders.
 239                     * hls_aes    A dictionary of HLS AES-128 decryption information
 240                                  used by the native HLS downloader to override the
 241                                  values in the media playlist when an '#EXT-X-KEY' tag
 242                                  is present in the playlist:
 243                                  * uri  The URI from which the key will be downloaded
 244                                  * key  The key (as hex) used to decrypt fragments.
 245                                         If `key` is given, any key URI will be ignored
 246                                  * iv   The IV (as hex) used to decrypt fragments
 247                     * downloader_options  A dictionary of downloader options
 248                                  (For internal use only)
 249                                  * http_chunk_size Chunk size for HTTP downloads
 250                                  * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
 251                                  * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
 252                     * is_dash_periods  Whether the format is a result of merging
 253                                  multiple DASH periods.
 254                     RTMP formats can also have the additional fields: page_url,
 255                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 256                     rtmp_protocol, rtmp_real_time
 257
 258     url:            Final video URL.
 259     ext:            Video filename extension.
 260     format:         The video format, defaults to ext (used for --get-format)
 261     player_url:     SWF Player URL (used for rtmpdump).
 262
 263     The following fields are optional:
 264
 265     direct:         True if a direct video file was given (must only be set by GenericIE)
 266     alt_title:      A secondary title of the video.
 267     display_id:     An alternative identifier for the video, not necessarily
 268                     unique, but available before title. Typically, id is
 269                     something like "4234987", title "Dancing naked mole rats",
 270                     and display_id "dancing-naked-mole-rats"
 271     thumbnails:     A list of dictionaries, with the following entries:
 272                         * "id" (optional, string) - Thumbnail format ID
 273                         * "url"
 274                         * "preference" (optional, int) - quality of the image
 275                         * "width" (optional, int)
 276                         * "height" (optional, int)
 277                         * "resolution" (optional, string "{width}x{height}",
 278                                         deprecated)
 279                         * "filesize" (optional, int)
 280                         * "http_headers" (dict) - HTTP headers for the request
 281     thumbnail:      Full URL to a video thumbnail image.
 282     description:    Full video description.
 283     uploader:       Full name of the video uploader.
 284     license:        License name the video is licensed under.
 285     creators:       List of creators of the video.
 286     timestamp:      UNIX timestamp of the moment the video was uploaded
 287     upload_date:    Video upload date in UTC (YYYYMMDD).
 288                     If not explicitly set, calculated from timestamp
 289     release_timestamp: UNIX timestamp of the moment the video was released.
 290                     If it is not clear whether to use timestamp or this, use the former
 291     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 292                     If not explicitly set, calculated from release_timestamp
 293     release_year:   Year (YYYY) as integer when the video or album was released.
 294                     To be used if no exact release date is known.
 295                     If not explicitly set, calculated from release_date.
 296     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 297     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 298                     If not explicitly set, calculated from modified_timestamp
 299     uploader_id:    Nickname or id of the video uploader.
 300     uploader_url:   Full URL to a personal webpage of the video uploader.
 301     channel:        Full name of the channel the video is uploaded on.
 302                     Note that channel fields may or may not repeat uploader
 303                     fields. This depends on a particular extractor.
 304     channel_id:     Id of the channel.
 305     channel_url:    Full URL to a channel webpage.
 306     channel_follower_count: Number of followers of the channel.
 307     channel_is_verified: Whether the channel is verified on the platform.
 308     location:       Physical location where the video was filmed.
 309     subtitles:      The available subtitles as a dictionary in the format
 310                     {tag: subformats}. "tag" is usually a language code, and
 311                     "subformats" is a list sorted from lower to higher
 312                     preference, each element is a dictionary with the "ext"
 313                     entry and one of:
 314                         * "data": The subtitles file contents
 315                         * "url": A URL pointing to the subtitles file
 316                     It can optionally also have:
 317                         * "name": Name or description of the subtitles
 318                         * "http_headers": A dictionary of additional HTTP headers
 319                                   to add to the request.
 320                     "ext" will be calculated from URL if missing
 321     automatic_captions: Like 'subtitles'; contains automatically generated
 322                     captions instead of normal subtitles
 323     duration:       Length of the video in seconds, as an integer or float.
 324     view_count:     How many users have watched the video on the platform.
 325     concurrent_view_count: How many users are currently watching the video on the platform.
 326     like_count:     Number of positive ratings of the video
 327     dislike_count:  Number of negative ratings of the video
 328     repost_count:   Number of reposts of the video
  329     average_rating: Average rating given by users, the scale used depends on the webpage
 330     comment_count:  Number of comments on the video
 331     comments:       A list of comments, each with one or more of the following
 332                     properties (all but one of text or html optional):
 333                         * "author" - human-readable name of the comment author
 334                         * "author_id" - user ID of the comment author
 335                         * "author_thumbnail" - The thumbnail of the comment author
 336                         * "author_url" - The url to the comment author's page
 337                         * "author_is_verified" - Whether the author is verified
 338                                                  on the platform
 339                         * "author_is_uploader" - Whether the comment is made by
 340                                                  the video uploader
 341                         * "id" - Comment ID
 342                         * "html" - Comment as HTML
 343                         * "text" - Plain text of the comment
 344                         * "timestamp" - UNIX timestamp of comment
 345                         * "parent" - ID of the comment this one is replying to.
 346                                      Set to "root" to indicate that this is a
 347                                      comment to the original video.
 348                         * "like_count" - Number of positive ratings of the comment
 349                         * "dislike_count" - Number of negative ratings of the comment
 350                         * "is_favorited" - Whether the comment is marked as
 351                                            favorite by the video uploader
 352                         * "is_pinned" - Whether the comment is pinned to
 353                                         the top of the comments
 354     age_limit:      Age restriction for the video, as an integer (years)
 355     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 356                     should allow to get the same result again. (It will be set
 357                     by YoutubeDL if it's missing)
 358     categories:     A list of categories that the video falls in, for example
 359                     ["Sports", "Berlin"]
 360     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 361     cast:           A list of the video cast
 362     is_live:        True, False, or None (=unknown). Whether this video is a
 363                     live stream that goes on instead of a fixed-length video.
 364     was_live:       True, False, or None (=unknown). Whether this video was
 365                     originally a live stream.
 366     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 367                     or 'post_live' (was live, but VOD is not yet processed)
 368                     If absent, automatically set from is_live, was_live
 369     start_time:     Time in seconds where the reproduction should start, as
 370                     specified in the URL.
 371     end_time:       Time in seconds where the reproduction should end, as
 372                     specified in the URL.
 373     chapters:       A list of dictionaries, with the following entries:
 374                         * "start_time" - The start time of the chapter in seconds
 375                         * "end_time" - The end time of the chapter in seconds
 376                         * "title" (optional, string)
 377     heatmap:        A list of dictionaries, with the following entries:
 378                         * "start_time" - The start time of the data point in seconds
 379                         * "end_time" - The end time of the data point in seconds
 380                         * "value" - The normalized value of the data point (float between 0 and 1)
 381     playable_in_embed: Whether this video is allowed to play in embedded
 382                     players on other sites. Can be True (=always allowed),
 383                     False (=never allowed), None (=unknown), or a string
 384                     specifying the criteria for embedability; e.g. 'whitelist'
 385     availability:   Under what condition the video is available. One of
 386                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 387                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 388                     to set it
 389     media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
 390     _old_archive_ids: A list of old archive ids needed for backward compatibility
 391     _format_sort_fields: A list of fields to use for sorting formats
 392     __post_extractor: A function to be called just before the metadata is
 393                     written to either disk, logger or console. The function
 394                     must return a dict which will be added to the info_dict.
  395                     This is useful for additional information that is
 396                     time-consuming to extract. Note that the fields thus
 397                     extracted will not be available to output template and
 398                     match_filter. So, only "comments" and "comment_count" are
 399                     currently allowed to be extracted via this method.
 400
 401     The following fields should only be used when the video belongs to some logical
 402     chapter or section:
 403
 404     chapter:        Name or title of the chapter the video belongs to.
 405     chapter_number: Number of the chapter the video belongs to, as an integer.
 406     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 407
 408     The following fields should only be used when the video is an episode of some
 409     series, programme or podcast:
 410
 411     series:         Title of the series or programme the video episode belongs to.
 412     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 413     season:         Title of the season the video episode belongs to.
 414     season_number:  Number of the season the video episode belongs to, as an integer.
 415     season_id:      Id of the season the video episode belongs to, as a unicode string.
 416     episode:        Title of the video episode. Unlike mandatory video title field,
 417                     this field should denote the exact title of the video episode
 418                     without any kind of decoration.
 419     episode_number: Number of the video episode within a season, as an integer.
 420     episode_id:     Id of the video episode, as a unicode string.
 421
 422     The following fields should only be used when the media is a track or a part of
 423     a music album:
 424
 425     track:          Title of the track.
 426     track_number:   Number of the track within an album or a disc, as an integer.
 427     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 428                     as a unicode string.
 429     artists:        List of artists of the track.
 430     composers:      List of composers of the piece.
 431     genres:         List of genres of the track.
 432     album:          Title of the album the track belongs to.
 433     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 434     album_artists:  List of all artists appeared on the album.
 435                     E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
 436                     Useful for splits and compilations.
 437     disc_number:    Number of the disc or other physical medium the track belongs to,
 438                     as an integer.
 439
 440     The following fields should only be set for clips that should be cut from the original video:
 441
 442     section_start:  Start time of the section in seconds
 443     section_end:    End time of the section in seconds
 444
 445     The following fields should only be set for storyboards:
 446     rows:           Number of rows in each storyboard fragment, as an integer
 447     columns:        Number of columns in each storyboard fragment, as an integer
 448
 449     The following fields are deprecated and should not be set by new code:
 450     composer:       Use "composers" instead.
 451                     Composer(s) of the piece, comma-separated.
 452     artist:         Use "artists" instead.
 453                     Artist(s) of the track, comma-separated.
 454     genre:          Use "genres" instead.
 455                     Genre(s) of the track, comma-separated.
 456     album_artist:   Use "album_artists" instead.
 457                     All artists appeared on the album, comma-separated.
 458     creator:        Use "creators" instead.
 459                     The creator of the video.
 460
 461     Unless mentioned otherwise, the fields should be Unicode strings.
 462
 463     Unless mentioned otherwise, None is equivalent to absence of information.
 464
 465
 466     _type "playlist" indicates multiple videos.
 467     There must be a key "entries", which is a list, an iterable, or a PagedList
 468     object, each element of which is a valid dictionary by this specification.
 469
 470     Additionally, playlists can have "id", "title", and any other relevant
 471     attributes with the same semantics as videos (see above).
 472
 473     It can also have the following optional fields:
 474
 475     playlist_count: The total number of videos in a playlist. If not given,
 476                     YoutubeDL tries to calculate it from "entries"
 477
 478
 479     _type "multi_video" indicates that there are multiple videos that
  480     form a single show, for example, multiple acts of an opera or TV episode.
 481     It must have an entries key like a playlist and contain all the keys
 482     required for a video at the same time.
 483
 484
 485     _type "url" indicates that the video must be extracted from another
 486     location, possibly by a different extractor. Its only required key is:
 487     "url" - the next URL to extract.
 488     The key "ie_key" can be set to the class name (minus the trailing "IE",
 489     e.g. "Youtube") if the extractor class is known in advance.
 490     Additionally, the dictionary may have any properties of the resolved entity
 491     known in advance, for example "title" if the title of the referred video is
 492     known ahead of time.
 493
 494
 495     _type "url_transparent" entities have the same specification as "url", but
 496     indicate that the given additional information is more precise than the one
 497     associated with the resolved URL.
 498     This is useful when a site employs a video service that hosts the video and
 499     its technical metadata, but that video service does not embed a useful
 500     title, description etc.
 501
 502
 503     Subclasses of this should also be added to the list of extractors and
 504     should define _VALID_URL as a regexp or a Sequence of regexps, and
 505     re-define the _real_extract() and (optionally) _real_initialize() methods.
 506
 507     Subclasses may also override suitable() if necessary, but ensure the function
 508     signature is preserved and that this function imports everything it needs
 509     (except other extractors), so that lazy_extractors works correctly.
 510
 511     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 512     the HTML of Generic webpages. It may also override _extract_embed_urls
 513     or _extract_from_webpage as necessary. While these are normally classmethods,
 514     _extract_from_webpage is allowed to be an instance method.
 515
 516     _extract_from_webpage may raise self.StopExtraction() to stop further
 517     processing of the webpage and obtain exclusive rights to it. This is useful
 518     when the extractor cannot reliably be matched using just the URL,
 519     e.g. invidious/peertube instances
 520
 521     Embed-only extractors can be defined by setting _VALID_URL = False.
 522
 523     To support username + password (or netrc) login, the extractor must define a
 524     _NETRC_MACHINE and re-define _perform_login(username, password) and
 525     (optionally) _initialize_pre_login() methods. The _perform_login method will
 526     be called between _initialize_pre_login and _real_initialize if credentials
 527     are passed by the user. In cases where it is necessary to have the login
 528     process as part of the extraction rather than initialization, _perform_login
 529     can be left undefined.
 530
 531     _GEO_BYPASS attribute may be set to False in order to disable
 532     geo restriction bypass mechanisms for a particular extractor.
 533     Though it won't disable explicit geo restriction bypass based on
 534     country code provided with geo_bypass_country.
 535
 536     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 537     countries for this extractor. One of these countries will be used by
 538     geo restriction bypass mechanism right away in order to bypass
 539     geo restriction, of course, if the mechanism is not disabled.
 540
 541     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 542     IP blocks in CIDR notation for this extractor. One of these IP blocks
 543     will be used by geo restriction bypass mechanism similarly
 544     to _GEO_COUNTRIES.
 545
 546     The _ENABLED attribute should be set to False for IEs that
 547     are disabled by default and must be explicitly enabled.
 548
 549     The _WORKING attribute should be set to False for broken IEs
 550     in order to warn the users and skip the tests.
 551     """
 552
    # Becomes True once initialize() has run the pre-login, login and _real_initialize steps
    _ready = False
    # The YoutubeDL instance; assigned through set_downloader()
    _downloader = None
    # Fake IP sent as the X-Forwarded-For header once geo bypass has picked one
    _x_forwarded_for_ip = None
    # Geo bypass configuration; see the class docstring for _GEO_BYPASS,
    # _GEO_COUNTRIES and _GEO_IP_BLOCKS
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken extractors (see the class docstring)
    _WORKING = True
    # Set to False for extractors that must be explicitly enabled (see the class docstring)
    _ENABLED = True
    # A truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    # Regex (or several) matched by _match_valid_url(); False makes it never match
    _VALID_URL = None
    # Regexes searched for in the HTML of Generic webpages (see the class docstring)
    _EMBED_REGEX = []
 566
 567     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 568         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 569         return {
 570             None: '',
 571             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 572             'password': f'Use {password_hint}',
 573             'cookies': (
 574                 'Use --cookies-from-browser or --cookies for the authentication. '
 575                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 576         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 577
 578     def __init__(self, downloader=None):
 579         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 580         If a downloader is not passed during initialization,
 581         it must be set using "set_downloader()" before "extract()" is called"""
 582         self._ready = False
 583         self._x_forwarded_for_ip = None
 584         self._printed_messages = set()
 585         self.set_downloader(downloader)
 586
 587     @classmethod
 588     def _match_valid_url(cls, url):
 589         if cls._VALID_URL is False:
 590             return None
 591         # This does not use has/getattr intentionally - we want to know whether
 592         # we have cached the regexp for *this* class, whereas getattr would also
 593         # match the superclass
 594         if '_VALID_URL_RE' not in cls.__dict__:
 595             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 596         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 597
 598     @classmethod
 599     def suitable(cls, url):
 600         """Receives a URL and returns True if suitable for this IE."""
 601         # This function must import everything it needs (except other extractors),
 602         # so that lazy_extractors works correctly
 603         return cls._match_valid_url(url) is not None
 604
 605     @classmethod
 606     def _match_id(cls, url):
 607         return cls._match_valid_url(url).group('id')
 608
 609     @classmethod
 610     def get_temp_id(cls, url):
 611         try:
 612             return cls._match_id(url)
 613         except (IndexError, AttributeError):
 614             return None
 615
 616     @classmethod
 617     def working(cls):
 618         """Getter method for _WORKING."""
 619         return cls._WORKING
 620
 621     @classmethod
 622     def supports_login(cls):
 623         return bool(cls._NETRC_MACHINE)
 624
 625     def initialize(self):
 626         """Initializes an instance (authentication, etc)."""
 627         self._printed_messages = set()
 628         self._initialize_geo_bypass({
 629             'countries': self._GEO_COUNTRIES,
 630             'ip_blocks': self._GEO_IP_BLOCKS,
 631         })
 632         if not self._ready:
 633             self._initialize_pre_login()
 634             if self.supports_login():
 635                 username, password = self._get_login_info()
 636                 if username:
 637                     self._perform_login(username, password)
 638             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 639                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 640             self._real_initialize()
 641             self._ready = True
 642
 643     def _initialize_geo_bypass(self, geo_bypass_context):
 644         """
 645         Initialize geo restriction bypass mechanism.
 646
 647         This method is used to initialize geo bypass mechanism based on faking
 648         X-Forwarded-For HTTP header. A random country from provided country list
 649         is selected and a random IP belonging to this country is generated. This
 650         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 651         HTTP requests.
 652
 653         This method will be used for initial geo bypass mechanism initialization
 654         during the instance initialization with _GEO_COUNTRIES and
 655         _GEO_IP_BLOCKS.
 656
 657         You may also manually call it from extractor's code if geo bypass
 658         information is not available beforehand (e.g. obtained during
 659         extraction) or due to some other reason. In this case you should pass
 660         this information in geo bypass context passed as first argument. It may
 661         contain following fields:
 662
 663         countries:  List of geo unrestricted countries (similar
 664                     to _GEO_COUNTRIES)
 665         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 666                     (similar to _GEO_IP_BLOCKS)
 667
 668         """
 669         if not self._x_forwarded_for_ip:
 670
 671             # Geo bypass mechanism is explicitly disabled by user
 672             if not self.get_param('geo_bypass', True):
 673                 return
 674
 675             if not geo_bypass_context:
 676                 geo_bypass_context = {}
 677
 678             # Backward compatibility: previously _initialize_geo_bypass
 679             # expected a list of countries, some 3rd party code may still use
 680             # it this way
 681             if isinstance(geo_bypass_context, (list, tuple)):
 682                 geo_bypass_context = {
 683                     'countries': geo_bypass_context,
 684                 }
 685
 686             # The whole point of geo bypass mechanism is to fake IP
 687             # as X-Forwarded-For HTTP header based on some IP block or
 688             # country code.
 689
 690             # Path 1: bypassing based on IP block in CIDR notation
 691
 692             # Explicit IP block specified by user, use it right away
 693             # regardless of whether extractor is geo bypassable or not
 694             ip_block = self.get_param('geo_bypass_ip_block', None)
 695
 696             # Otherwise use random IP block from geo bypass context but only
 697             # if extractor is known as geo bypassable
 698             if not ip_block:
 699                 ip_blocks = geo_bypass_context.get('ip_blocks')
 700                 if self._GEO_BYPASS and ip_blocks:
 701                     ip_block = random.choice(ip_blocks)
 702
 703             if ip_block:
 704                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 705                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 706                 return
 707
 708             # Path 2: bypassing based on country code
 709
 710             # Explicit country code specified by user, use it right away
 711             # regardless of whether extractor is geo bypassable or not
 712             country = self.get_param('geo_bypass_country', None)
 713
 714             # Otherwise use random country code from geo bypass context but
 715             # only if extractor is known as geo bypassable
 716             if not country:
 717                 countries = geo_bypass_context.get('countries')
 718                 if self._GEO_BYPASS and countries:
 719                     country = random.choice(countries)
 720
 721             if country:
 722                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 723                 self._downloader.write_debug(
 724                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 725
 726     def extract(self, url):
 727         """Extracts URL information and returns it in list of dicts."""
 728         try:
 729             for _ in range(2):
 730                 try:
 731                     self.initialize()
 732                     self.to_screen('Extracting URL: %s' % (
 733                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 734                     ie_result = self._real_extract(url)
 735                     if ie_result is None:
 736                         return None
 737                     if self._x_forwarded_for_ip:
 738                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 739                     subtitles = ie_result.get('subtitles') or {}
 740                     if 'no-live-chat' in self.get_param('compat_opts'):
 741                         for lang in ('live_chat', 'comments', 'danmaku'):
 742                             subtitles.pop(lang, None)
 743                     return ie_result
 744                 except GeoRestrictedError as e:
 745                     if self.__maybe_fake_ip_and_retry(e.countries):
 746                         continue
 747                     raise
 748         except UnsupportedError:
 749             raise
 750         except ExtractorError as e:
 751             e.video_id = e.video_id or self.get_temp_id(url)
 752             e.ie = e.ie or self.IE_NAME
 753             e.traceback = e.traceback or sys.exc_info()[2]
 754             raise
 755         except IncompleteRead as e:
 756             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 757         except (KeyError, StopIteration) as e:
 758             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 759
 760     def __maybe_fake_ip_and_retry(self, countries):
 761         if (not self.get_param('geo_bypass_country', None)
 762                 and self._GEO_BYPASS
 763                 and self.get_param('geo_bypass', True)
 764                 and not self._x_forwarded_for_ip
 765                 and countries):
 766             country_code = random.choice(countries)
 767             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 768             if self._x_forwarded_for_ip:
 769                 self.report_warning(
 770                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 771                     % (self._x_forwarded_for_ip, country_code.upper()))
 772                 return True
 773         return False
 774
 775     def set_downloader(self, downloader):
 776         """Sets a YoutubeDL instance as the downloader for this IE."""
 777         self._downloader = downloader
 778
 779     @property
 780     def cache(self):
 781         return self._downloader.cache
 782
 783     @property
 784     def cookiejar(self):
 785         return self._downloader.cookiejar
 786
 787     def _initialize_pre_login(self):
 788         """ Initialization before login. Redefine in subclasses."""
 789         pass
 790
 791     def _perform_login(self, username, password):
 792         """ Login with username and password. Redefine in subclasses."""
 793         pass
 794
 795     def _real_initialize(self):
 796         """Real initialization process. Redefine in subclasses."""
 797         pass
 798
 799     def _real_extract(self, url):
 800         """Real extraction process. Redefine in subclasses."""
 801         raise NotImplementedError('This method must be implemented by subclasses')
 802
 803     @classmethod
 804     def ie_key(cls):
 805         """A string for getting the InfoExtractor with get_info_extractor"""
 806         return cls.__name__[:-2]
 807
 808     @classproperty
 809     def IE_NAME(cls):
 810         return cls.__name__[:-2]
 811
 812     @staticmethod
 813     def __can_accept_status_code(err, expected_status):
 814         assert isinstance(err, HTTPError)
 815         if expected_status is None:
 816             return False
 817         elif callable(expected_status):
 818             return expected_status(err.status) is True
 819         else:
 820             return err.status in variadic(expected_status)
 821
 822     def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
 823         if isinstance(url_or_request, urllib.request.Request):
 824             self._downloader.deprecation_warning(
 825                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 826                 'Use yt_dlp.networking.common.Request instead.')
 827             url_or_request = urllib_req_to_req(url_or_request)
 828         elif not isinstance(url_or_request, Request):
 829             url_or_request = Request(url_or_request)
 830
 831         url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
 832         return url_or_request
 833
 834     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
 835                          headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
 836         """
 837         Return the response handle.
 838
 839         See _download_webpage docstring for arguments specification.
 840         """
 841         if not self._downloader._first_webpage_request:
 842             sleep_interval = self.get_param('sleep_interval_requests') or 0
 843             if sleep_interval > 0:
 844                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 845                 time.sleep(sleep_interval)
 846         else:
 847             self._downloader._first_webpage_request = False
 848
 849         if note is None:
 850             self.report_download_webpage(video_id)
 851         elif note is not False:
 852             if video_id is None:
 853                 self.to_screen(str(note))
 854             else:
 855                 self.to_screen(f'{video_id}: {note}')
 856
 857         # Some sites check X-Forwarded-For HTTP header in order to figure out
 858         # the origin of the client behind proxy. This allows bypassing geo
 859         # restriction by faking this header's value to IP that belongs to some
 860         # geo unrestricted country. We will do so once we encounter any
 861         # geo restriction error.
 862         if self._x_forwarded_for_ip:
 863             headers = (headers or {}).copy()
 864             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 865
 866         extensions = {}
 867
 868         if impersonate in (True, ''):
 869             impersonate = ImpersonateTarget()
 870         requested_targets = [
 871             t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
 872             for t in variadic(impersonate)
 873         ] if impersonate else []
 874
 875         available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
 876         if available_target:
 877             extensions['impersonate'] = available_target
 878         elif requested_targets:
 879             message = 'The extractor is attempting impersonation, but '
 880             message += (
 881                 'no impersonate target is available' if not str(impersonate)
 882                 else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
 883             info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
 884                         'for information on installing the required dependencies')
 885             if require_impersonation:
 886                 raise ExtractorError(f'{message}; {info_msg}', expected=True)
 887             self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
 888
 889         try:
 890             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
 891         except network_exceptions as err:
 892             if isinstance(err, HTTPError):
 893                 if self.__can_accept_status_code(err, expected_status):
 894                     return err.response
 895
 896             if errnote is False:
 897                 return False
 898             if errnote is None:
 899                 errnote = 'Unable to download webpage'
 900
 901             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 902             if fatal:
 903                 raise ExtractorError(errmsg, cause=err)
 904             else:
 905                 self.report_warning(errmsg)
 906                 return False
 907
 908     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 909                                  encoding=None, data=None, headers={}, query={}, expected_status=None,
 910                                  impersonate=None, require_impersonation=False):
 911         """
 912         Return a tuple (page content as string, URL handle).
 913
 914         Arguments:
 915         url_or_request -- plain text URL as a string or
 916             a yt_dlp.networking.Request object
 917         video_id -- Video/playlist/item identifier (string)
 918
 919         Keyword arguments:
 920         note -- note printed before downloading (string)
 921         errnote -- note printed in case of an error (string)
 922         fatal -- flag denoting whether error should be considered fatal,
 923             i.e. whether it should cause ExtractionError to be raised,
 924             otherwise a warning will be reported and extraction continued
 925         encoding -- encoding for a page content decoding, guessed automatically
 926             when not explicitly specified
 927         data -- POST data (bytes)
 928         headers -- HTTP headers (dict)
 929         query -- URL query (dict)
 930         expected_status -- allows to accept failed HTTP requests (non 2xx
 931             status code) by explicitly specifying a set of accepted status
 932             codes. Can be any of the following entities:
 933                 - an integer type specifying an exact failed status code to
 934                   accept
 935                 - a list or a tuple of integer types specifying a list of
 936                   failed status codes to accept
 937                 - a callable accepting an actual failed status code and
 938                   returning True if it should be accepted
 939             Note that this argument does not affect success status codes (2xx)
 940             which are always accepted.
 941         impersonate -- the impersonate target. Can be any of the following entities:
 942                 - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
 943                 - a string in the format of CLIENT[:OS]
 944                 - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
 945                 - a boolean value; True means any impersonate target is sufficient
 946         require_impersonation -- flag to toggle whether the request should raise an error
 947             if impersonation is not possible (bool, default: False)
 948         """
 949
 950         # Strip hashes from the URL (#1038)
 951         if isinstance(url_or_request, str):
 952             url_or_request = url_or_request.partition('#')[0]
 953
 954         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
 955                                      headers=headers, query=query, expected_status=expected_status,
 956                                      impersonate=impersonate, require_impersonation=require_impersonation)
 957         if urlh is False:
 958             assert not fatal
 959             return False
 960         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 961         return (content, urlh)
 962
 963     @staticmethod
 964     def _guess_encoding_from_content(content_type, webpage_bytes):
 965         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 966         if m:
 967             encoding = m.group(1)
 968         else:
 969             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 970                           webpage_bytes[:1024])
 971             if m:
 972                 encoding = m.group(1).decode('ascii')
 973             elif webpage_bytes.startswith(b'\xff\xfe'):
 974                 encoding = 'utf-16'
 975             else:
 976                 encoding = 'utf-8'
 977
 978         return encoding
 979
 980     def __check_blocked(self, content):
 981         first_block = content[:512]
 982         if ('<title>Access to this site is blocked</title>' in content
 983                 and 'Websense' in first_block):
 984             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 985             blocked_iframe = self._html_search_regex(
 986                 r'<iframe src="([^"]+)"', content,
 987                 'Websense information URL', default=None)
 988             if blocked_iframe:
 989                 msg += ' Visit %s for more details' % blocked_iframe
 990             raise ExtractorError(msg, expected=True)
 991         if '<title>The URL you requested has been blocked</title>' in first_block:
 992             msg = (
 993                 'Access to this webpage has been blocked by Indian censorship. '
 994                 'Use a VPN or proxy server (with --proxy) to route around it.')
 995             block_msg = self._html_search_regex(
 996                 r'</h1><p>(.*?)</p>',
 997                 content, 'block message', default=None)
 998             if block_msg:
 999                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
1000             raise ExtractorError(msg, expected=True)
1001         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
1002                 and 'blocklist.rkn.gov.ru' in content):
1003             raise ExtractorError(
1004                 'Access to this webpage has been blocked by decision of the Russian government. '
1005                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
1006                 expected=True)
1007
1008     def _request_dump_filename(self, url, video_id):
1009         basen = f'{video_id}_{url}'
1010         trim_length = self.get_param('trim_file_name') or 240
1011         if len(basen) > trim_length:
1012             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
1013             basen = basen[:trim_length - len(h)] + h
1014         filename = sanitize_filename(f'{basen}.dump', restricted=True)
1015         # Working around MAX_PATH limitation on Windows (see
1016         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
1017         if compat_os_name == 'nt':
1018             absfilepath = os.path.abspath(filename)
1019             if len(absfilepath) > 259:
1020                 filename = fR'\\?\{absfilepath}'
1021         return filename
1022
1023     def __decode_webpage(self, webpage_bytes, encoding, headers):
1024         if not encoding:
1025             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
1026         try:
1027             return webpage_bytes.decode(encoding, 'replace')
1028         except LookupError:
1029             return webpage_bytes.decode('utf-8', 'replace')
1030
1031     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
1032         webpage_bytes = urlh.read()
1033         if prefix is not None:
1034             webpage_bytes = prefix + webpage_bytes
1035         if self.get_param('dump_intermediate_pages', False):
1036             self.to_screen('Dumping request to ' + urlh.url)
1037             dump = base64.b64encode(webpage_bytes).decode('ascii')
1038             self._downloader.to_screen(dump)
1039         if self.get_param('write_pages'):
1040             filename = self._request_dump_filename(urlh.url, video_id)
1041             self.to_screen(f'Saving request to {filename}')
1042             with open(filename, 'wb') as outf:
1043                 outf.write(webpage_bytes)
1044
1045         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
1046         self.__check_blocked(content)
1047
1048         return content
1049
1050     def __print_error(self, errnote, fatal, video_id, err):
1051         if fatal:
1052             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1053         elif errnote:
1054             self.report_warning(f'{video_id}: {errnote}: {err}')
1055
1056     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1057         if transform_source:
1058             xml_string = transform_source(xml_string)
1059         try:
1060             return compat_etree_fromstring(xml_string.encode('utf-8'))
1061         except xml.etree.ElementTree.ParseError as ve:
1062             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1063
1064     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1065         try:
1066             return json.loads(
1067                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1068         except ValueError as ve:
1069             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1070
1071     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1072         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1073
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build the pair of methods (_download_<name>_handle, _download_<name>).

        This helper is called from the class body while the class is being
        defined; it is not used as a method itself.

        name -- middle part of the generated method names
        parser -- name of the method used to parse the downloaded content,
            or None to return the content as is
        note, errnote -- default note and errnote of the generated methods
        return_value -- description of the result, used in the generated docstrings
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            """Run the configured parser on content, or return content unchanged when there is none."""
            if parser is None:
                return content
            # errnote is only forwarded when it is False; any other value
            # leaves the parser with its own default error note
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Becomes _download_<name>_handle: downloads, parses and returns
        # (parsed content, URL handle), or False if the download failed non-fatally
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                            impersonate=None, require_impersonation=False):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status,
                impersonate=impersonate, require_impersonation=require_impersonation)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Becomes _download_<name>: same as the handle variant but returns only
        # the parsed content (or False)
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                             impersonate=None, require_impersonation=False):
            # With the load_pages parameter set, a previously dumped response is
            # read from disk first; the network is used only if that fails
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
                'impersonate': impersonate,
                'require_impersonation': require_impersonation,
            }
            # Without a parser the handle method looked up below is the
            # explicitly defined _download_webpage_handle, whose signature has
            # no transform_source parameter
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            """Give func the public name, qualified name and docstring of the method it stands for."""
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1142
    # Methods generated at class-definition time; every call returns the pair
    # (_download_<name>_handle, _download_<name>)
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For plain webpages only the content variant (index 1) is kept, as a private
    # helper; _download_webpage_handle is defined explicitly in this class
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1150
1151     def _download_webpage(
1152             self, url_or_request, video_id, note=None, errnote=None,
1153             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1154         """
1155         Return the data of the page as a string.
1156
1157         Keyword arguments:
1158         tries -- number of tries
1159         timeout -- sleep interval between tries
1160
1161         See _download_webpage_handle docstring for other arguments specification.
1162         """
1163
1164         R''' # NB: These are unused; should they be deprecated?
1165         if tries != 1:
1166             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1167         if timeout is NO_DEFAULT:
1168             timeout = 5
1169         else:
1170             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1171         '''
1172
1173         try_count = 0
1174         while True:
1175             try:
1176                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1177             except IncompleteRead as e:
1178                 try_count += 1
1179                 if try_count >= tries:
1180                     raise e
1181                 self._sleep(timeout, video_id)
1182
1183     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1184         idstr = format_field(video_id, None, '%s: ')
1185         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1186         if only_once:
1187             if f'WARNING: {msg}' in self._printed_messages:
1188                 return
1189             self._printed_messages.add(f'WARNING: {msg}')
1190         self._downloader.report_warning(msg, *args, **kwargs)
1191
1192     def to_screen(self, msg, *args, **kwargs):
1193         """Print msg to screen, prefixing it with '[ie_name]'"""
1194         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1195
1196     def write_debug(self, msg, *args, **kwargs):
1197         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1198
1199     def get_param(self, name, default=None, *args, **kwargs):
1200         if self._downloader:
1201             return self._downloader.params.get(name, default, *args, **kwargs)
1202         return default
1203
1204     def report_drm(self, video_id, partial=NO_DEFAULT):
1205         if partial is not NO_DEFAULT:
1206             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1207         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1208
1209     def report_extraction(self, id_or_name):
1210         """Report information extraction."""
1211         self.to_screen('%s: Extracting information' % id_or_name)
1212
1213     def report_download_webpage(self, video_id):
1214         """Report webpage download."""
1215         self.to_screen('%s: Downloading webpage' % video_id)
1216
1217     def report_age_confirmation(self):
1218         """Report attempt to confirm age."""
1219         self.to_screen('Confirming age')
1220
1221     def report_login(self):
1222         """Report attempt to log in."""
1223         self.to_screen('Logging in')
1224
1225     def raise_login_required(
1226             self, msg='This video is only available for registered users',
1227             metadata_available=False, method=NO_DEFAULT):
1228         if metadata_available and (
1229                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1230             self.report_warning(msg)
1231             return
1232         msg += format_field(self._login_hint(method), None, '. %s')
1233         raise ExtractorError(msg, expected=True)
1234
1235     def raise_geo_restricted(
1236             self, msg='This video is not available from your location due to geo restriction',
1237             countries=None, metadata_available=False):
1238         if metadata_available and (
1239                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1240             self.report_warning(msg)
1241         else:
1242             raise GeoRestrictedError(msg, countries=countries)
1243
1244     def raise_no_formats(self, msg, expected=False, video_id=None):
1245         if expected and (
1246                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1247             self.report_warning(msg, video_id)
1248         elif isinstance(msg, ExtractorError):
1249             raise msg
1250         else:
1251             raise ExtractorError(msg, expected=expected, video_id=video_id)
1252
1253     # Methods for following #608
1254     @staticmethod
1255     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1256         """Returns a URL that points to a page that should be processed"""
1257         if ie is not None:
1258             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1259         if video_id is not None:
1260             kwargs['id'] = video_id
1261         if video_title is not None:
1262             kwargs['title'] = video_title
1263         return {
1264             **kwargs,
1265             '_type': 'url_transparent' if url_transparent else 'url',
1266             'url': url,
1267         }
1268
1269     @classmethod
1270     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1271                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1272         return cls.playlist_result(
1273             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1274             playlist_id, playlist_title, **kwargs)
1275
1276     @staticmethod
1277     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1278         """Returns a playlist"""
1279         if playlist_id:
1280             kwargs['id'] = playlist_id
1281         if playlist_title:
1282             kwargs['title'] = playlist_title
1283         if playlist_description is not None:
1284             kwargs['description'] = playlist_description
1285         return {
1286             **kwargs,
1287             '_type': 'multi_video' if multi_video else 'playlist',
1288             'entries': entries,
1289         }
1290
1291     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1292         """
1293         Perform a regex search on the given string, using a single or a list of
1294         patterns returning the first matching group.
1295         In case of failure return a default value or raise a WARNING or a
1296         RegexNotFoundError, depending on fatal, specifying the field name.
1297         """
1298         if string is None:
1299             mobj = None
1300         elif isinstance(pattern, (str, re.Pattern)):
1301             mobj = re.search(pattern, string, flags)
1302         else:
1303             for p in pattern:
1304                 mobj = re.search(p, string, flags)
1305                 if mobj:
1306                     break
1307
1308         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1309
1310         if mobj:
1311             if group is None:
1312                 # return the first matching group
1313                 return next(g for g in mobj.groups() if g is not None)
1314             elif isinstance(group, (list, tuple)):
1315                 return tuple(mobj.group(g) for g in group)
1316             else:
1317                 return mobj.group(group)
1318         elif default is not NO_DEFAULT:
1319             return default
1320         elif fatal:
1321             raise RegexNotFoundError('Unable to extract %s' % _name)
1322         else:
1323             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1324             return None
1325
1326     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1327                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1328         """Searches string for the JSON object specified by start_pattern"""
1329         # NB: end_pattern is only used to reduce the size of the initial match
1330         if default is NO_DEFAULT:
1331             default, has_default = {}, False
1332         else:
1333             fatal, has_default = False, True
1334
1335         json_string = self._search_regex(
1336             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1337             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1338         if not json_string:
1339             return default
1340
1341         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1342         try:
1343             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1344         except ExtractorError as e:
1345             if fatal:
1346                 raise ExtractorError(
1347                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1348             elif not has_default:
1349                 self.report_warning(
1350                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1351         return default
1352
1353     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1354         """
1355         Like _search_regex, but strips HTML tags and unescapes entities.
1356         """
1357         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1358         if isinstance(res, tuple):
1359             return tuple(map(clean_html, res))
1360         return clean_html(res)
1361
1362     def _get_netrc_login_info(self, netrc_machine=None):
1363         netrc_machine = netrc_machine or self._NETRC_MACHINE
1364
1365         cmd = self.get_param('netrc_cmd')
1366         if cmd:
1367             cmd = cmd.replace('{}', netrc_machine)
1368             self.to_screen(f'Executing command: {cmd}')
1369             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1370             if ret != 0:
1371                 raise OSError(f'Command returned error code {ret}')
1372             info = netrc_from_content(stdout).authenticators(netrc_machine)
1373
1374         elif self.get_param('usenetrc', False):
1375             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1376             if os.path.isdir(netrc_file):
1377                 netrc_file = os.path.join(netrc_file, '.netrc')
1378             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1379
1380         else:
1381             return None, None
1382         if not info:
1383             self.to_screen(f'No authenticators for {netrc_machine}')
1384             return None, None
1385
1386         self.write_debug(f'Using netrc for {netrc_machine} authentication')
1387         return info[0], info[2]
1388
1389     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1390         """
1391         Get the login info as (username, password)
1392         First look for the manually specified credentials using username_option
1393         and password_option as keys in params dictionary. If no such credentials
1394         are available try the netrc_cmd if it is defined or look in the
1395         netrc file using the netrc_machine or _NETRC_MACHINE value.
1396         If there's no info available, return (None, None)
1397         """
1398
1399         username = self.get_param(username_option)
1400         if username is not None:
1401             password = self.get_param(password_option)
1402         else:
1403             try:
1404                 username, password = self._get_netrc_login_info(netrc_machine)
1405             except (OSError, netrc.NetrcParseError) as err:
1406                 self.report_warning(f'Failed to parse .netrc: {err}')
1407                 return None, None
1408         return username, password
1409
1410     def _get_tfa_info(self, note='two-factor verification code'):
1411         """
1412         Get the two-factor authentication info
1413         TODO - asking the user will be required for sms/phone verify
1414         currently just uses the command line option
1415         If there's no info available, return None
1416         """
1417
1418         tfa = self.get_param('twofactor')
1419         if tfa is not None:
1420             return tfa
1421
1422         return getpass.getpass('Type %s and press [Return]: ' % note)
1423
1424     # Helper functions for extracting OpenGraph info
1425     @staticmethod
1426     def _og_regexes(prop):
1427         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1428         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1429                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1430         template = r'<meta[^>]+?%s[^>]+?%s'
1431         return [
1432             template % (property_re, content_re),
1433             template % (content_re, property_re),
1434         ]
1435
1436     @staticmethod
1437     def _meta_regex(prop):
1438         return r'''(?isx)<meta
1439                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1440                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1441
1442     def _og_search_property(self, prop, html, name=None, **kargs):
1443         prop = variadic(prop)
1444         if name is None:
1445             name = 'OpenGraph %s' % prop[0]
1446         og_regexes = []
1447         for p in prop:
1448             og_regexes.extend(self._og_regexes(p))
1449         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1450         if escaped is None:
1451             return None
1452         return unescapeHTML(escaped)
1453
1454     def _og_search_thumbnail(self, html, **kargs):
1455         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1456
1457     def _og_search_description(self, html, **kargs):
1458         return self._og_search_property('description', html, fatal=False, **kargs)
1459
1460     def _og_search_title(self, html, *, fatal=False, **kargs):
1461         return self._og_search_property('title', html, fatal=fatal, **kargs)
1462
1463     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1464         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1465         if secure:
1466             regexes = self._og_regexes('video:secure_url') + regexes
1467         return self._html_search_regex(regexes, html, name, **kargs)
1468
1469     def _og_search_url(self, html, **kargs):
1470         return self._og_search_property('url', html, **kargs)
1471
1472     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1473         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1474
1475     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1476         name = variadic(name)
1477         if display_name is None:
1478             display_name = name[0]
1479         return self._html_search_regex(
1480             [self._meta_regex(n) for n in name],
1481             html, display_name, fatal=fatal, group='content', **kwargs)
1482
1483     def _dc_search_uploader(self, html):
1484         return self._html_search_meta('dc.creator', html, 'uploader')
1485
1486     @staticmethod
1487     def _rta_search(html):
1488         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1489         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1490                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1491                      html):
1492             return 18
1493
1494         # And then there are the jokers who advertise that they use RTA, but actually don't.
1495         AGE_LIMIT_MARKERS = [
1496             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1497             r'>[^<]*you acknowledge you are at least (\d+) years old',
1498             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1499         ]
1500
1501         age_limit = 0
1502         for marker in AGE_LIMIT_MARKERS:
1503             mobj = re.search(marker, html)
1504             if mobj:
1505                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1506         return age_limit
1507
1508     def _media_rating_search(self, html):
1509         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1510         rating = self._html_search_meta('rating', html)
1511
1512         if not rating:
1513             return None
1514
1515         RATING_TABLE = {
1516             'safe for kids': 0,
1517             'general': 8,
1518             '14 years': 14,
1519             'mature': 17,
1520             'restricted': 19,
1521         }
1522         return RATING_TABLE.get(rating.lower())
1523
1524     def _family_friendly_search(self, html):
1525         # See http://schema.org/VideoObject
1526         family_friendly = self._html_search_meta(
1527             'isFamilyFriendly', html, default=None)
1528
1529         if not family_friendly:
1530             return None
1531
1532         RATING_TABLE = {
1533             '1': 0,
1534             'true': 0,
1535             '0': 18,
1536             'false': 18,
1537         }
1538         return RATING_TABLE.get(family_friendly.lower())
1539
1540     def _twitter_search_player(self, html):
1541         return self._html_search_meta('twitter:player', html,
1542                                       'twitter card player')
1543
1544     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1545         """Yield all json ld objects in the html"""
1546         if default is not NO_DEFAULT:
1547             fatal = False
1548         for mobj in re.finditer(JSON_LD_RE, html):
1549             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1550             for json_ld in variadic(json_ld_item):
1551                 if isinstance(json_ld, dict):
1552                     yield json_ld
1553
1554     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1555         """Search for a video in any json ld in the html"""
1556         if default is not NO_DEFAULT:
1557             fatal = False
1558         info = self._json_ld(
1559             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1560             video_id, fatal=fatal, expected_type=expected_type)
1561         if info:
1562             return info
1563         if default is not NO_DEFAULT:
1564             return default
1565         elif fatal:
1566             raise RegexNotFoundError('Unable to extract JSON-LD')
1567         else:
1568             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1569             return {}
1570
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build a (partial) info dict from JSON-LD data.

        Arguments:
        json_ld -- the JSON-LD data: a JSON string (parsed here), a dict or a list of dicts
        video_id -- id used when parsing a JSON string
        fatal -- passed to _parse_json when json_ld is a string
        expected_type -- if given, only top-level items of this '@type' are considered
                         and traversal stops after the first one handled

        Returns the collected fields passed through filter_dict
        (presumably dropping the ones that are None), or {} for empty input.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared result; the nested helpers below all write into it
        info = {}

        # schema.org action type -> prefix of the '<kind>_count' info field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            """Whether the '@type' of e (a single type or a list) contains any of expected_types"""
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            """Return the interaction type of e; a dict value is reduced to its '@type'"""
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            """Fill the '<kind>_count' fields of info from the 'interactionStatistic' of e"""
            interaction_statistic = e.get('interactionStatistic')
            # A single counter may be given directly instead of a list of counters
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the last path component of the type is looked up (it may be a URL)
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # The first value found for a given count wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            """Fill info['chapters'] from the 'Clip' parts of e, unless their data is incomplete"""
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk over (previous, current, next) triples to fill in missing boundaries;
            # since zip stops at the shortest input, the last chapter is not visited here
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter ends with the video if it has no end of its own;
                # info['duration'] has been set by extract_video_object at this point
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            """Fill info from a VideoObject/AudioObject e, including counts and chapters"""
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            """Walk the JSON-LD items and fill info depending on the '@type' of each item"""
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level items must carry an '@context'
                if at_top_level and '@context' not in e:
                    continue
                # A pure '@graph' container: process its items instead (without the '@context' requirement)
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name doubles as the title if none has been found yet
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first entry of a 'video' (or else 'subjectOf') list is used
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type all items are merged; with one, the first handled item ends the search
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any other item (and the ones handled above, except Video/AudioObject)
                # may embed a single VideoObject under 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1740
1741     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1742         return self._parse_json(
1743             self._search_regex(
1744                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1745                 webpage, 'next.js data', fatal=fatal, **kw),
1746             video_id, transform_source=transform_source, fatal=fatal)
1747
1748     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1749         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1750         rectx = re.escape(context_name)
1751         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1752         js, arg_keys, arg_vals = self._search_regex(
1753             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1754             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1755             default=NO_DEFAULT if fatal else (None, None, None))
1756         if js is None:
1757             return {}
1758
1759         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1760             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1761
1762         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1763         return traverse_obj(ret, traverse) or {}
1764
1765     @staticmethod
1766     def _hidden_inputs(html):
1767         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1768         hidden_inputs = {}
1769         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1770             attrs = extract_attributes(input)
1771             if not input:
1772                 continue
1773             if attrs.get('type') not in ('hidden', 'submit'):
1774                 continue
1775             name = attrs.get('name') or attrs.get('id')
1776             value = attrs.get('value')
1777             if name and value is not None:
1778                 hidden_inputs[name] = value
1779         return hidden_inputs
1780
1781     def _form_hidden_inputs(self, form_id, html):
1782         form = self._search_regex(
1783             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1784             html, '%s form' % form_id, group='form')
1785         return self._hidden_inputs(form)
1786
    @classproperty(cache=True)
    def FormatSort(cls):
        """
        Deprecated alias that returns a FormatSorter subclass.

        Evaluating the property issues a deprecation warning pointing to
        yt_dlp.utils.FormatSorter (presumably only once, given cache=True - TODO confirm).
        """
        class FormatSort(FormatSorter):
            # NOTE(review): the first parameter is the FormatSort instance itself (named `ie`
            # here), so its _downloader attribute is what gets passed on to FormatSorter
            def __init__(ie, *args, **kwargs):
                super().__init__(ie._downloader, *args, **kwargs)

        deprecation_warning(
            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
            'Use yt_dlp.utils.FormatSorter instead')
        return FormatSort
1797
1798     def _sort_formats(self, formats, field_preference=[]):
1799         if not field_preference:
1800             self._downloader.deprecation_warning(
1801                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1802             return
1803         self._downloader.deprecation_warning(
1804             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1805             'Return _format_sort_fields in the info_dict instead')
1806         if formats:
1807             formats[0]['__sort_fields'] = field_preference
1808
1809     def _check_formats(self, formats, video_id):
1810         if formats:
1811             formats[:] = filter(
1812                 lambda f: self._is_valid_url(
1813                     f['url'], video_id,
1814                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1815                 formats)
1816
1817     @staticmethod
1818     def _remove_duplicate_formats(formats):
1819         format_urls = set()
1820         unique_formats = []
1821         for f in formats:
1822             if f['url'] not in format_urls:
1823                 format_urls.add(f['url'])
1824                 unique_formats.append(f)
1825         formats[:] = unique_formats
1826
1827     def _is_valid_url(self, url, video_id, item='video', headers={}):
1828         url = self._proto_relative_url(url, scheme='http:')
1829         # For now assume non HTTP(S) URLs always valid
1830         if not (url.startswith('http://') or url.startswith('https://')):
1831             return True
1832         try:
1833             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1834             return True
1835         except ExtractorError as e:
1836             self.to_screen(
1837                 '%s: %s URL is invalid, skipping: %s'
1838                 % (video_id, item, error_to_compat_str(e.cause)))
1839             return False
1840
1841     def http_scheme(self):
1842         """ Either "http:" or "https:", depending on the user's preferences """
1843         return (
1844             'http:'
1845             if self.get_param('prefer_insecure', False)
1846             else 'https:')
1847
1848     def _proto_relative_url(self, url, scheme=None):
1849         scheme = scheme or self.http_scheme()
1850         assert scheme.endswith(':')
1851         return sanitize_url(url, scheme=scheme[:-1])
1852
1853     def _sleep(self, timeout, video_id, msg_template=None):
1854         if msg_template is None:
1855             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1856         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1857         self.to_screen(msg)
1858         time.sleep(timeout)
1859
1860     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1861                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1862                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1863         if self.get_param('ignore_no_formats_error'):
1864             fatal = False
1865
1866         res = self._download_xml_handle(
1867             manifest_url, video_id, 'Downloading f4m manifest',
1868             'Unable to download f4m manifest',
1869             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1870             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1871             transform_source=transform_source,
1872             fatal=fatal, data=data, headers=headers, query=query)
1873         if res is False:
1874             return []
1875
1876         manifest, urlh = res
1877         manifest_url = urlh.url
1878
1879         return self._parse_f4m_formats(
1880             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1881             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1882
1883     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1884                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1885                            fatal=True, m3u8_id=None):
1886         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1887             return []
1888
1889         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1890         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1891         if akamai_pv is not None and ';' in akamai_pv.text:
1892             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1893             if playerVerificationChallenge.strip() != '':
1894                 return []
1895
1896         formats = []
1897         manifest_version = '1.0'
1898         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1899         if not media_nodes:
1900             manifest_version = '2.0'
1901             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1902         # Remove unsupported DRM protected media from final formats
1903         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1904         media_nodes = remove_encrypted_media(media_nodes)
1905         if not media_nodes:
1906             return formats
1907
1908         manifest_base_url = get_base_url(manifest)
1909
1910         bootstrap_info = xpath_element(
1911             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1912             'bootstrap info', default=None)
1913
1914         vcodec = None
1915         mime_type = xpath_text(
1916             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1917             'base URL', default=None)
1918         if mime_type and mime_type.startswith('audio/'):
1919             vcodec = 'none'
1920
1921         for i, media_el in enumerate(media_nodes):
1922             tbr = int_or_none(media_el.attrib.get('bitrate'))
1923             width = int_or_none(media_el.attrib.get('width'))
1924             height = int_or_none(media_el.attrib.get('height'))
1925             format_id = join_nonempty(f4m_id, tbr or i)
1926             # If <bootstrapInfo> is present, the specified f4m is a
1927             # stream-level manifest, and only set-level manifests may refer to
1928             # external resources.  See section 11.4 and section 4 of F4M spec
1929             if bootstrap_info is None:
1930                 media_url = None
1931                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1932                 if manifest_version == '2.0':
1933                     media_url = media_el.attrib.get('href')
1934                 if media_url is None:
1935                     media_url = media_el.attrib.get('url')
1936                 if not media_url:
1937                     continue
1938                 manifest_url = (
1939                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1940                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1941                 # If media_url is itself a f4m manifest do the recursive extraction
1942                 # since bitrates in parent manifest (this one) and media_url manifest
1943                 # may differ leading to inability to resolve the format by requested
1944                 # bitrate in f4m downloader
1945                 ext = determine_ext(manifest_url)
1946                 if ext == 'f4m':
1947                     f4m_formats = self._extract_f4m_formats(
1948                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1949                         transform_source=transform_source, fatal=fatal)
1950                     # Sometimes stream-level manifest contains single media entry that
1951                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1952                     # At the same time parent's media entry in set-level manifest may
1953                     # contain it. We will copy it from parent in such cases.
1954                     if len(f4m_formats) == 1:
1955                         f = f4m_formats[0]
1956                         f.update({
1957                             'tbr': f.get('tbr') or tbr,
1958                             'width': f.get('width') or width,
1959                             'height': f.get('height') or height,
1960                             'format_id': f.get('format_id') if not tbr else format_id,
1961                             'vcodec': vcodec,
1962                         })
1963                     formats.extend(f4m_formats)
1964                     continue
1965                 elif ext == 'm3u8':
1966                     formats.extend(self._extract_m3u8_formats(
1967                         manifest_url, video_id, 'mp4', preference=preference,
1968                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1969                     continue
1970             formats.append({
1971                 'format_id': format_id,
1972                 'url': manifest_url,
1973                 'manifest_url': manifest_url,
1974                 'ext': 'flv' if bootstrap_info is not None else None,
1975                 'protocol': 'f4m',
1976                 'tbr': tbr,
1977                 'width': width,
1978                 'height': height,
1979                 'vcodec': vcodec,
1980                 'preference': preference,
1981                 'quality': quality,
1982             })
1983         return formats
1984
1985     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1986         return {
1987             'format_id': join_nonempty(m3u8_id, 'meta'),
1988             'url': m3u8_url,
1989             'ext': ext,
1990             'protocol': 'm3u8',
1991             'preference': preference - 100 if preference else -100,
1992             'quality': quality,
1993             'resolution': 'multiple',
1994             'format_note': 'Quality selection URL',
1995         }
1996
1997     def _report_ignoring_subs(self, name):
1998         self.report_warning(bug_reports_message(
1999             f'Ignoring subtitle tracks found in the {name} manifest; '
2000             'if any subtitle tracks are missing,'
2001         ), only_once=True)
2002
2003     def _extract_m3u8_formats(self, *args, **kwargs):
2004         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2005         if subs:
2006             self._report_ignoring_subs('HLS')
2007         return fmts
2008
2009     def _extract_m3u8_formats_and_subtitles(
2010             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2011             preference=None, quality=None, m3u8_id=None, note=None,
2012             errnote=None, fatal=True, live=False, data=None, headers={},
2013             query={}):
2014
2015         if self.get_param('ignore_no_formats_error'):
2016             fatal = False
2017
2018         if not m3u8_url:
2019             if errnote is not False:
2020                 errnote = errnote or 'Failed to obtain m3u8 URL'
2021                 if fatal:
2022                     raise ExtractorError(errnote, video_id=video_id)
2023                 self.report_warning(f'{errnote}{bug_reports_message()}')
2024             return [], {}
2025
2026         res = self._download_webpage_handle(
2027             m3u8_url, video_id,
2028             note='Downloading m3u8 information' if note is None else note,
2029             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2030             fatal=fatal, data=data, headers=headers, query=query)
2031
2032         if res is False:
2033             return [], {}
2034
2035         m3u8_doc, urlh = res
2036         m3u8_url = urlh.url
2037
2038         return self._parse_m3u8_formats_and_subtitles(
2039             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2040             preference=preference, quality=quality, m3u8_id=m3u8_id,
2041             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2042             headers=headers, query=query, video_id=video_id)
2043
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse the text of an HLS (m3u8) playlist into (formats, subtitles).

        A document containing '#EXT-X-TARGETDURATION' is treated as a media
        playlist and returned as is: the resulting format(s) point at m3u8_url,
        or at a data URI built from m3u8_doc when no m3u8_url is given.
        Otherwise the document is parsed as a master playlist: '#EXT-X-MEDIA'
        tags yield audio/video rendition formats and subtitle tracks, and each
        URI line following an '#EXT-X-STREAM-INF' tag yields a variant format.

        When the 'hls_split_discontinuity' param is set, one format per
        discontinuity-delimited part is produced ('format_index' 0..N), which
        downloads every referenced playlist using video_id, fatal, data and
        headers; otherwise 'format_index' is None.

        Relative URIs are resolved against m3u8_url. Unless live is true, the
        format_id of a variant includes the stream name or its bitrate.
        note, errnote and query are accepted but not referenced in this body.

        Returns a tuple (formats, subtitles) where subtitles maps a language
        ('und' when the tag has no LANGUAGE) to a list of subtitle dicts.
        """
        formats, subtitles = [], {}
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are kept, anything else is joined onto the playlist URL
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # Returns one index per part of the playlist, i.e. the number
                # of lines starting with #EXT-X-DISCONTINUITY plus one.
                # The playlist is downloaded when its text is not supplied
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # No splitting: a single format with format_index None
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset after its URI line is consumed
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            # Renditions without a URI get no format entry of their own
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is taken as the URI of a variant stream
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Extra plain-HTTP format derived from the HLS one
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2262
2263     def _extract_m3u8_vod_duration(
2264             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2265
2266         m3u8_vod = self._download_webpage(
2267             m3u8_vod_url, video_id,
2268             note='Downloading m3u8 VOD manifest' if note is None else note,
2269             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2270             fatal=False, data=data, headers=headers, query=query)
2271
2272         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2273
2274     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2275         if '#EXT-X-ENDLIST' not in m3u8_vod:
2276             return None
2277
2278         return int(sum(
2279             float(line[len('#EXTINF:'):].split(',')[0])
2280             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2281
2282     def _extract_mpd_vod_duration(
2283             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2284
2285         mpd_doc = self._download_xml(
2286             mpd_url, video_id,
2287             note='Downloading MPD VOD manifest' if note is None else note,
2288             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2289             fatal=False, data=data, headers=headers, query=query)
2290         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2291             return None
2292         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2293
2294     @staticmethod
2295     def _xpath_ns(path, namespace=None):
2296         if not namespace:
2297             return path
2298         out = []
2299         for c in path.split('/'):
2300             if not c or c == '.':
2301                 out.append(c)
2302             else:
2303                 out.append('{%s}%s' % (namespace, c))
2304         return '/'.join(out)
2305
2306     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2307         if self.get_param('ignore_no_formats_error'):
2308             fatal = False
2309
2310         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2311         if res is False:
2312             assert not fatal
2313             return [], {}
2314         smil, urlh = res
2315
2316         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2317                                                       namespace=self._parse_smil_namespace(smil))
2318
2319     def _extract_smil_formats(self, *args, **kwargs):
2320         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2321         if subs:
2322             self._report_ignoring_subs('SMIL')
2323         return fmts
2324
2325     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2326         res = self._download_smil(smil_url, video_id, fatal=fatal)
2327         if res is False:
2328             return {}
2329
2330         smil, urlh = res
2331         smil_url = urlh.url
2332
2333         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2334
2335     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2336         return self._download_xml_handle(
2337             smil_url, video_id, 'Downloading SMIL file',
2338             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2339
2340     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2341         namespace = self._parse_smil_namespace(smil)
2342
2343         formats, subtitles = self._parse_smil_formats_and_subtitles(
2344             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2345
2346         video_id = os.path.splitext(url_basename(smil_url))[0]
2347         title = None
2348         description = None
2349         upload_date = None
2350         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2351             name = meta.attrib.get('name')
2352             content = meta.attrib.get('content')
2353             if not name or not content:
2354                 continue
2355             if not title and name == 'title':
2356                 title = content
2357             elif not description and name in ('description', 'abstract'):
2358                 description = content
2359             elif not upload_date and name == 'date':
2360                 upload_date = unified_strdate(content)
2361
2362         thumbnails = [{
2363             'id': image.get('type'),
2364             'url': image.get('src'),
2365             'width': int_or_none(image.get('width')),
2366             'height': int_or_none(image.get('height')),
2367         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2368
2369         return {
2370             'id': video_id,
2371             'title': title or video_id,
2372             'description': description,
2373             'upload_date': upload_date,
2374             'thumbnails': thumbnails,
2375             'formats': formats,
2376             'subtitles': subtitles,
2377         }
2378
2379     def _parse_smil_namespace(self, smil):
2380         return self._search_regex(
2381             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2382
2383     def _parse_smil_formats(self, *args, **kwargs):
2384         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2385         if subs:
2386             self._report_ignoring_subs('SMIL')
2387         return fmts
2388
2389     def _parse_smil_formats_and_subtitles(
2390             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2391         base = smil_url
2392         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2393             b = meta.get('base') or meta.get('httpBase')
2394             if b:
2395                 base = b
2396                 break
2397
2398         formats, subtitles = [], {}
2399         rtmp_count = 0
2400         http_count = 0
2401         m3u8_count = 0
2402         imgs_count = 0
2403
2404         srcs = set()
2405         media = itertools.chain.from_iterable(
2406             smil.findall(self._xpath_ns(arg, namespace))
2407             for arg in ['.//video', './/audio', './/media'])
2408         for medium in media:
2409             src = medium.get('src')
2410             if not src or src in srcs:
2411                 continue
2412             srcs.add(src)
2413
2414             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2415             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2416             width = int_or_none(medium.get('width'))
2417             height = int_or_none(medium.get('height'))
2418             proto = medium.get('proto')
2419             ext = medium.get('ext')
2420             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2421                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2422             streamer = medium.get('streamer') or base
2423
2424             if proto == 'rtmp' or streamer.startswith('rtmp'):
2425                 rtmp_count += 1
2426                 formats.append({
2427                     'url': streamer,
2428                     'play_path': src,
2429                     'ext': 'flv',
2430                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2431                     'tbr': bitrate,
2432                     'filesize': filesize,
2433                     'width': width,
2434                     'height': height,
2435                 })
2436                 if transform_rtmp_url:
2437                     streamer, src = transform_rtmp_url(streamer, src)
2438                     formats[-1].update({
2439                         'url': streamer,
2440                         'play_path': src,
2441                     })
2442                 continue
2443
2444             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2445             src_url = src_url.strip()
2446
2447             if proto == 'm3u8' or src_ext == 'm3u8':
2448                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2449                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2450                 self._merge_subtitles(m3u8_subs, target=subtitles)
2451                 if len(m3u8_formats) == 1:
2452                     m3u8_count += 1
2453                     m3u8_formats[0].update({
2454                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2455                         'tbr': bitrate,
2456                         'width': width,
2457                         'height': height,
2458                     })
2459                 formats.extend(m3u8_formats)
2460             elif src_ext == 'f4m':
2461                 f4m_url = src_url
2462                 if not f4m_params:
2463                     f4m_params = {
2464                         'hdcore': '3.2.0',
2465                         'plugin': 'flowplayer-3.2.0.1',
2466                     }
2467                 f4m_url += '&' if '?' in f4m_url else '?'
2468                 f4m_url += urllib.parse.urlencode(f4m_params)
2469                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2470             elif src_ext == 'mpd':
2471                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2472                     src_url, video_id, mpd_id='dash', fatal=False)
2473                 formats.extend(mpd_formats)
2474                 self._merge_subtitles(mpd_subs, target=subtitles)
2475             elif re.search(r'\.ism/[Mm]anifest', src_url):
2476                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2477                     src_url, video_id, ism_id='mss', fatal=False)
2478                 formats.extend(ism_formats)
2479                 self._merge_subtitles(ism_subs, target=subtitles)
2480             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2481                 http_count += 1
2482                 formats.append({
2483                     'url': src_url,
2484                     'ext': ext or src_ext or 'flv',
2485                     'format_id': 'http-%d' % (bitrate or http_count),
2486                     'tbr': bitrate,
2487                     'filesize': filesize,
2488                     'width': width,
2489                     'height': height,
2490                 })
2491
2492         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2493             src = medium.get('src')
2494             if not src or src in srcs:
2495                 continue
2496             srcs.add(src)
2497
2498             imgs_count += 1
2499             formats.append({
2500                 'format_id': 'imagestream-%d' % (imgs_count),
2501                 'url': src,
2502                 'ext': mimetype2ext(medium.get('type')),
2503                 'acodec': 'none',
2504                 'vcodec': 'none',
2505                 'width': int_or_none(medium.get('width')),
2506                 'height': int_or_none(medium.get('height')),
2507                 'format_note': 'SMIL storyboards',
2508             })
2509
2510         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2511         self._merge_subtitles(smil_subs, target=subtitles)
2512
2513         return formats, subtitles
2514
2515     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2516         urls = []
2517         subtitles = {}
2518         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2519             src = textstream.get('src')
2520             if not src or src in urls:
2521                 continue
2522             urls.append(src)
2523             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2524             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2525             subtitles.setdefault(lang, []).append({
2526                 'url': src,
2527                 'ext': ext,
2528             })
2529         return subtitles
2530
2531     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2532         res = self._download_xml_handle(
2533             xspf_url, playlist_id, 'Downloading xpsf playlist',
2534             'Unable to download xspf manifest', fatal=fatal)
2535         if res is False:
2536             return []
2537
2538         xspf, urlh = res
2539         xspf_url = urlh.url
2540
2541         return self._parse_xspf(
2542             xspf, playlist_id, xspf_url=xspf_url,
2543             xspf_base_url=base_url(xspf_url))
2544
2545     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2546         NS_MAP = {
2547             'xspf': 'http://xspf.org/ns/0/',
2548             's1': 'http://static.streamone.nl/player/ns/0',
2549         }
2550
2551         entries = []
2552         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2553             title = xpath_text(
2554                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2555             description = xpath_text(
2556                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2557             thumbnail = xpath_text(
2558                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2559             duration = float_or_none(
2560                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2561
2562             formats = []
2563             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2564                 format_url = urljoin(xspf_base_url, location.text)
2565                 if not format_url:
2566                     continue
2567                 formats.append({
2568                     'url': format_url,
2569                     'manifest_url': xspf_url,
2570                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2571                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2572                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2573                 })
2574
2575             entries.append({
2576                 'id': playlist_id,
2577                 'title': title,
2578                 'description': description,
2579                 'thumbnail': thumbnail,
2580                 'duration': duration,
2581                 'formats': formats,
2582             })
2583         return entries
2584
2585     def _extract_mpd_formats(self, *args, **kwargs):
2586         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2587         if subs:
2588             self._report_ignoring_subs('DASH')
2589         return fmts
2590
2591     def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2592         periods = self._extract_mpd_periods(*args, **kwargs)
2593         return self._merge_mpd_periods(periods)
2594
2595     def _extract_mpd_periods(
2596             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2597             fatal=True, data=None, headers={}, query={}):
2598
2599         if self.get_param('ignore_no_formats_error'):
2600             fatal = False
2601
2602         res = self._download_xml_handle(
2603             mpd_url, video_id,
2604             note='Downloading MPD manifest' if note is None else note,
2605             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2606             fatal=fatal, data=data, headers=headers, query=query)
2607         if res is False:
2608             return []
2609         mpd_doc, urlh = res
2610         if mpd_doc is None:
2611             return []
2612
2613         # We could have been redirected to a new url when we retrieved our mpd file.
2614         mpd_url = urlh.url
2615         mpd_base_url = base_url(mpd_url)
2616
2617         return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2618
2619     def _parse_mpd_formats(self, *args, **kwargs):
2620         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2621         if subs:
2622             self._report_ignoring_subs('DASH')
2623         return fmts
2624
2625     def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2626         periods = self._parse_mpd_periods(*args, **kwargs)
2627         return self._merge_mpd_periods(periods)
2628
2629     def _merge_mpd_periods(self, periods):
2630         """
2631         Combine all formats and subtitles from an MPD manifest into a single list,
2632         by concatenate streams with similar formats.
2633         """
2634         formats, subtitles = {}, {}
2635         for period in periods:
2636             for f in period['formats']:
2637                 assert 'is_dash_periods' not in f, 'format already processed'
2638                 f['is_dash_periods'] = True
2639                 format_key = tuple(v for k, v in f.items() if k not in (
2640                     ('format_id', 'fragments', 'manifest_stream_number')))
2641                 if format_key not in formats:
2642                     formats[format_key] = f
2643                 elif 'fragments' in f:
2644                     formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2645
2646             if subtitles and period['subtitles']:
2647                 self.report_warning(bug_reports_message(
2648                     'Found subtitles in multiple periods in the DASH manifest; '
2649                     'if part of the subtitles are missing,'
2650                 ), only_once=True)
2651
2652             for sub_lang, sub_info in period['subtitles'].items():
2653                 subtitles.setdefault(sub_lang, []).extend(sub_info)
2654
2655         return list(formats.values()), subtitles
2656
2657     def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2658         """
2659         Parse formats from MPD manifest.
2660         References:
2661          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2662             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2663          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2664         """
2665         if not self.get_param('dynamic_mpd', True):
2666             if mpd_doc.get('type') == 'dynamic':
2667                 return [], {}
2668
2669         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2670
2671         def _add_ns(path):
2672             return self._xpath_ns(path, namespace)
2673
2674         def is_drm_protected(element):
2675             return element.find(_add_ns('ContentProtection')) is not None
2676
2677         def extract_multisegment_info(element, ms_parent_info):
2678             ms_info = ms_parent_info.copy()
2679
2680             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2681             # common attributes and elements.  We will only extract relevant
2682             # for us.
2683             def extract_common(source):
2684                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2685                 if segment_timeline is not None:
2686                     s_e = segment_timeline.findall(_add_ns('S'))
2687                     if s_e:
2688                         ms_info['total_number'] = 0
2689                         ms_info['s'] = []
2690                         for s in s_e:
2691                             r = int(s.get('r', 0))
2692                             ms_info['total_number'] += 1 + r
2693                             ms_info['s'].append({
2694                                 't': int(s.get('t', 0)),
2695                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2696                                 'd': int(s.attrib['d']),
2697                                 'r': r,
2698                             })
2699                 start_number = source.get('startNumber')
2700                 if start_number:
2701                     ms_info['start_number'] = int(start_number)
2702                 timescale = source.get('timescale')
2703                 if timescale:
2704                     ms_info['timescale'] = int(timescale)
2705                 segment_duration = source.get('duration')
2706                 if segment_duration:
2707                     ms_info['segment_duration'] = float(segment_duration)
2708
2709             def extract_Initialization(source):
2710                 initialization = source.find(_add_ns('Initialization'))
2711                 if initialization is not None:
2712                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2713
2714             segment_list = element.find(_add_ns('SegmentList'))
2715             if segment_list is not None:
2716                 extract_common(segment_list)
2717                 extract_Initialization(segment_list)
2718                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2719                 if segment_urls_e:
2720                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2721             else:
2722                 segment_template = element.find(_add_ns('SegmentTemplate'))
2723                 if segment_template is not None:
2724                     extract_common(segment_template)
2725                     media = segment_template.get('media')
2726                     if media:
2727                         ms_info['media'] = media
2728                     initialization = segment_template.get('initialization')
2729                     if initialization:
2730                         ms_info['initialization'] = initialization
2731                     else:
2732                         extract_Initialization(segment_template)
2733             return ms_info
2734
2735         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2736         stream_numbers = collections.defaultdict(int)
2737         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2738             period_entry = {
2739                 'id': period.get('id', f'period-{period_idx}'),
2740                 'formats': [],
2741                 'subtitles': collections.defaultdict(list),
2742             }
2743             period_duration = parse_duration(period.get('duration')) or mpd_duration
2744             period_ms_info = extract_multisegment_info(period, {
2745                 'start_number': 1,
2746                 'timescale': 1,
2747             })
2748             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2749                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2750                 for representation in adaptation_set.findall(_add_ns('Representation')):
2751                     representation_attrib = adaptation_set.attrib.copy()
2752                     representation_attrib.update(representation.attrib)
2753                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2754                     mime_type = representation_attrib['mimeType']
2755                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2756
2757                     codec_str = representation_attrib.get('codecs', '')
2758                     # Some kind of binary subtitle found in some youtube livestreams
2759                     if mime_type == 'application/x-rawcc':
2760                         codecs = {'scodec': codec_str}
2761                     else:
2762                         codecs = parse_codecs(codec_str)
2763                     if content_type not in ('video', 'audio', 'text'):
2764                         if mime_type == 'image/jpeg':
2765                             content_type = mime_type
2766                         elif codecs.get('vcodec', 'none') != 'none':
2767                             content_type = 'video'
2768                         elif codecs.get('acodec', 'none') != 'none':
2769                             content_type = 'audio'
2770                         elif codecs.get('scodec', 'none') != 'none':
2771                             content_type = 'text'
2772                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2773                             content_type = 'text'
2774                         else:
2775                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2776                             continue
2777
2778                     base_url = ''
2779                     for element in (representation, adaptation_set, period, mpd_doc):
2780                         base_url_e = element.find(_add_ns('BaseURL'))
2781                         if try_call(lambda: base_url_e.text) is not None:
2782                             base_url = base_url_e.text + base_url
2783                             if re.match(r'^https?://', base_url):
2784                                 break
2785                     if mpd_base_url and base_url.startswith('/'):
2786                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2787                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2788                         if not mpd_base_url.endswith('/'):
2789                             mpd_base_url += '/'
2790                         base_url = mpd_base_url + base_url
2791                     representation_id = representation_attrib.get('id')
2792                     lang = representation_attrib.get('lang')
2793                     url_el = representation.find(_add_ns('BaseURL'))
2794                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2795                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2796                     if representation_id is not None:
2797                         format_id = representation_id
2798                     else:
2799                         format_id = content_type
2800                     if mpd_id:
2801                         format_id = mpd_id + '-' + format_id
2802                     if content_type in ('video', 'audio'):
2803                         f = {
2804                             'format_id': format_id,
2805                             'manifest_url': mpd_url,
2806                             'ext': mimetype2ext(mime_type),
2807                             'width': int_or_none(representation_attrib.get('width')),
2808                             'height': int_or_none(representation_attrib.get('height')),
2809                             'tbr': float_or_none(bandwidth, 1000),
2810                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2811                             'fps': int_or_none(representation_attrib.get('frameRate')),
2812                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2813                             'format_note': 'DASH %s' % content_type,
2814                             'filesize': filesize,
2815                             'container': mimetype2ext(mime_type) + '_dash',
2816                             **codecs
2817                         }
2818                     elif content_type == 'text':
2819                         f = {
2820                             'ext': mimetype2ext(mime_type),
2821                             'manifest_url': mpd_url,
2822                             'filesize': filesize,
2823                         }
2824                     elif content_type == 'image/jpeg':
2825                         # See test case in VikiIE
2826                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2827                         f = {
2828                             'format_id': format_id,
2829                             'ext': 'mhtml',
2830                             'manifest_url': mpd_url,
2831                             'format_note': 'DASH storyboards (jpeg)',
2832                             'acodec': 'none',
2833                             'vcodec': 'none',
2834                         }
2835                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2836                         f['has_drm'] = True
2837                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2838
2839                     def prepare_template(template_name, identifiers):
2840                         tmpl = representation_ms_info[template_name]
2841                         if representation_id is not None:
2842                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2843                         # First of, % characters outside $...$ templates
2844                         # must be escaped by doubling for proper processing
2845                         # by % operator string formatting used further (see
2846                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2847                         t = ''
2848                         in_template = False
2849                         for c in tmpl:
2850                             t += c
2851                             if c == '$':
2852                                 in_template = not in_template
2853                             elif c == '%' and not in_template:
2854                                 t += c
2855                         # Next, $...$ templates are translated to their
2856                         # %(...) counterparts to be used with % operator
2857                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2858                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2859                         t.replace('$$', '$')
2860                         return t
2861
2862                     # @initialization is a regular template like @media one
2863                     # so it should be handled just the same way (see
2864                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2865                     if 'initialization' in representation_ms_info:
2866                         initialization_template = prepare_template(
2867                             'initialization',
2868                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2869                             # $Time$ shall not be included for @initialization thus
2870                             # only $Bandwidth$ remains
2871                             ('Bandwidth', ))
2872                         representation_ms_info['initialization_url'] = initialization_template % {
2873                             'Bandwidth': bandwidth,
2874                         }
2875
2876                     def location_key(location):
2877                         return 'url' if re.match(r'^https?://', location) else 'path'
2878
2879                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2880
2881                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2882                         media_location_key = location_key(media_template)
2883
2884                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2885                         # can't be used at the same time
2886                         if '%(Number' in media_template and 's' not in representation_ms_info:
2887                             segment_duration = None
2888                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2889                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2890                                 representation_ms_info['total_number'] = int(math.ceil(
2891                                     float_or_none(period_duration, segment_duration, default=0)))
2892                             representation_ms_info['fragments'] = [{
2893                                 media_location_key: media_template % {
2894                                     'Number': segment_number,
2895                                     'Bandwidth': bandwidth,
2896                                 },
2897                                 'duration': segment_duration,
2898                             } for segment_number in range(
2899                                 representation_ms_info['start_number'],
2900                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2901                         else:
2902                             # $Number*$ or $Time$ in media template with S list available
2903                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2904                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2905                             representation_ms_info['fragments'] = []
2906                             segment_time = 0
2907                             segment_d = None
2908                             segment_number = representation_ms_info['start_number']
2909
2910                             def add_segment_url():
2911                                 segment_url = media_template % {
2912                                     'Time': segment_time,
2913                                     'Bandwidth': bandwidth,
2914                                     'Number': segment_number,
2915                                 }
2916                                 representation_ms_info['fragments'].append({
2917                                     media_location_key: segment_url,
2918                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2919                                 })
2920
2921                             for num, s in enumerate(representation_ms_info['s']):
2922                                 segment_time = s.get('t') or segment_time
2923                                 segment_d = s['d']
2924                                 add_segment_url()
2925                                 segment_number += 1
2926                                 for r in range(s.get('r', 0)):
2927                                     segment_time += segment_d
2928                                     add_segment_url()
2929                                     segment_number += 1
2930                                 segment_time += segment_d
2931                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2932                         # No media template,
2933                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2934                         # or any YouTube dashsegments video
2935                         fragments = []
2936                         segment_index = 0
2937                         timescale = representation_ms_info['timescale']
2938                         for s in representation_ms_info['s']:
2939                             duration = float_or_none(s['d'], timescale)
2940                             for r in range(s.get('r', 0) + 1):
2941                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2942                                 fragments.append({
2943                                     location_key(segment_uri): segment_uri,
2944                                     'duration': duration,
2945                                 })
2946                                 segment_index += 1
2947                         representation_ms_info['fragments'] = fragments
2948                     elif 'segment_urls' in representation_ms_info:
2949                         # Segment URLs with no SegmentTimeline
2950                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2951                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2952                         fragments = []
2953                         segment_duration = float_or_none(
2954                             representation_ms_info['segment_duration'],
2955                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2956                         for segment_url in representation_ms_info['segment_urls']:
2957                             fragment = {
2958                                 location_key(segment_url): segment_url,
2959                             }
2960                             if segment_duration:
2961                                 fragment['duration'] = segment_duration
2962                             fragments.append(fragment)
2963                         representation_ms_info['fragments'] = fragments
2964                     # If there is a fragments key available then we correctly recognized fragmented media.
2965                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2966                     # assumption is not necessarily correct since we may simply have no support for
2967                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2968                     if 'fragments' in representation_ms_info:
2969                         f.update({
2970                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2971                             'url': mpd_url or base_url,
2972                             'fragment_base_url': base_url,
2973                             'fragments': [],
2974                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2975                         })
2976                         if 'initialization_url' in representation_ms_info:
2977                             initialization_url = representation_ms_info['initialization_url']
2978                             if not f.get('url'):
2979                                 f['url'] = initialization_url
2980                             f['fragments'].append({location_key(initialization_url): initialization_url})
2981                         f['fragments'].extend(representation_ms_info['fragments'])
2982                         if not period_duration:
2983                             period_duration = try_get(
2984                                 representation_ms_info,
2985                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2986                     else:
2987                         # Assuming direct URL to unfragmented media.
2988                         f['url'] = base_url
2989                     if content_type in ('video', 'audio', 'image/jpeg'):
2990                         f['manifest_stream_number'] = stream_numbers[f['url']]
2991                         stream_numbers[f['url']] += 1
2992                         period_entry['formats'].append(f)
2993                     elif content_type == 'text':
2994                         period_entry['subtitles'][lang or 'und'].append(f)
2995             yield period_entry
2996
2997     def _extract_ism_formats(self, *args, **kwargs):
2998         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2999         if subs:
3000             self._report_ignoring_subs('ISM')
3001         return fmts
3002
3003     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3004         if self.get_param('ignore_no_formats_error'):
3005             fatal = False
3006
3007         res = self._download_xml_handle(
3008             ism_url, video_id,
3009             note='Downloading ISM manifest' if note is None else note,
3010             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3011             fatal=fatal, data=data, headers=headers, query=query)
3012         if res is False:
3013             return [], {}
3014         ism_doc, urlh = res
3015         if ism_doc is None:
3016             return [], {}
3017
3018         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
3019
3020     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3021         """
3022         Parse formats from ISM manifest.
3023         References:
3024          1. [MS-SSTR]: Smooth Streaming Protocol,
3025             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3026         """
3027         if ism_doc.get('IsLive') == 'TRUE':
3028             return [], {}
3029
3030         duration = int(ism_doc.attrib['Duration'])
3031         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3032
3033         formats = []
3034         subtitles = {}
3035         for stream in ism_doc.findall('StreamIndex'):
3036             stream_type = stream.get('Type')
3037             if stream_type not in ('video', 'audio', 'text'):
3038                 continue
3039             url_pattern = stream.attrib['Url']
3040             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3041             stream_name = stream.get('Name')
3042             stream_language = stream.get('Language', 'und')
3043             for track in stream.findall('QualityLevel'):
3044                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3045                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3046                 # TODO: add support for WVC1 and WMAP
3047                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3048                     self.report_warning('%s is not a supported codec' % fourcc)
3049                     continue
3050                 tbr = int(track.attrib['Bitrate']) // 1000
3051                 # [1] does not mention Width and Height attributes. However,
3052                 # they're often present while MaxWidth and MaxHeight are
3053                 # missing, so should be used as fallbacks
3054                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3055                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3056                 sampling_rate = int_or_none(track.get('SamplingRate'))
3057
3058                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3059                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3060
3061                 fragments = []
3062                 fragment_ctx = {
3063                     'time': 0,
3064                 }
3065                 stream_fragments = stream.findall('c')
3066                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3067                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3068                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3069                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3070                     if not fragment_ctx['duration']:
3071                         try:
3072                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3073                         except IndexError:
3074                             next_fragment_time = duration
3075                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3076                     for _ in range(fragment_repeat):
3077                         fragments.append({
3078                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3079                             'duration': fragment_ctx['duration'] / stream_timescale,
3080                         })
3081                         fragment_ctx['time'] += fragment_ctx['duration']
3082
3083                 if stream_type == 'text':
3084                     subtitles.setdefault(stream_language, []).append({
3085                         'ext': 'ismt',
3086                         'protocol': 'ism',
3087                         'url': ism_url,
3088                         'manifest_url': ism_url,
3089                         'fragments': fragments,
3090                         '_download_params': {
3091                             'stream_type': stream_type,
3092                             'duration': duration,
3093                             'timescale': stream_timescale,
3094                             'fourcc': fourcc,
3095                             'language': stream_language,
3096                             'codec_private_data': track.get('CodecPrivateData'),
3097                         }
3098                     })
3099                 elif stream_type in ('video', 'audio'):
3100                     formats.append({
3101                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3102                         'url': ism_url,
3103                         'manifest_url': ism_url,
3104                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3105                         'width': width,
3106                         'height': height,
3107                         'tbr': tbr,
3108                         'asr': sampling_rate,
3109                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3110                         'acodec': 'none' if stream_type == 'video' else fourcc,
3111                         'protocol': 'ism',
3112                         'fragments': fragments,
3113                         'has_drm': ism_doc.find('Protection') is not None,
3114                         'language': stream_language,
3115                         'audio_channels': int_or_none(track.get('Channels')),
3116                         '_download_params': {
3117                             'stream_type': stream_type,
3118                             'duration': duration,
3119                             'timescale': stream_timescale,
3120                             'width': width or 0,
3121                             'height': height or 0,
3122                             'fourcc': fourcc,
3123                             'language': stream_language,
3124                             'codec_private_data': track.get('CodecPrivateData'),
3125                             'sampling_rate': sampling_rate,
3126                             'channels': int_or_none(track.get('Channels', 2)),
3127                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3128                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3129                         },
3130                     })
3131         return formats, subtitles
3132
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from the HTML5 <video>/<audio> tags (and their
        amp-*/dl8-* variants) found in webpage.

        Relative URLs are resolved against base_url, which is also set as the
        Referer header of every format. Sources detected as m3u8 or mpd are
        expanded non-fatally through _extract_m3u8_formats() (m3u8_id,
        m3u8_entry_protocol, preference, quality) and _extract_mpd_formats()
        (mpd_id); any other source becomes a single direct-URL format.

        Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'
        keys, one per media tag that yielded at least one format or subtitle.
        """
        def absolute_url(item_url):
            # Resolve a possibly relative URL against base_url
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a 'type' attribute value (mimetype with an optional codecs
            # parameter) into a partial format dict: the parse_codecs() result
            # plus 'ext'. Returns {} for a missing or unparsable value
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats): manifests are expanded into
            # their formats, anything else is returned as one direct format
            type_info = type_info or {}
            full_url = absolute_url(src)
            # The extension derived from the 'type' attribute wins over the URL's
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no content, hence the trailing ''
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        # Then tags with a separate closing tag, capturing what is in between
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # A source given directly on the media tag itself
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fill in missing dimensions from the label/title text
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the first label that parses as a bitrate; the
                        # else branch runs only if no label did
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The direct format's url/vcodec/ext override the
                        # values derived from the 'type' attribute
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated like subtitles
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            # Media tags that produced nothing usable are dropped
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3258
3259     def _extract_akamai_formats(self, *args, **kwargs):
3260         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3261         if subs:
3262             self._report_ignoring_subs('akamai')
3263         return fmts
3264
3265     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3266         signed = 'hdnea=' in manifest_url
3267         if not signed:
3268             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3269             manifest_url = re.sub(
3270                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3271                 '', manifest_url).strip('?')
3272
3273         formats = []
3274         subtitles = {}
3275
3276         hdcore_sign = 'hdcore=3.7.0'
3277         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3278         hds_host = hosts.get('hds')
3279         if hds_host:
3280             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3281         if 'hdcore=' not in f4m_url:
3282             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3283         f4m_formats = self._extract_f4m_formats(
3284             f4m_url, video_id, f4m_id='hds', fatal=False)
3285         for entry in f4m_formats:
3286             entry.update({'extra_param_to_segment_url': hdcore_sign})
3287         formats.extend(f4m_formats)
3288
3289         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3290         hls_host = hosts.get('hls')
3291         if hls_host:
3292             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3293         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3294             m3u8_url, video_id, 'mp4', 'm3u8_native',
3295             m3u8_id='hls', fatal=False)
3296         formats.extend(m3u8_formats)
3297         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3298
3299         http_host = hosts.get('http')
3300         if http_host and m3u8_formats and not signed:
3301             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3302             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3303             qualities_length = len(qualities)
3304             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3305                 i = 0
3306                 for f in m3u8_formats:
3307                     if f['vcodec'] != 'none':
3308                         for protocol in ('http', 'https'):
3309                             http_f = f.copy()
3310                             del http_f['manifest_url']
3311                             http_url = re.sub(
3312                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3313                             http_f.update({
3314                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3315                                 'url': http_url,
3316                                 'protocol': protocol,
3317                             })
3318                             formats.append(http_f)
3319                         i += 1
3320
3321         return formats, subtitles
3322
3323     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3324         query = urllib.parse.urlparse(url).query
3325         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3326         mobj = re.search(
3327             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3328         url_base = mobj.group('url')
3329         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3330         formats = []
3331
3332         def manifest_url(manifest):
3333             m_url = f'{http_base_url}/{manifest}'
3334             if query:
3335                 m_url += '?%s' % query
3336             return m_url
3337
3338         if 'm3u8' not in skip_protocols:
3339             formats.extend(self._extract_m3u8_formats(
3340                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3341                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3342         if 'f4m' not in skip_protocols:
3343             formats.extend(self._extract_f4m_formats(
3344                 manifest_url('manifest.f4m'),
3345                 video_id, f4m_id='hds', fatal=False))
3346         if 'dash' not in skip_protocols:
3347             formats.extend(self._extract_mpd_formats(
3348                 manifest_url('manifest.mpd'),
3349                 video_id, mpd_id='dash', fatal=False))
3350         if re.search(r'(?:/smil:|\.smil)', url_base):
3351             if 'smil' not in skip_protocols:
3352                 rtmp_formats = self._extract_smil_formats(
3353                     manifest_url('jwplayer.smil'),
3354                     video_id, fatal=False)
3355                 for rtmp_format in rtmp_formats:
3356                     rtsp_format = rtmp_format.copy()
3357                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3358                     del rtsp_format['play_path']
3359                     del rtsp_format['ext']
3360                     rtsp_format.update({
3361                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3362                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3363                         'protocol': 'rtsp',
3364                     })
3365                     formats.extend([rtmp_format, rtsp_format])
3366         else:
3367             for protocol in ('rtmp', 'rtsp'):
3368                 if protocol not in skip_protocols:
3369                     formats.append({
3370                         'url': f'{protocol}:{url_base}',
3371                         'format_id': protocol,
3372                         'protocol': protocol,
3373                     })
3374         return formats
3375
3376     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3377         mobj = re.search(
3378             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3379             webpage)
3380         if mobj:
3381             try:
3382                 jwplayer_data = self._parse_json(mobj.group('options'),
3383                                                  video_id=video_id,
3384                                                  transform_source=transform_source)
3385             except ExtractorError:
3386                 pass
3387             else:
3388                 if isinstance(jwplayer_data, dict):
3389                     return jwplayer_data
3390
3391     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3392         jwplayer_data = self._find_jwplayer_data(
3393             webpage, video_id, transform_source=js_to_json)
3394         return self._parse_jwplayer_data(
3395             jwplayer_data, video_id, *args, **kwargs)
3396
3397     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3398                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3399         entries = []
3400         if not isinstance(jwplayer_data, dict):
3401             return entries
3402
3403         playlist_items = jwplayer_data.get('playlist')
3404         # JWPlayer backward compatibility: single playlist item/flattened playlists
3405         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3406         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3407         if not isinstance(playlist_items, list):
3408             playlist_items = (playlist_items or jwplayer_data, )
3409
3410         for video_data in playlist_items:
3411             if not isinstance(video_data, dict):
3412                 continue
3413             # JWPlayer backward compatibility: flattened sources
3414             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3415             if 'sources' not in video_data:
3416                 video_data['sources'] = [video_data]
3417
3418             this_video_id = video_id or video_data['mediaid']
3419
3420             formats = self._parse_jwplayer_formats(
3421                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3422                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3423
3424             subtitles = {}
3425             tracks = video_data.get('tracks')
3426             if tracks and isinstance(tracks, list):
3427                 for track in tracks:
3428                     if not isinstance(track, dict):
3429                         continue
3430                     track_kind = track.get('kind')
3431                     if not track_kind or not isinstance(track_kind, str):
3432                         continue
3433                     if track_kind.lower() not in ('captions', 'subtitles'):
3434                         continue
3435                     track_url = urljoin(base_url, track.get('file'))
3436                     if not track_url:
3437                         continue
3438                     subtitles.setdefault(track.get('label') or 'en', []).append({
3439                         'url': self._proto_relative_url(track_url)
3440                     })
3441
3442             entry = {
3443                 'id': this_video_id,
3444                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3445                 'description': clean_html(video_data.get('description')),
3446                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3447                 'timestamp': int_or_none(video_data.get('pubdate')),
3448                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3449                 'subtitles': subtitles,
3450                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3451                 'genre': clean_html(video_data.get('genre')),
3452                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3453                 'season_number': int_or_none(video_data.get('season')),
3454                 'episode_number': int_or_none(video_data.get('episode')),
3455                 'release_year': int_or_none(video_data.get('releasedate')),
3456                 'age_limit': int_or_none(video_data.get('age_restriction')),
3457             }
3458             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3459             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3460                 entry.update({
3461                     '_type': 'url_transparent',
3462                     'url': formats[0]['url'],
3463                 })
3464             else:
3465                 entry['formats'] = formats
3466             entries.append(entry)
3467         if len(entries) == 1:
3468             return entries[0]
3469         else:
3470             return self.playlist_result(entries)
3471
3472     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3473                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3474         urls = set()
3475         formats = []
3476         for source in jwplayer_sources_data:
3477             if not isinstance(source, dict):
3478                 continue
3479             source_url = urljoin(
3480                 base_url, self._proto_relative_url(source.get('file')))
3481             if not source_url or source_url in urls:
3482                 continue
3483             urls.add(source_url)
3484             source_type = source.get('type') or ''
3485             ext = mimetype2ext(source_type) or determine_ext(source_url)
3486             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3487                 formats.extend(self._extract_m3u8_formats(
3488                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3489                     m3u8_id=m3u8_id, fatal=False))
3490             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3491                 formats.extend(self._extract_mpd_formats(
3492                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3493             elif ext == 'smil':
3494                 formats.extend(self._extract_smil_formats(
3495                     source_url, video_id, fatal=False))
3496             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3497             elif source_type.startswith('audio') or ext in (
3498                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3499                 formats.append({
3500                     'url': source_url,
3501                     'vcodec': 'none',
3502                     'ext': ext,
3503                 })
3504             else:
3505                 format_id = str_or_none(source.get('label'))
3506                 height = int_or_none(source.get('height'))
3507                 if height is None and format_id:
3508                     # Often no height is provided but there is a label in
3509                     # format like "1080p", "720p SD", or 1080.
3510                     height = parse_resolution(format_id).get('height')
3511                 a_format = {
3512                     'url': source_url,
3513                     'width': int_or_none(source.get('width')),
3514                     'height': height,
3515                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3516                     'filesize': int_or_none(source.get('filesize')),
3517                     'ext': ext,
3518                     'format_id': format_id
3519                 }
3520                 if source_url.startswith('rtmp'):
3521                     a_format['ext'] = 'flv'
3522                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3523                     # of jwplayer.flash.swf
3524                     rtmp_url_parts = re.split(
3525                         r'((?:mp4|mp3|flv):)', source_url, 1)
3526                     if len(rtmp_url_parts) == 3:
3527                         rtmp_url, prefix, play_path = rtmp_url_parts
3528                         a_format.update({
3529                             'url': rtmp_url,
3530                             'play_path': prefix + play_path,
3531                         })
3532                     if rtmp_params:
3533                         a_format.update(rtmp_params)
3534                 formats.append(a_format)
3535         return formats
3536
3537     def _live_title(self, name):
3538         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3539         return name
3540
3541     def _int(self, v, name, fatal=False, **kwargs):
3542         res = int_or_none(v, **kwargs)
3543         if res is None:
3544             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3545             if fatal:
3546                 raise ExtractorError(msg)
3547             else:
3548                 self.report_warning(msg)
3549         return res
3550
3551     def _float(self, v, name, fatal=False, **kwargs):
3552         res = float_or_none(v, **kwargs)
3553         if res is None:
3554             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3555             if fatal:
3556                 raise ExtractorError(msg)
3557             else:
3558                 self.report_warning(msg)
3559         return res
3560
3561     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3562                     path='/', secure=False, discard=False, rest={}, **kwargs):
3563         cookie = http.cookiejar.Cookie(
3564             0, name, value, port, port is not None, domain, True,
3565             domain.startswith('.'), path, True, secure, expire_time,
3566             discard, None, None, rest)
3567         self.cookiejar.set_cookie(cookie)
3568
3569     def _get_cookies(self, url):
3570         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3571         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3572
3573     def _apply_first_set_cookie_header(self, url_handle, cookie):
3574         """
3575         Apply first Set-Cookie header instead of the last. Experimental.
3576
3577         Some sites (e.g. [1-3]) may serve two cookies under the same name
3578         in Set-Cookie header and expect the first (old) one to be set rather
3579         than second (new). However, as of RFC6265 the newer one cookie
3580         should be set into cookie store what actually happens.
3581         We will workaround this issue by resetting the cookie to
3582         the first one manually.
3583         1. https://new.vk.com/
3584         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3585         3. https://learning.oreilly.com/
3586         """
3587         for header, cookies in url_handle.headers.items():
3588             if header.lower() != 'set-cookie':
3589                 continue
3590             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3591             cookie_value = re.search(
3592                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3593             if cookie_value:
3594                 value, domain = cookie_value.groups()
3595                 self._set_cookie(domain, cookie, value)
3596                 break
3597
3598     @classmethod
3599     def get_testcases(cls, include_onlymatching=False):
3600         # Do not look in super classes
3601         t = vars(cls).get('_TEST')
3602         if t:
3603             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3604             tests = [t]
3605         else:
3606             tests = vars(cls).get('_TESTS', [])
3607         for t in tests:
3608             if not include_onlymatching and t.get('only_matching', False):
3609                 continue
3610             t['name'] = cls.ie_key()
3611             yield t
3612         if getattr(cls, '__wrapped__', None):
3613             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3614
3615     @classmethod
3616     def get_webpage_testcases(cls):
3617         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3618         for t in tests:
3619             t['name'] = cls.ie_key()
3620             yield t
3621         if getattr(cls, '__wrapped__', None):
3622             yield from cls.__wrapped__.get_webpage_testcases()
3623
3624     @classproperty(cache=True)
3625     def age_limit(cls):
3626         """Get age limit from the testcases"""
3627         return max(traverse_obj(
3628             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3629             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3630
3631     @classproperty(cache=True)
3632     def _RETURN_TYPE(cls):
3633         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3634         tests = tuple(cls.get_testcases(include_onlymatching=False))
3635         if not tests:
3636             return None
3637         elif not any(k.startswith('playlist') for test in tests for k in test):
3638             return 'video'
3639         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3640             return 'playlist'
3641         return 'any'
3642
3643     @classmethod
3644     def is_single_video(cls, url):
3645         """Returns whether the URL is of a single video, None if unknown"""
3646         if cls.suitable(url):
3647             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3648
3649     @classmethod
3650     def is_suitable(cls, age_limit):
3651         """Test whether the extractor is generally suitable for the given age limit"""
3652         return not age_restricted(cls.age_limit, age_limit)
3653
3654     @classmethod
3655     def description(cls, *, markdown=True, search_examples=None):
3656         """Description of the extractor"""
3657         desc = ''
3658         if cls._NETRC_MACHINE:
3659             if markdown:
3660                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3661             else:
3662                 desc += f' [{cls._NETRC_MACHINE}]'
3663         if cls.IE_DESC is False:
3664             desc += ' [HIDDEN]'
3665         elif cls.IE_DESC:
3666             desc += f' {cls.IE_DESC}'
3667         if cls.SEARCH_KEY:
3668             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3669             if search_examples:
3670                 _COUNTS = ('', '5', '10', 'all')
3671                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3672         if not cls.working():
3673             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3674
3675         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3676         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3677         return f'{name}:{desc}' if desc else name
3678
3679     def extract_subtitles(self, *args, **kwargs):
3680         if (self.get_param('writesubtitles', False)
3681                 or self.get_param('listsubtitles')):
3682             return self._get_subtitles(*args, **kwargs)
3683         return {}
3684
3685     def _get_subtitles(self, *args, **kwargs):
3686         raise NotImplementedError('This method must be implemented by subclasses')
3687
3688     class CommentsDisabled(Exception):
3689         """Raise in _get_comments if comments are disabled for the video"""
3690
3691     def extract_comments(self, *args, **kwargs):
3692         if not self.get_param('getcomments'):
3693             return None
3694         generator = self._get_comments(*args, **kwargs)
3695
3696         def extractor():
3697             comments = []
3698             interrupted = True
3699             try:
3700                 while True:
3701                     comments.append(next(generator))
3702             except StopIteration:
3703                 interrupted = False
3704             except KeyboardInterrupt:
3705                 self.to_screen('Interrupted by user')
3706             except self.CommentsDisabled:
3707                 return {'comments': None, 'comment_count': None}
3708             except Exception as e:
3709                 if self.get_param('ignoreerrors') is not True:
3710                     raise
3711                 self._downloader.report_error(e)
3712             comment_count = len(comments)
3713             self.to_screen(f'Extracted {comment_count} comments')
3714             return {
3715                 'comments': comments,
3716                 'comment_count': None if interrupted else comment_count
3717             }
3718         return extractor
3719
3720     def _get_comments(self, *args, **kwargs):
3721         raise NotImplementedError('This method must be implemented by subclasses')
3722
3723     @staticmethod
3724     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3725         """ Merge subtitle items for one language. Items with duplicated URLs/data
3726         will be dropped. """
3727         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3728         ret = list(subtitle_list1)
3729         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3730         return ret
3731
3732     @classmethod
3733     def _merge_subtitles(cls, *dicts, target=None):
3734         """ Merge subtitle dictionaries, language by language. """
3735         if target is None:
3736             target = {}
3737         for d in dicts:
3738             for lang, subs in d.items():
3739                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3740         return target
3741
3742     def extract_automatic_captions(self, *args, **kwargs):
3743         if (self.get_param('writeautomaticsub', False)
3744                 or self.get_param('listsubtitles')):
3745             return self._get_automatic_captions(*args, **kwargs)
3746         return {}
3747
3748     def _get_automatic_captions(self, *args, **kwargs):
3749         raise NotImplementedError('This method must be implemented by subclasses')
3750
3751     @functools.cached_property
3752     def _cookies_passed(self):
3753         """Whether cookies have been passed to YoutubeDL"""
3754         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3755
3756     def mark_watched(self, *args, **kwargs):
3757         if not self.get_param('mark_watched', False):
3758             return
3759         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3760             self._mark_watched(*args, **kwargs)
3761
3762     def _mark_watched(self, *args, **kwargs):
3763         raise NotImplementedError('This method must be implemented by subclasses')
3764
3765     def geo_verification_headers(self):
3766         headers = {}
3767         geo_verification_proxy = self.get_param('geo_verification_proxy')
3768         if geo_verification_proxy:
3769             headers['Ytdl-request-proxy'] = geo_verification_proxy
3770         return headers
3771
3772     @staticmethod
3773     def _generic_id(url):
3774         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3775
3776     def _generic_title(self, url='', webpage='', *, default=None):
3777         return (self._og_search_title(webpage, default=None)
3778                 or self._html_extract_title(webpage, default=None)
3779                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3780                 or default)
3781
3782     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3783         if not duration:
3784             return
3785         chapter_list = [{
3786             'start_time': start_function(chapter),
3787             'title': title_function(chapter),
3788         } for chapter in chapter_list or []]
3789         if strict:
3790             warn = self.report_warning
3791         else:
3792             warn = self.write_debug
3793             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3794
3795         chapters = [{'start_time': 0}]
3796         for idx, chapter in enumerate(chapter_list):
3797             if chapter['start_time'] is None:
3798                 warn(f'Incomplete chapter {idx}')
3799             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3800                 chapters.append(chapter)
3801             elif chapter not in chapters:
3802                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3803                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3804                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3805         return chapters[1:]
3806
3807     def _extract_chapters_from_description(self, description, duration):
3808         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3809         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3810         return self._extract_chapters_helper(
3811             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3812             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3813             duration=duration, strict=False) or self._extract_chapters_helper(
3814             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3815             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3816             duration=duration, strict=False)
3817
3818     @staticmethod
3819     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3820         all_known = all(map(
3821             lambda x: x is not None,
3822             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3823         return (
3824             'private' if is_private
3825             else 'premium_only' if needs_premium
3826             else 'subscriber_only' if needs_subscription
3827             else 'needs_auth' if needs_auth
3828             else 'unlisted' if is_unlisted
3829             else 'public' if all_known
3830             else None)
3831
3832     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3833         '''
3834         @returns            A list of values for the extractor argument given by "key"
3835                             or "default" if no such key is present
3836         @param default      The default value to return when the key is not present (default: [])
3837         @param casesense    When false, the values are converted to lower case
3838         '''
3839         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3840         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3841         if val is None:
3842             return [] if default is NO_DEFAULT else default
3843         return list(val) if casesense else [x.lower() for x in val]
3844
3845     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3846         if not playlist_id or not video_id:
3847             return not video_id
3848
3849         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3850         if no_playlist is not None:
3851             return not no_playlist
3852
3853         video_id = '' if video_id is True else f' {video_id}'
3854         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3855         if self.get_param('noplaylist'):
3856             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3857             return False
3858         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3859         return True
3860
3861     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3862         RetryManager.report_retry(
3863             err, _count or int(fatal), _retries,
3864             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3865             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3866
3867     def RetryManager(self, **kwargs):
3868         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3869
3870     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3871         display_id = traverse_obj(info_dict, 'display_id', 'id')
3872         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3873         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3874             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3875
3876     @classmethod
3877     def extract_from_webpage(cls, ydl, url, webpage):
3878         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3879               else ydl.get_info_extractor(cls.ie_key()))
3880         for info in ie._extract_from_webpage(url, webpage) or []:
3881             # url = None since we do not want to set (webpage/original)_url
3882             ydl.add_default_extra_info(info, ie, None)
3883             yield info
3884
3885     @classmethod
3886     def _extract_from_webpage(cls, url, webpage):
3887         for embed_url in orderedSet(
3888                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3889             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3890
3891     @classmethod
3892     def _extract_embed_urls(cls, url, webpage):
3893         """@returns all the embed urls on the webpage"""
3894         if '_EMBED_URL_RE' not in cls.__dict__:
3895             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3896             for idx, regex in enumerate(cls._EMBED_REGEX):
3897                 assert regex.count('(?P<url>') == 1, \
3898                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3899             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3900
3901         for regex in cls._EMBED_URL_RE:
3902             for mobj in regex.finditer(webpage):
3903                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3904                 if cls._VALID_URL is False or cls.suitable(embed_url):
3905                     yield embed_url
3906
3907     class StopExtraction(Exception):
3908         pass
3909
3910     @classmethod
3911     def _extract_url(cls, webpage):  # TODO: Remove
3912         """Only for compatibility with some older extractors"""
3913         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3914
    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """
        Subclass hook. When plugin_name is given, the new class is set up as a
        plugin override of the class that follows it in the MRO

        @param plugin_name  Name of the plugin; nothing is set up when it is falsy
        @param kwargs       Forwarded to super().__init_subclass__
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class right after cls in the MRO is the one being overridden;
            # it is also remembered as cls.__wrapped__
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            # The override keeps the ie_key of the class it overrides
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the __wrapped__ chain down to the class that wraps nothing
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind that class's name in its defining module to the override,
            # and record the override for that class
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)
3928
3929
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        """Regex for "<_SEARCH_KEY><prefix>:<query>"; the prefix is empty, a positive number or "all" """
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """
        Split the search URL into prefix and query and get the results

        An empty prefix requests 1 result and "all" requests _MAX_RESULTS.
        A number requests that many results, capped (with a warning) at
        _MAX_RESULTS; a number that is not positive raises ExtractorError.
        """
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError(f'invalid download number {wanted} for query "{query}"')
            if wanted > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice cannot take infinity as its stop value; None means "no limit"
        stop = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, stop), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for _SEARCH_KEY"""
        return cls._SEARCH_KEY
3973
3974
class UnsupportedURLIE(InfoExtractor):
    """Extractor with the catch-all pattern '.*' whose extraction always raises UnsupportedError"""

    _VALID_URL = '.*'
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        """Raise UnsupportedError for the url"""
        raise UnsupportedError(url)
3982
3983
# Maps an overridden extractor class to the list of plugin classes overriding it;
# filled in by __init_subclass__ for subclasses created with a plugin_name
_PLUGIN_OVERRIDES = collections.defaultdict(list)