yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import subprocess
  17 import sys
  18 import time
  19 import types
  20 import urllib.parse
  21 import urllib.request
  22 import xml.etree.ElementTree
  23
  24 from ..compat import functools  # isort: split
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     network_exceptions,
  39 )
  40 from ..utils import (
  41     IDENTITY,
  42     JSON_LD_RE,
  43     NO_DEFAULT,
  44     ExtractorError,
  45     FormatSorter,
  46     GeoRestrictedError,
  47     GeoUtils,
  48     LenientJSONDecoder,
  49     Popen,
  50     RegexNotFoundError,
  51     RetryManager,
  52     UnsupportedError,
  53     age_restricted,
  54     base_url,
  55     bug_reports_message,
  56     classproperty,
  57     clean_html,
  58     deprecation_warning,
  59     determine_ext,
  60     dict_get,
  61     encode_data_uri,
  62     error_to_compat_str,
  63     extract_attributes,
  64     filter_dict,
  65     fix_xml_ampersands,
  66     float_or_none,
  67     format_field,
  68     int_or_none,
  69     join_nonempty,
  70     js_to_json,
  71     mimetype2ext,
  72     netrc_from_content,
  73     orderedSet,
  74     parse_bitrate,
  75     parse_codecs,
  76     parse_duration,
  77     parse_iso8601,
  78     parse_m3u8_attributes,
  79     parse_resolution,
  80     sanitize_filename,
  81     sanitize_url,
  82     smuggle_url,
  83     str_or_none,
  84     str_to_int,
  85     strip_or_none,
  86     traverse_obj,
  87     truncate_string,
  88     try_call,
  89     try_get,
  90     unescapeHTML,
  91     unified_strdate,
  92     unified_timestamp,
  93     url_basename,
  94     url_or_none,
  95     urlhandle_detect_ext,
  96     urljoin,
  97     variadic,
  98     xpath_element,
  99     xpath_text,
 100     xpath_with_ns,
 101 )
 102
 103
 104 class InfoExtractor:
 105     """Information Extractor class.
 106
 107     Information extractors are the classes that, given a URL, extract
 108     information about the video (or videos) the URL refers to. This
 109     information includes the real video URL, the video title, author and
 110     others. The information is stored in a dictionary which is then
 111     passed to the YoutubeDL. The YoutubeDL processes this
 112     information possibly downloading the video to the file system, among
 113     other possible outcomes.
 114
 115     The type field determines the type of the result.
 116     By far the most common value (and the default if _type is missing) is
 117     "video", which indicates a single video.
 118
 119     For a video, the dictionaries must include the following fields:
 120
 121     id:             Video identifier.
 122     title:          Video title, unescaped. Set to an empty string if video has
 123                     no title as opposed to "None" which signifies that the
 124                     extractor failed to obtain a title
 125
 126     Additionally, it must contain either a formats entry or a url one:
 127
 128     formats:        A list of dictionaries for each format available, ordered
 129                     from worst to best quality.
 130
 131                     Potential fields:
 132                     * url        The mandatory URL representing the media:
 133                                    for plain file media - HTTP URL of this file,
 134                                    for RTMP - RTMP URL,
 135                                    for HLS - URL of the M3U8 media playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH
 138                                      - HTTP URL to plain file media (in case of
 139                                        unfragmented media)
 140                                      - URL of the MPD manifest or base URL
 141                                        representing the media if MPD manifest
 142                                        is parsed from a string (in case of
 143                                        fragmented media)
 144                                    for MSS - URL of the ISM manifest.
 145                     * request_data  Data to send in POST request to the URL
 146                     * manifest_url
 147                                  The URL of the manifest file in case of
 148                                  fragmented media:
 149                                    for HLS - URL of the M3U8 master playlist,
 150                                    for HDS - URL of the F4M manifest,
 151                                    for DASH - URL of the MPD manifest,
 152                                    for MSS - URL of the ISM manifest.
 153                     * manifest_stream_number  (For internal use only)
 154                                  The index of the stream in the manifest file
 155                     * ext        Will be calculated from URL if missing
 156                     * format     A human-readable description of the format
 157                                  ("mp4 container with h264/opus").
 158                                  Calculated from the format_id, width, height.
 159                                  and format_note fields if missing.
 160                     * format_id  A short description of the format
 161                                  ("mp4_h264_opus" or "19").
 162                                 Technically optional, but strongly recommended.
 163                     * format_note Additional info about the format
 164                                  ("3D" or "DASH video")
 165                     * width      Width of the video, if known
 166                     * height     Height of the video, if known
 167                     * aspect_ratio  Aspect ratio of the video, if known
 168                                  Automatically calculated from width and height
 169                     * resolution Textual description of width and height
 170                                  Automatically calculated from width and height
 171                     * dynamic_range The dynamic range of the video. One of:
  172                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 173                     * tbr        Average bitrate of audio and video in KBit/s
 174                     * abr        Average audio bitrate in KBit/s
 175                     * acodec     Name of the audio codec in use
 176                     * asr        Audio sampling rate in Hertz
 177                     * audio_channels  Number of audio channels
 178                     * vbr        Average video bitrate in KBit/s
 179                     * fps        Frame rate
 180                     * vcodec     Name of the video codec in use
 181                     * container  Name of the container format
 182                     * filesize   The number of bytes, if known in advance
 183                     * filesize_approx  An estimate for the number of bytes
 184                     * player_url SWF Player URL (used for rtmpdump).
 185                     * protocol   The protocol that will be used for the actual
 186                                  download, lower-case. One of "http", "https" or
 187                                  one of the protocols defined in downloader.PROTOCOL_MAP
 188                     * fragment_base_url
 189                                  Base URL for fragments. Each fragment's path
 190                                  value (if present) will be relative to
 191                                  this URL.
 192                     * fragments  A list of fragments of a fragmented media.
 193                                  Each fragment entry must contain either an url
 194                                  or a path. If an url is present it should be
 195                                  considered by a client. Otherwise both path and
 196                                  fragment_base_url must be present. Here is
 197                                  the list of all potential fields:
 198                                  * "url" - fragment's URL
 199                                  * "path" - fragment's path relative to
 200                                             fragment_base_url
 201                                  * "duration" (optional, int or float)
 202                                  * "filesize" (optional, int)
 203                     * is_from_start  Is a live format that can be downloaded
 204                                 from the start. Boolean
 205                     * preference Order number of this format. If this field is
 206                                  present and not None, the formats get sorted
 207                                  by this field, regardless of all other values.
 208                                  -1 for default (order by other properties),
 209                                  -2 or smaller for less than default.
 210                                  < -1000 to hide the format (if there is
 211                                     another one which is strictly better)
 212                     * language   Language code, e.g. "de" or "en-US".
 213                     * language_preference  Is this in the language mentioned in
 214                                  the URL?
 215                                  10 if it's what the URL is about,
 216                                  -1 for default (don't know),
 217                                  -10 otherwise, other values reserved for now.
 218                     * quality    Order number of the video quality of this
 219                                  format, irrespective of the file format.
 220                                  -1 for default (order by other properties),
 221                                  -2 or smaller for less than default.
 222                     * source_preference  Order number for this video source
 223                                   (quality takes higher priority)
 224                                  -1 for default (order by other properties),
 225                                  -2 or smaller for less than default.
 226                     * http_headers  A dictionary of additional HTTP headers
 227                                  to add to the request.
 228                     * stretched_ratio  If given and not 1, indicates that the
 229                                  video's pixels are not square.
 230                                  width : height ratio as float.
 231                     * no_resume  The server does not support resuming the
 232                                  (HTTP or RTMP) download. Boolean.
 233                     * has_drm    True if the format has DRM and cannot be downloaded.
 234                                  'maybe' if the format may have DRM and has to be tested before download.
 235                     * extra_param_to_segment_url  A query string to append to each
 236                                  fragment's URL, or to update each existing query string
 237                                  with. Only applied by the native HLS/DASH downloaders.
 238                     * hls_aes    A dictionary of HLS AES-128 decryption information
 239                                  used by the native HLS downloader to override the
 240                                  values in the media playlist when an '#EXT-X-KEY' tag
 241                                  is present in the playlist:
 242                                  * uri  The URI from which the key will be downloaded
 243                                  * key  The key (as hex) used to decrypt fragments.
 244                                         If `key` is given, any key URI will be ignored
 245                                  * iv   The IV (as hex) used to decrypt fragments
 246                     * downloader_options  A dictionary of downloader options
 247                                  (For internal use only)
 248                                  * http_chunk_size Chunk size for HTTP downloads
 249                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 250                     RTMP formats can also have the additional fields: page_url,
 251                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 252                     rtmp_protocol, rtmp_real_time
 253
 254     url:            Final video URL.
 255     ext:            Video filename extension.
 256     format:         The video format, defaults to ext (used for --get-format)
 257     player_url:     SWF Player URL (used for rtmpdump).
 258
 259     The following fields are optional:
 260
 261     direct:         True if a direct video file was given (must only be set by GenericIE)
 262     alt_title:      A secondary title of the video.
 263     display_id      An alternative identifier for the video, not necessarily
 264                     unique, but available before title. Typically, id is
 265                     something like "4234987", title "Dancing naked mole rats",
 266                     and display_id "dancing-naked-mole-rats"
 267     thumbnails:     A list of dictionaries, with the following entries:
 268                         * "id" (optional, string) - Thumbnail format ID
 269                         * "url"
 270                         * "preference" (optional, int) - quality of the image
 271                         * "width" (optional, int)
 272                         * "height" (optional, int)
 273                         * "resolution" (optional, string "{width}x{height}",
 274                                         deprecated)
 275                         * "filesize" (optional, int)
 276                         * "http_headers" (dict) - HTTP headers for the request
 277     thumbnail:      Full URL to a video thumbnail image.
 278     description:    Full video description.
 279     uploader:       Full name of the video uploader.
 280     license:        License name the video is licensed under.
 281     creator:        The creator of the video.
 282     timestamp:      UNIX timestamp of the moment the video was uploaded
 283     upload_date:    Video upload date in UTC (YYYYMMDD).
 284                     If not explicitly set, calculated from timestamp
 285     release_timestamp: UNIX timestamp of the moment the video was released.
 286                     If it is not clear whether to use timestamp or this, use the former
 287     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 288                     If not explicitly set, calculated from release_timestamp
 289     release_year:   Year (YYYY) as integer when the video or album was released.
 290                     To be used if no exact release date is known.
 291                     If not explicitly set, calculated from release_date.
 292     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 293     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 294                     If not explicitly set, calculated from modified_timestamp
 295     uploader_id:    Nickname or id of the video uploader.
 296     uploader_url:   Full URL to a personal webpage of the video uploader.
 297     channel:        Full name of the channel the video is uploaded on.
 298                     Note that channel fields may or may not repeat uploader
 299                     fields. This depends on a particular extractor.
 300     channel_id:     Id of the channel.
 301     channel_url:    Full URL to a channel webpage.
 302     channel_follower_count: Number of followers of the channel.
 303     channel_is_verified: Whether the channel is verified on the platform.
 304     location:       Physical location where the video was filmed.
 305     subtitles:      The available subtitles as a dictionary in the format
 306                     {tag: subformats}. "tag" is usually a language code, and
 307                     "subformats" is a list sorted from lower to higher
 308                     preference, each element is a dictionary with the "ext"
 309                     entry and one of:
 310                         * "data": The subtitles file contents
 311                         * "url": A URL pointing to the subtitles file
 312                     It can optionally also have:
 313                         * "name": Name or description of the subtitles
 314                         * "http_headers": A dictionary of additional HTTP headers
 315                                   to add to the request.
 316                     "ext" will be calculated from URL if missing
 317     automatic_captions: Like 'subtitles'; contains automatically generated
 318                     captions instead of normal subtitles
 319     duration:       Length of the video in seconds, as an integer or float.
 320     view_count:     How many users have watched the video on the platform.
 321     concurrent_view_count: How many users are currently watching the video on the platform.
 322     like_count:     Number of positive ratings of the video
 323     dislike_count:  Number of negative ratings of the video
 324     repost_count:   Number of reposts of the video
  325     average_rating: Average rating given by users, the scale used depends on the webpage
 326     comment_count:  Number of comments on the video
 327     comments:       A list of comments, each with one or more of the following
 328                     properties (all but one of text or html optional):
 329                         * "author" - human-readable name of the comment author
 330                         * "author_id" - user ID of the comment author
 331                         * "author_thumbnail" - The thumbnail of the comment author
 332                         * "author_url" - The url to the comment author's page
 333                         * "author_is_verified" - Whether the author is verified
 334                                                  on the platform
 335                         * "author_is_uploader" - Whether the comment is made by
 336                                                  the video uploader
 337                         * "id" - Comment ID
 338                         * "html" - Comment as HTML
 339                         * "text" - Plain text of the comment
 340                         * "timestamp" - UNIX timestamp of comment
 341                         * "parent" - ID of the comment this one is replying to.
 342                                      Set to "root" to indicate that this is a
 343                                      comment to the original video.
 344                         * "like_count" - Number of positive ratings of the comment
 345                         * "dislike_count" - Number of negative ratings of the comment
 346                         * "is_favorited" - Whether the comment is marked as
 347                                            favorite by the video uploader
 348                         * "is_pinned" - Whether the comment is pinned to
 349                                         the top of the comments
 350     age_limit:      Age restriction for the video, as an integer (years)
 351     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 352                     should allow to get the same result again. (It will be set
 353                     by YoutubeDL if it's missing)
 354     categories:     A list of categories that the video falls in, for example
 355                     ["Sports", "Berlin"]
 356     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 357     cast:           A list of the video cast
 358     is_live:        True, False, or None (=unknown). Whether this video is a
 359                     live stream that goes on instead of a fixed-length video.
 360     was_live:       True, False, or None (=unknown). Whether this video was
 361                     originally a live stream.
 362     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 363                     or 'post_live' (was live, but VOD is not yet processed)
 364                     If absent, automatically set from is_live, was_live
 365     start_time:     Time in seconds where the reproduction should start, as
 366                     specified in the URL.
 367     end_time:       Time in seconds where the reproduction should end, as
 368                     specified in the URL.
 369     chapters:       A list of dictionaries, with the following entries:
 370                         * "start_time" - The start time of the chapter in seconds
 371                         * "end_time" - The end time of the chapter in seconds
 372                         * "title" (optional, string)
 373     heatmap:        A list of dictionaries, with the following entries:
 374                         * "start_time" - The start time of the data point in seconds
 375                         * "end_time" - The end time of the data point in seconds
 376                         * "value" - The normalized value of the data point (float between 0 and 1)
 377     playable_in_embed: Whether this video is allowed to play in embedded
 378                     players on other sites. Can be True (=always allowed),
 379                     False (=never allowed), None (=unknown), or a string
 380                     specifying the criteria for embedability; e.g. 'whitelist'
 381     availability:   Under what condition the video is available. One of
 382                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 383                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 384                     to set it
 385     _old_archive_ids: A list of old archive ids needed for backward compatibility
 386     _format_sort_fields: A list of fields to use for sorting formats
 387     __post_extractor: A function to be called just before the metadata is
 388                     written to either disk, logger or console. The function
 389                     must return a dict which will be added to the info_dict.
  390                     This is useful for additional information that is
 391                     time-consuming to extract. Note that the fields thus
 392                     extracted will not be available to output template and
 393                     match_filter. So, only "comments" and "comment_count" are
 394                     currently allowed to be extracted via this method.
 395
 396     The following fields should only be used when the video belongs to some logical
 397     chapter or section:
 398
 399     chapter:        Name or title of the chapter the video belongs to.
 400     chapter_number: Number of the chapter the video belongs to, as an integer.
 401     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 402
 403     The following fields should only be used when the video is an episode of some
 404     series, programme or podcast:
 405
 406     series:         Title of the series or programme the video episode belongs to.
 407     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 408     season:         Title of the season the video episode belongs to.
 409     season_number:  Number of the season the video episode belongs to, as an integer.
 410     season_id:      Id of the season the video episode belongs to, as a unicode string.
 411     episode:        Title of the video episode. Unlike mandatory video title field,
 412                     this field should denote the exact title of the video episode
 413                     without any kind of decoration.
 414     episode_number: Number of the video episode within a season, as an integer.
 415     episode_id:     Id of the video episode, as a unicode string.
 416
 417     The following fields should only be used when the media is a track or a part of
 418     a music album:
 419
 420     track:          Title of the track.
 421     track_number:   Number of the track within an album or a disc, as an integer.
 422     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 423                     as a unicode string.
 424     artist:         Artist(s) of the track.
 425     genre:          Genre(s) of the track.
 426     album:          Title of the album the track belongs to.
 427     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
  428     album_artist:   List of all artists that appeared on the album (e.g.
 429                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 430                     and compilations).
 431     disc_number:    Number of the disc or other physical medium the track belongs to,
 432                     as an integer.
 433     composer:       Composer of the piece
 434
 435     The following fields should only be set for clips that should be cut from the original video:
 436
 437     section_start:  Start time of the section in seconds
 438     section_end:    End time of the section in seconds
 439
 440     The following fields should only be set for storyboards:
 441     rows:           Number of rows in each storyboard fragment, as an integer
 442     columns:        Number of columns in each storyboard fragment, as an integer
 443
 444     Unless mentioned otherwise, the fields should be Unicode strings.
 445
 446     Unless mentioned otherwise, None is equivalent to absence of information.
 447
 448
 449     _type "playlist" indicates multiple videos.
 450     There must be a key "entries", which is a list, an iterable, or a PagedList
 451     object, each element of which is a valid dictionary by this specification.
 452
 453     Additionally, playlists can have "id", "title", and any other relevant
 454     attributes with the same semantics as videos (see above).
 455
 456     It can also have the following optional fields:
 457
 458     playlist_count: The total number of videos in a playlist. If not given,
 459                     YoutubeDL tries to calculate it from "entries"
 460
 461
 462     _type "multi_video" indicates that there are multiple videos that
  463     form a single show, for example, multiple acts of an opera or TV episode.
 464     It must have an entries key like a playlist and contain all the keys
 465     required for a video at the same time.
 466
 467
 468     _type "url" indicates that the video must be extracted from another
 469     location, possibly by a different extractor. Its only required key is:
 470     "url" - the next URL to extract.
 471     The key "ie_key" can be set to the class name (minus the trailing "IE",
 472     e.g. "Youtube") if the extractor class is known in advance.
 473     Additionally, the dictionary may have any properties of the resolved entity
 474     known in advance, for example "title" if the title of the referred video is
 475     known ahead of time.
 476
 477
 478     _type "url_transparent" entities have the same specification as "url", but
 479     indicate that the given additional information is more precise than the one
 480     associated with the resolved URL.
 481     This is useful when a site employs a video service that hosts the video and
 482     its technical metadata, but that video service does not embed a useful
 483     title, description etc.
 484
 485
 486     Subclasses of this should also be added to the list of extractors and
 487     should define _VALID_URL as a regexp or a Sequence of regexps, and
 488     re-define the _real_extract() and (optionally) _real_initialize() methods.
 489
 490     Subclasses may also override suitable() if necessary, but ensure the function
 491     signature is preserved and that this function imports everything it needs
 492     (except other extractors), so that lazy_extractors works correctly.
 493
 494     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 495     the HTML of Generic webpages. It may also override _extract_embed_urls
 496     or _extract_from_webpage as necessary. While these are normally classmethods,
 497     _extract_from_webpage is allowed to be an instance method.
 498
 499     _extract_from_webpage may raise self.StopExtraction() to stop further
 500     processing of the webpage and obtain exclusive rights to it. This is useful
 501     when the extractor cannot reliably be matched using just the URL,
 502     e.g. invidious/peertube instances
 503
 504     Embed-only extractors can be defined by setting _VALID_URL = False.
 505
 506     To support username + password (or netrc) login, the extractor must define a
 507     _NETRC_MACHINE and re-define _perform_login(username, password) and
 508     (optionally) _initialize_pre_login() methods. The _perform_login method will
 509     be called between _initialize_pre_login and _real_initialize if credentials
 510     are passed by the user. In cases where it is necessary to have the login
 511     process as part of the extraction rather than initialization, _perform_login
 512     can be left undefined.
 513
 514     _GEO_BYPASS attribute may be set to False in order to disable
 515     geo restriction bypass mechanisms for a particular extractor.
 516     Though it won't disable explicit geo restriction bypass based on
 517     country code provided with geo_bypass_country.
 518
 519     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 520     countries for this extractor. One of these countries will be used by
 521     geo restriction bypass mechanism right away in order to bypass
 522     geo restriction, of course, if the mechanism is not disabled.
 523
 524     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 525     IP blocks in CIDR notation for this extractor. One of these IP blocks
 526     will be used by geo restriction bypass mechanism similarly
 527     to _GEO_COUNTRIES.
 528
 529     The _ENABLED attribute should be set to False for IEs that
 530     are disabled by default and must be explicitly enabled.
 531
 532     The _WORKING attribute should be set to False for broken IEs
 533     in order to warn the users and skip the tests.
 534     """
 535
  536     _ready = False  # class-level default; __init__ resets it to False on every instance
  537     _downloader = None  # NOTE(review): presumably the YoutubeDL instance stored by set_downloader() - confirm
  538     _x_forwarded_for_ip = None  # class-level default; __init__ resets it to None on every instance
  539     _GEO_BYPASS = True  # set to False to disable the geo restriction bypass mechanisms for an extractor
  540     _GEO_COUNTRIES = None  # optional list of presumably geo unrestricted countries
  541     _GEO_IP_BLOCKS = None  # optional list of presumably geo unrestricted IP blocks in CIDR notation
  542     _WORKING = True  # set to False for broken IEs, to warn the users and skip the tests
  543     _ENABLED = True  # set to False for IEs that are disabled by default and must be explicitly enabled
  544     _NETRC_MACHINE = None  # must be defined to support username + password (or netrc) login
  545     IE_DESC = None
  546     SEARCH_KEY = None
  547     _VALID_URL = None  # a regexp or a Sequence of regexps; False marks an embed-only extractor
  548     _EMBED_REGEX = []  # regexes searched for in the HTML of Generic webpages
 549
 550     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 551         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 552         return {
 553             None: '',
 554             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 555             'password': f'Use {password_hint}',
 556             'cookies': (
 557                 'Use --cookies-from-browser or --cookies for the authentication. '
 558                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 559         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 560
 561     def __init__(self, downloader=None):
 562         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 563         If a downloader is not passed during initialization,
 564         it must be set using "set_downloader()" before "extract()" is called"""
 565         self._ready = False
 566         self._x_forwarded_for_ip = None
 567         self._printed_messages = set()
 568         self.set_downloader(downloader)
 569
 570     @classmethod
 571     def _match_valid_url(cls, url):
 572         if cls._VALID_URL is False:
 573             return None
 574         # This does not use has/getattr intentionally - we want to know whether
 575         # we have cached the regexp for *this* class, whereas getattr would also
 576         # match the superclass
 577         if '_VALID_URL_RE' not in cls.__dict__:
 578             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 579         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 580
 581     @classmethod
 582     def suitable(cls, url):
 583         """Receives a URL and returns True if suitable for this IE."""
 584         # This function must import everything it needs (except other extractors),
 585         # so that lazy_extractors works correctly
 586         return cls._match_valid_url(url) is not None
 587
 588     @classmethod
 589     def _match_id(cls, url):
 590         return cls._match_valid_url(url).group('id')
 591
 592     @classmethod
 593     def get_temp_id(cls, url):
 594         try:
 595             return cls._match_id(url)
 596         except (IndexError, AttributeError):
 597             return None
 598
 599     @classmethod
 600     def working(cls):
 601         """Getter method for _WORKING."""
 602         return cls._WORKING
 603
 604     @classmethod
 605     def supports_login(cls):
 606         return bool(cls._NETRC_MACHINE)
 607
 608     def initialize(self):
 609         """Initializes an instance (authentication, etc)."""
 610         self._printed_messages = set()
 611         self._initialize_geo_bypass({
 612             'countries': self._GEO_COUNTRIES,
 613             'ip_blocks': self._GEO_IP_BLOCKS,
 614         })
 615         if not self._ready:
 616             self._initialize_pre_login()
 617             if self.supports_login():
 618                 username, password = self._get_login_info()
 619                 if username:
 620                     self._perform_login(username, password)
 621             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 622                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 623             self._real_initialize()
 624             self._ready = True
 625
 626     def _initialize_geo_bypass(self, geo_bypass_context):
 627         """
 628         Initialize geo restriction bypass mechanism.
 629
 630         This method is used to initialize geo bypass mechanism based on faking
 631         X-Forwarded-For HTTP header. A random country from provided country list
 632         is selected and a random IP belonging to this country is generated. This
 633         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 634         HTTP requests.
 635
 636         This method will be used for initial geo bypass mechanism initialization
 637         during the instance initialization with _GEO_COUNTRIES and
 638         _GEO_IP_BLOCKS.
 639
 640         You may also manually call it from extractor's code if geo bypass
 641         information is not available beforehand (e.g. obtained during
 642         extraction) or due to some other reason. In this case you should pass
 643         this information in geo bypass context passed as first argument. It may
 644         contain following fields:
 645
 646         countries:  List of geo unrestricted countries (similar
 647                     to _GEO_COUNTRIES)
 648         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 649                     (similar to _GEO_IP_BLOCKS)
 650
 651         """
 652         if not self._x_forwarded_for_ip:
 653
 654             # Geo bypass mechanism is explicitly disabled by user
 655             if not self.get_param('geo_bypass', True):
 656                 return
 657
 658             if not geo_bypass_context:
 659                 geo_bypass_context = {}
 660
 661             # Backward compatibility: previously _initialize_geo_bypass
 662             # expected a list of countries, some 3rd party code may still use
 663             # it this way
 664             if isinstance(geo_bypass_context, (list, tuple)):
 665                 geo_bypass_context = {
 666                     'countries': geo_bypass_context,
 667                 }
 668
 669             # The whole point of geo bypass mechanism is to fake IP
 670             # as X-Forwarded-For HTTP header based on some IP block or
 671             # country code.
 672
 673             # Path 1: bypassing based on IP block in CIDR notation
 674
 675             # Explicit IP block specified by user, use it right away
 676             # regardless of whether extractor is geo bypassable or not
 677             ip_block = self.get_param('geo_bypass_ip_block', None)
 678
 679             # Otherwise use random IP block from geo bypass context but only
 680             # if extractor is known as geo bypassable
 681             if not ip_block:
 682                 ip_blocks = geo_bypass_context.get('ip_blocks')
 683                 if self._GEO_BYPASS and ip_blocks:
 684                     ip_block = random.choice(ip_blocks)
 685
 686             if ip_block:
 687                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 688                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 689                 return
 690
 691             # Path 2: bypassing based on country code
 692
 693             # Explicit country code specified by user, use it right away
 694             # regardless of whether extractor is geo bypassable or not
 695             country = self.get_param('geo_bypass_country', None)
 696
 697             # Otherwise use random country code from geo bypass context but
 698             # only if extractor is known as geo bypassable
 699             if not country:
 700                 countries = geo_bypass_context.get('countries')
 701                 if self._GEO_BYPASS and countries:
 702                     country = random.choice(countries)
 703
 704             if country:
 705                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 706                 self._downloader.write_debug(
 707                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 708
 709     def extract(self, url):
 710         """Extracts URL information and returns it in list of dicts."""
 711         try:
 712             for _ in range(2):
 713                 try:
 714                     self.initialize()
 715                     self.to_screen('Extracting URL: %s' % (
 716                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 717                     ie_result = self._real_extract(url)
 718                     if ie_result is None:
 719                         return None
 720                     if self._x_forwarded_for_ip:
 721                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 722                     subtitles = ie_result.get('subtitles') or {}
 723                     if 'no-live-chat' in self.get_param('compat_opts'):
 724                         for lang in ('live_chat', 'comments', 'danmaku'):
 725                             subtitles.pop(lang, None)
 726                     return ie_result
 727                 except GeoRestrictedError as e:
 728                     if self.__maybe_fake_ip_and_retry(e.countries):
 729                         continue
 730                     raise
 731         except UnsupportedError:
 732             raise
 733         except ExtractorError as e:
 734             e.video_id = e.video_id or self.get_temp_id(url)
 735             e.ie = e.ie or self.IE_NAME,
 736             e.traceback = e.traceback or sys.exc_info()[2]
 737             raise
 738         except IncompleteRead as e:
 739             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 740         except (KeyError, StopIteration) as e:
 741             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 742
 743     def __maybe_fake_ip_and_retry(self, countries):
 744         if (not self.get_param('geo_bypass_country', None)
 745                 and self._GEO_BYPASS
 746                 and self.get_param('geo_bypass', True)
 747                 and not self._x_forwarded_for_ip
 748                 and countries):
 749             country_code = random.choice(countries)
 750             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 751             if self._x_forwarded_for_ip:
 752                 self.report_warning(
 753                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 754                     % (self._x_forwarded_for_ip, country_code.upper()))
 755                 return True
 756         return False
 757
 758     def set_downloader(self, downloader):
 759         """Sets a YoutubeDL instance as the downloader for this IE."""
 760         self._downloader = downloader
 761
 762     @property
 763     def cache(self):
 764         return self._downloader.cache
 765
 766     @property
 767     def cookiejar(self):
 768         return self._downloader.cookiejar
 769
 770     def _initialize_pre_login(self):
 771         """ Initialization before login. Redefine in subclasses."""
 772         pass
 773
 774     def _perform_login(self, username, password):
 775         """ Login with username and password. Redefine in subclasses."""
 776         pass
 777
 778     def _real_initialize(self):
 779         """Real initialization process. Redefine in subclasses."""
 780         pass
 781
 782     def _real_extract(self, url):
 783         """Real extraction process. Redefine in subclasses."""
 784         raise NotImplementedError('This method must be implemented by subclasses')
 785
 786     @classmethod
 787     def ie_key(cls):
 788         """A string for getting the InfoExtractor with get_info_extractor"""
 789         return cls.__name__[:-2]
 790
 791     @classproperty
 792     def IE_NAME(cls):
 793         return cls.__name__[:-2]
 794
 795     @staticmethod
 796     def __can_accept_status_code(err, expected_status):
 797         assert isinstance(err, HTTPError)
 798         if expected_status is None:
 799             return False
 800         elif callable(expected_status):
 801             return expected_status(err.status) is True
 802         else:
 803             return err.status in variadic(expected_status)
 804
 805     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 806         if isinstance(url_or_request, urllib.request.Request):
 807             self._downloader.deprecation_warning(
 808                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 809                 'Use yt_dlp.networking.common.Request instead.')
 810             url_or_request = urllib_req_to_req(url_or_request)
 811         elif not isinstance(url_or_request, Request):
 812             url_or_request = Request(url_or_request)
 813
 814         url_or_request.update(data=data, headers=headers, query=query)
 815         return url_or_request
 816
 817     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 818         """
 819         Return the response handle.
 820
 821         See _download_webpage docstring for arguments specification.
 822         """
 823         if not self._downloader._first_webpage_request:
 824             sleep_interval = self.get_param('sleep_interval_requests') or 0
 825             if sleep_interval > 0:
 826                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 827                 time.sleep(sleep_interval)
 828         else:
 829             self._downloader._first_webpage_request = False
 830
 831         if note is None:
 832             self.report_download_webpage(video_id)
 833         elif note is not False:
 834             if video_id is None:
 835                 self.to_screen(str(note))
 836             else:
 837                 self.to_screen(f'{video_id}: {note}')
 838
 839         # Some sites check X-Forwarded-For HTTP header in order to figure out
 840         # the origin of the client behind proxy. This allows bypassing geo
 841         # restriction by faking this header's value to IP that belongs to some
 842         # geo unrestricted country. We will do so once we encounter any
 843         # geo restriction error.
 844         if self._x_forwarded_for_ip:
 845             headers = (headers or {}).copy()
 846             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 847
 848         try:
 849             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 850         except network_exceptions as err:
 851             if isinstance(err, HTTPError):
 852                 if self.__can_accept_status_code(err, expected_status):
 853                     return err.response
 854
 855             if errnote is False:
 856                 return False
 857             if errnote is None:
 858                 errnote = 'Unable to download webpage'
 859
 860             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 861             if fatal:
 862                 raise ExtractorError(errmsg, cause=err)
 863             else:
 864                 self.report_warning(errmsg)
 865                 return False
 866
 867     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 868                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 869         """
 870         Return a tuple (page content as string, URL handle).
 871
 872         Arguments:
 873         url_or_request -- plain text URL as a string or
 874             a urllib.request.Request object
 875         video_id -- Video/playlist/item identifier (string)
 876
 877         Keyword arguments:
 878         note -- note printed before downloading (string)
 879         errnote -- note printed in case of an error (string)
 880         fatal -- flag denoting whether error should be considered fatal,
 881             i.e. whether it should cause ExtractionError to be raised,
 882             otherwise a warning will be reported and extraction continued
 883         encoding -- encoding for a page content decoding, guessed automatically
 884             when not explicitly specified
 885         data -- POST data (bytes)
 886         headers -- HTTP headers (dict)
 887         query -- URL query (dict)
 888         expected_status -- allows to accept failed HTTP requests (non 2xx
 889             status code) by explicitly specifying a set of accepted status
 890             codes. Can be any of the following entities:
 891                 - an integer type specifying an exact failed status code to
 892                   accept
 893                 - a list or a tuple of integer types specifying a list of
 894                   failed status codes to accept
 895                 - a callable accepting an actual failed status code and
 896                   returning True if it should be accepted
 897             Note that this argument does not affect success status codes (2xx)
 898             which are always accepted.
 899         """
 900
 901         # Strip hashes from the URL (#1038)
 902         if isinstance(url_or_request, str):
 903             url_or_request = url_or_request.partition('#')[0]
 904
 905         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 906         if urlh is False:
 907             assert not fatal
 908             return False
 909         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 910         return (content, urlh)
 911
 912     @staticmethod
 913     def _guess_encoding_from_content(content_type, webpage_bytes):
 914         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 915         if m:
 916             encoding = m.group(1)
 917         else:
 918             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 919                           webpage_bytes[:1024])
 920             if m:
 921                 encoding = m.group(1).decode('ascii')
 922             elif webpage_bytes.startswith(b'\xff\xfe'):
 923                 encoding = 'utf-16'
 924             else:
 925                 encoding = 'utf-8'
 926
 927         return encoding
 928
 929     def __check_blocked(self, content):
 930         first_block = content[:512]
 931         if ('<title>Access to this site is blocked</title>' in content
 932                 and 'Websense' in first_block):
 933             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 934             blocked_iframe = self._html_search_regex(
 935                 r'<iframe src="([^"]+)"', content,
 936                 'Websense information URL', default=None)
 937             if blocked_iframe:
 938                 msg += ' Visit %s for more details' % blocked_iframe
 939             raise ExtractorError(msg, expected=True)
 940         if '<title>The URL you requested has been blocked</title>' in first_block:
 941             msg = (
 942                 'Access to this webpage has been blocked by Indian censorship. '
 943                 'Use a VPN or proxy server (with --proxy) to route around it.')
 944             block_msg = self._html_search_regex(
 945                 r'</h1><p>(.*?)</p>',
 946                 content, 'block message', default=None)
 947             if block_msg:
 948                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 949             raise ExtractorError(msg, expected=True)
 950         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 951                 and 'blocklist.rkn.gov.ru' in content):
 952             raise ExtractorError(
 953                 'Access to this webpage has been blocked by decision of the Russian government. '
 954                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 955                 expected=True)
 956
 957     def _request_dump_filename(self, url, video_id):
 958         basen = f'{video_id}_{url}'
 959         trim_length = self.get_param('trim_file_name') or 240
 960         if len(basen) > trim_length:
 961             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 962             basen = basen[:trim_length - len(h)] + h
 963         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 964         # Working around MAX_PATH limitation on Windows (see
 965         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 966         if compat_os_name == 'nt':
 967             absfilepath = os.path.abspath(filename)
 968             if len(absfilepath) > 259:
 969                 filename = fR'\\?\{absfilepath}'
 970         return filename
 971
 972     def __decode_webpage(self, webpage_bytes, encoding, headers):
 973         if not encoding:
 974             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 975         try:
 976             return webpage_bytes.decode(encoding, 'replace')
 977         except LookupError:
 978             return webpage_bytes.decode('utf-8', 'replace')
 979
 980     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 981         webpage_bytes = urlh.read()
 982         if prefix is not None:
 983             webpage_bytes = prefix + webpage_bytes
 984         if self.get_param('dump_intermediate_pages', False):
 985             self.to_screen('Dumping request to ' + urlh.url)
 986             dump = base64.b64encode(webpage_bytes).decode('ascii')
 987             self._downloader.to_screen(dump)
 988         if self.get_param('write_pages'):
 989             filename = self._request_dump_filename(urlh.url, video_id)
 990             self.to_screen(f'Saving request to {filename}')
 991             with open(filename, 'wb') as outf:
 992                 outf.write(webpage_bytes)
 993
 994         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 995         self.__check_blocked(content)
 996
 997         return content
 998
 999     def __print_error(self, errnote, fatal, video_id, err):
1000         if fatal:
1001             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1002         elif errnote:
1003             self.report_warning(f'{video_id}: {errnote}: {err}')
1004
1005     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1006         if transform_source:
1007             xml_string = transform_source(xml_string)
1008         try:
1009             return compat_etree_fromstring(xml_string.encode('utf-8'))
1010         except xml.etree.ElementTree.ParseError as ve:
1011             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1012
1013     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1014         try:
1015             return json.loads(
1016                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1017         except ValueError as ve:
1018             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1019
1020     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1021         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1022
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build the pair of methods (_download_<name>_handle, _download_<name>).

        This is a plain function called from the class body, not a method.

        @param name             Name used in the names of the generated methods
        @param parser           Name of the method that parses the downloaded
                                content; with None the content is returned as is
        @param note             Default note of the generated methods
        @param errnote          Default errnote of the generated methods
        @param return_value     Description of the returned value, used in the
                                generated docstrings
        @returns                (download_handle, download_content)
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is passed on to the parser only when it is False;
            # otherwise the parser uses its own default
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            # False means that the download failed without being fatal
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, a previously dumped response is read
            # from disk; if that fails, a warning is reported and the request
            # is downloaded as usual
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser there is nothing to transform, so the argument
            # is not passed on (_download_webpage_handle does not accept it)
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            # Only the content part of the (content, URL handle) tuple is returned
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            # Give the generated function the name and docstring of a regular method
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1086
    # Download methods generated by __create_download_methods; each call yields
    # the pair (method returning (parsed content, URL handle), method returning
    # the parsed content only)
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For webpages there is no parser (the content is returned unparsed) and
    # only the content variant is kept; it is wrapped by _download_webpage
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1094
1095     def _download_webpage(
1096             self, url_or_request, video_id, note=None, errnote=None,
1097             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1098         """
1099         Return the data of the page as a string.
1100
1101         Keyword arguments:
1102         tries -- number of tries
1103         timeout -- sleep interval between tries
1104
1105         See _download_webpage_handle docstring for other arguments specification.
1106         """
1107
1108         R''' # NB: These are unused; should they be deprecated?
1109         if tries != 1:
1110             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1111         if timeout is NO_DEFAULT:
1112             timeout = 5
1113         else:
1114             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1115         '''
1116
1117         try_count = 0
1118         while True:
1119             try:
1120                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1121             except IncompleteRead as e:
1122                 try_count += 1
1123                 if try_count >= tries:
1124                     raise e
1125                 self._sleep(timeout, video_id)
1126
1127     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1128         idstr = format_field(video_id, None, '%s: ')
1129         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1130         if only_once:
1131             if f'WARNING: {msg}' in self._printed_messages:
1132                 return
1133             self._printed_messages.add(f'WARNING: {msg}')
1134         self._downloader.report_warning(msg, *args, **kwargs)
1135
1136     def to_screen(self, msg, *args, **kwargs):
1137         """Print msg to screen, prefixing it with '[ie_name]'"""
1138         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1139
1140     def write_debug(self, msg, *args, **kwargs):
1141         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1142
1143     def get_param(self, name, default=None, *args, **kwargs):
1144         if self._downloader:
1145             return self._downloader.params.get(name, default, *args, **kwargs)
1146         return default
1147
1148     def report_drm(self, video_id, partial=NO_DEFAULT):
1149         if partial is not NO_DEFAULT:
1150             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1151         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1152
1153     def report_extraction(self, id_or_name):
1154         """Report information extraction."""
1155         self.to_screen('%s: Extracting information' % id_or_name)
1156
1157     def report_download_webpage(self, video_id):
1158         """Report webpage download."""
1159         self.to_screen('%s: Downloading webpage' % video_id)
1160
1161     def report_age_confirmation(self):
1162         """Report attempt to confirm age."""
1163         self.to_screen('Confirming age')
1164
1165     def report_login(self):
1166         """Report attempt to log in."""
1167         self.to_screen('Logging in')
1168
1169     def raise_login_required(
1170             self, msg='This video is only available for registered users',
1171             metadata_available=False, method=NO_DEFAULT):
1172         if metadata_available and (
1173                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1174             self.report_warning(msg)
1175             return
1176         msg += format_field(self._login_hint(method), None, '. %s')
1177         raise ExtractorError(msg, expected=True)
1178
1179     def raise_geo_restricted(
1180             self, msg='This video is not available from your location due to geo restriction',
1181             countries=None, metadata_available=False):
1182         if metadata_available and (
1183                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1184             self.report_warning(msg)
1185         else:
1186             raise GeoRestrictedError(msg, countries=countries)
1187
1188     def raise_no_formats(self, msg, expected=False, video_id=None):
1189         if expected and (
1190                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1191             self.report_warning(msg, video_id)
1192         elif isinstance(msg, ExtractorError):
1193             raise msg
1194         else:
1195             raise ExtractorError(msg, expected=expected, video_id=video_id)
1196
1197     # Methods for following #608
1198     @staticmethod
1199     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1200         """Returns a URL that points to a page that should be processed"""
1201         if ie is not None:
1202             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1203         if video_id is not None:
1204             kwargs['id'] = video_id
1205         if video_title is not None:
1206             kwargs['title'] = video_title
1207         return {
1208             **kwargs,
1209             '_type': 'url_transparent' if url_transparent else 'url',
1210             'url': url,
1211         }
1212
1213     @classmethod
1214     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1215                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1216         return cls.playlist_result(
1217             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1218             playlist_id, playlist_title, **kwargs)
1219
1220     @staticmethod
1221     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1222         """Returns a playlist"""
1223         if playlist_id:
1224             kwargs['id'] = playlist_id
1225         if playlist_title:
1226             kwargs['title'] = playlist_title
1227         if playlist_description is not None:
1228             kwargs['description'] = playlist_description
1229         return {
1230             **kwargs,
1231             '_type': 'multi_video' if multi_video else 'playlist',
1232             'entries': entries,
1233         }
1234
1235     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1236         """
1237         Perform a regex search on the given string, using a single or a list of
1238         patterns returning the first matching group.
1239         In case of failure return a default value or raise a WARNING or a
1240         RegexNotFoundError, depending on fatal, specifying the field name.
1241         """
1242         if string is None:
1243             mobj = None
1244         elif isinstance(pattern, (str, re.Pattern)):
1245             mobj = re.search(pattern, string, flags)
1246         else:
1247             for p in pattern:
1248                 mobj = re.search(p, string, flags)
1249                 if mobj:
1250                     break
1251
1252         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1253
1254         if mobj:
1255             if group is None:
1256                 # return the first matching group
1257                 return next(g for g in mobj.groups() if g is not None)
1258             elif isinstance(group, (list, tuple)):
1259                 return tuple(mobj.group(g) for g in group)
1260             else:
1261                 return mobj.group(group)
1262         elif default is not NO_DEFAULT:
1263             return default
1264         elif fatal:
1265             raise RegexNotFoundError('Unable to extract %s' % _name)
1266         else:
1267             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1268             return None
1269
1270     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1271                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1272         """Searches string for the JSON object specified by start_pattern"""
1273         # NB: end_pattern is only used to reduce the size of the initial match
1274         if default is NO_DEFAULT:
1275             default, has_default = {}, False
1276         else:
1277             fatal, has_default = False, True
1278
1279         json_string = self._search_regex(
1280             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1281             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1282         if not json_string:
1283             return default
1284
1285         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1286         try:
1287             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1288         except ExtractorError as e:
1289             if fatal:
1290                 raise ExtractorError(
1291                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1292             elif not has_default:
1293                 self.report_warning(
1294                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1295         return default
1296
1297     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1298         """
1299         Like _search_regex, but strips HTML tags and unescapes entities.
1300         """
1301         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1302         if isinstance(res, tuple):
1303             return tuple(map(clean_html, res))
1304         return clean_html(res)
1305
1306     def _get_netrc_login_info(self, netrc_machine=None):
1307         netrc_machine = netrc_machine or self._NETRC_MACHINE
1308
1309         cmd = self.get_param('netrc_cmd')
1310         if cmd:
1311             cmd = cmd.replace('{}', netrc_machine)
1312             self.to_screen(f'Executing command: {cmd}')
1313             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1314             if ret != 0:
1315                 raise OSError(f'Command returned error code {ret}')
1316             info = netrc_from_content(stdout).authenticators(netrc_machine)
1317
1318         elif self.get_param('usenetrc', False):
1319             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1320             if os.path.isdir(netrc_file):
1321                 netrc_file = os.path.join(netrc_file, '.netrc')
1322             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1323
1324         else:
1325             return None, None
1326         if not info:
1327             raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
1328         return info[0], info[2]
1329
1330     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1331         """
1332         Get the login info as (username, password)
1333         First look for the manually specified credentials using username_option
1334         and password_option as keys in params dictionary. If no such credentials
1335         are available try the netrc_cmd if it is defined or look in the
1336         netrc file using the netrc_machine or _NETRC_MACHINE value.
1337         If there's no info available, return (None, None)
1338         """
1339
1340         username = self.get_param(username_option)
1341         if username is not None:
1342             password = self.get_param(password_option)
1343         else:
1344             try:
1345                 username, password = self._get_netrc_login_info(netrc_machine)
1346             except (OSError, netrc.NetrcParseError) as err:
1347                 self.report_warning(f'Failed to parse .netrc: {err}')
1348                 return None, None
1349         return username, password
1350
1351     def _get_tfa_info(self, note='two-factor verification code'):
1352         """
1353         Get the two-factor authentication info
1354         TODO - asking the user will be required for sms/phone verify
1355         currently just uses the command line option
1356         If there's no info available, return None
1357         """
1358
1359         tfa = self.get_param('twofactor')
1360         if tfa is not None:
1361             return tfa
1362
1363         return getpass.getpass('Type %s and press [Return]: ' % note)
1364
1365     # Helper functions for extracting OpenGraph info
1366     @staticmethod
1367     def _og_regexes(prop):
1368         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1369         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1370                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1371         template = r'<meta[^>]+?%s[^>]+?%s'
1372         return [
1373             template % (property_re, content_re),
1374             template % (content_re, property_re),
1375         ]
1376
1377     @staticmethod
1378     def _meta_regex(prop):
1379         return r'''(?isx)<meta
1380                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1381                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1382
1383     def _og_search_property(self, prop, html, name=None, **kargs):
1384         prop = variadic(prop)
1385         if name is None:
1386             name = 'OpenGraph %s' % prop[0]
1387         og_regexes = []
1388         for p in prop:
1389             og_regexes.extend(self._og_regexes(p))
1390         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1391         if escaped is None:
1392             return None
1393         return unescapeHTML(escaped)
1394
1395     def _og_search_thumbnail(self, html, **kargs):
1396         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1397
1398     def _og_search_description(self, html, **kargs):
1399         return self._og_search_property('description', html, fatal=False, **kargs)
1400
1401     def _og_search_title(self, html, *, fatal=False, **kargs):
1402         return self._og_search_property('title', html, fatal=fatal, **kargs)
1403
1404     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1405         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1406         if secure:
1407             regexes = self._og_regexes('video:secure_url') + regexes
1408         return self._html_search_regex(regexes, html, name, **kargs)
1409
1410     def _og_search_url(self, html, **kargs):
1411         return self._og_search_property('url', html, **kargs)
1412
1413     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1414         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1415
1416     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1417         name = variadic(name)
1418         if display_name is None:
1419             display_name = name[0]
1420         return self._html_search_regex(
1421             [self._meta_regex(n) for n in name],
1422             html, display_name, fatal=fatal, group='content', **kwargs)
1423
1424     def _dc_search_uploader(self, html):
1425         return self._html_search_meta('dc.creator', html, 'uploader')
1426
1427     @staticmethod
1428     def _rta_search(html):
1429         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1430         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1431                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1432                      html):
1433             return 18
1434
1435         # And then there are the jokers who advertise that they use RTA, but actually don't.
1436         AGE_LIMIT_MARKERS = [
1437             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1438             r'>[^<]*you acknowledge you are at least (\d+) years old',
1439             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1440         ]
1441
1442         age_limit = 0
1443         for marker in AGE_LIMIT_MARKERS:
1444             mobj = re.search(marker, html)
1445             if mobj:
1446                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1447         return age_limit
1448
1449     def _media_rating_search(self, html):
1450         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1451         rating = self._html_search_meta('rating', html)
1452
1453         if not rating:
1454             return None
1455
1456         RATING_TABLE = {
1457             'safe for kids': 0,
1458             'general': 8,
1459             '14 years': 14,
1460             'mature': 17,
1461             'restricted': 19,
1462         }
1463         return RATING_TABLE.get(rating.lower())
1464
1465     def _family_friendly_search(self, html):
1466         # See http://schema.org/VideoObject
1467         family_friendly = self._html_search_meta(
1468             'isFamilyFriendly', html, default=None)
1469
1470         if not family_friendly:
1471             return None
1472
1473         RATING_TABLE = {
1474             '1': 0,
1475             'true': 0,
1476             '0': 18,
1477             'false': 18,
1478         }
1479         return RATING_TABLE.get(family_friendly.lower())
1480
1481     def _twitter_search_player(self, html):
1482         return self._html_search_meta('twitter:player', html,
1483                                       'twitter card player')
1484
1485     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1486         """Yield all json ld objects in the html"""
1487         if default is not NO_DEFAULT:
1488             fatal = False
1489         for mobj in re.finditer(JSON_LD_RE, html):
1490             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1491             for json_ld in variadic(json_ld_item):
1492                 if isinstance(json_ld, dict):
1493                     yield json_ld
1494
1495     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1496         """Search for a video in any json ld in the html"""
1497         if default is not NO_DEFAULT:
1498             fatal = False
1499         info = self._json_ld(
1500             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1501             video_id, fatal=fatal, expected_type=expected_type)
1502         if info:
1503             return info
1504         if default is not NO_DEFAULT:
1505             return default
1506         elif fatal:
1507             raise RegexNotFoundError('Unable to extract JSON-LD')
1508         else:
1509             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1510             return {}
1511
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        @param json_ld        A JSON string (parsed with _parse_json), or already
                              parsed data: a single item or a sequence of items
        @param video_id       Passed to _parse_json when json_ld is a string
        @param fatal          Passed to _parse_json when json_ld is a string
        @param expected_type  When given, only items whose '@type' contains it
                              are processed, and processing stops after the
                              first such item
        @returns              The collected fields passed through filter_dict
                              (presumably dropping empty values - verify against
                              the helper); {} when json_ld is empty
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator which all the helpers below write into
        info = {}

        # Last path component of an interaction type -> prefix of the '<prefix>_count' field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # '@type' may be a single value or a list of values; match if any expected type is among them
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # 'interactionType' is either the type itself or an object carrying it in '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill the '*_count' fields of info from 'interactionStatistic' (a single counter or a list)
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up, so full type URLs work too
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # The first counter found for a given kind wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build chapters from the 'Clip' parts listed in 'hasPart'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk (previous, current, next) triples; zip() stops at the shortest input
            # (chapters[1:]), so the last chapter is not visited by this loop
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                # A missing end is taken from the start of the next chapter, a missing
                # start from the end of the previous one (0 for the first chapter)
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter falls back to the duration of the whole video
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copy the fields of a VideoObject/AudioObject into info, overwriting earlier values
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                # try_call presumably yields None when 'keywords' is not a string - verify against the helper
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            # Must run after info['duration'] has been set above, since it reads it
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            # Visit each item (or the single item) and collect what its '@type' provides
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level items must carry an '@context'
                if at_top_level and '@context' not in e:
                    continue
                # An item consisting only of '@context' and '@graph' is a container: process its members instead
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title when no title was found before
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Articles may embed the video as first element of 'video' or of 'subjectOf'
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type keep collecting from the remaining items,
                    # otherwise the first matching item is the only one used
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any other handled or unhandled type may still carry a VideoObject in 'video'
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1681
1682     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1683         return self._parse_json(
1684             self._search_regex(
1685                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1686                 webpage, 'next.js data', fatal=fatal, **kw),
1687             video_id, transform_source=transform_source, fatal=fatal)
1688
1689     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1690         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1691         rectx = re.escape(context_name)
1692         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1693         js, arg_keys, arg_vals = self._search_regex(
1694             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1695             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1696             default=NO_DEFAULT if fatal else (None, None, None))
1697         if js is None:
1698             return {}
1699
1700         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1701             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1702
1703         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1704         return traverse_obj(ret, traverse) or {}
1705
1706     @staticmethod
1707     def _hidden_inputs(html):
1708         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1709         hidden_inputs = {}
1710         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1711             attrs = extract_attributes(input)
1712             if not input:
1713                 continue
1714             if attrs.get('type') not in ('hidden', 'submit'):
1715                 continue
1716             name = attrs.get('name') or attrs.get('id')
1717             value = attrs.get('value')
1718             if name and value is not None:
1719                 hidden_inputs[name] = value
1720         return hidden_inputs
1721
1722     def _form_hidden_inputs(self, form_id, html):
1723         form = self._search_regex(
1724             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1725             html, '%s form' % form_id, group='form')
1726         return self._hidden_inputs(form)
1727
    @classproperty(cache=True)
    def FormatSort(cls):
        """
        Deprecated accessor returning a FormatSorter subclass.

        Evaluating the property emits a deprecation warning pointing to
        yt_dlp.utils.FormatSorter (with cache=True this presumably happens only
        once per class - verify against classproperty).
        """
        class FormatSort(FormatSorter):
            # NOTE(review): `ie` is the first positional parameter, i.e. the FormatSort
            # instance being initialised - confirm that `_downloader` is meant to be read from it
            def __init__(ie, *args, **kwargs):
                super().__init__(ie._downloader, *args, **kwargs)

        deprecation_warning(
            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
            'Use yt_dlp.utils.FormatSorter instead')
        return FormatSort
1738
1739     def _sort_formats(self, formats, field_preference=[]):
1740         if not field_preference:
1741             self._downloader.deprecation_warning(
1742                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1743             return
1744         self._downloader.deprecation_warning(
1745             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1746             'Return _format_sort_fields in the info_dict instead')
1747         if formats:
1748             formats[0]['__sort_fields'] = field_preference
1749
1750     def _check_formats(self, formats, video_id):
1751         if formats:
1752             formats[:] = filter(
1753                 lambda f: self._is_valid_url(
1754                     f['url'], video_id,
1755                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1756                 formats)
1757
1758     @staticmethod
1759     def _remove_duplicate_formats(formats):
1760         format_urls = set()
1761         unique_formats = []
1762         for f in formats:
1763             if f['url'] not in format_urls:
1764                 format_urls.add(f['url'])
1765                 unique_formats.append(f)
1766         formats[:] = unique_formats
1767
1768     def _is_valid_url(self, url, video_id, item='video', headers={}):
1769         url = self._proto_relative_url(url, scheme='http:')
1770         # For now assume non HTTP(S) URLs always valid
1771         if not (url.startswith('http://') or url.startswith('https://')):
1772             return True
1773         try:
1774             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1775             return True
1776         except ExtractorError as e:
1777             self.to_screen(
1778                 '%s: %s URL is invalid, skipping: %s'
1779                 % (video_id, item, error_to_compat_str(e.cause)))
1780             return False
1781
1782     def http_scheme(self):
1783         """ Either "http:" or "https:", depending on the user's preferences """
1784         return (
1785             'http:'
1786             if self.get_param('prefer_insecure', False)
1787             else 'https:')
1788
1789     def _proto_relative_url(self, url, scheme=None):
1790         scheme = scheme or self.http_scheme()
1791         assert scheme.endswith(':')
1792         return sanitize_url(url, scheme=scheme[:-1])
1793
1794     def _sleep(self, timeout, video_id, msg_template=None):
1795         if msg_template is None:
1796             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1797         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1798         self.to_screen(msg)
1799         time.sleep(timeout)
1800
1801     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1802                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1803                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1804         if self.get_param('ignore_no_formats_error'):
1805             fatal = False
1806
1807         res = self._download_xml_handle(
1808             manifest_url, video_id, 'Downloading f4m manifest',
1809             'Unable to download f4m manifest',
1810             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1811             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1812             transform_source=transform_source,
1813             fatal=fatal, data=data, headers=headers, query=query)
1814         if res is False:
1815             return []
1816
1817         manifest, urlh = res
1818         manifest_url = urlh.url
1819
1820         return self._parse_f4m_formats(
1821             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1822             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1823
1824     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1825                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1826                            fatal=True, m3u8_id=None):
1827         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1828             return []
1829
1830         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1831         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1832         if akamai_pv is not None and ';' in akamai_pv.text:
1833             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1834             if playerVerificationChallenge.strip() != '':
1835                 return []
1836
1837         formats = []
1838         manifest_version = '1.0'
1839         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1840         if not media_nodes:
1841             manifest_version = '2.0'
1842             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1843         # Remove unsupported DRM protected media from final formats
1844         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1845         media_nodes = remove_encrypted_media(media_nodes)
1846         if not media_nodes:
1847             return formats
1848
1849         manifest_base_url = get_base_url(manifest)
1850
1851         bootstrap_info = xpath_element(
1852             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1853             'bootstrap info', default=None)
1854
1855         vcodec = None
1856         mime_type = xpath_text(
1857             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1858             'base URL', default=None)
1859         if mime_type and mime_type.startswith('audio/'):
1860             vcodec = 'none'
1861
1862         for i, media_el in enumerate(media_nodes):
1863             tbr = int_or_none(media_el.attrib.get('bitrate'))
1864             width = int_or_none(media_el.attrib.get('width'))
1865             height = int_or_none(media_el.attrib.get('height'))
1866             format_id = join_nonempty(f4m_id, tbr or i)
1867             # If <bootstrapInfo> is present, the specified f4m is a
1868             # stream-level manifest, and only set-level manifests may refer to
1869             # external resources.  See section 11.4 and section 4 of F4M spec
1870             if bootstrap_info is None:
1871                 media_url = None
1872                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1873                 if manifest_version == '2.0':
1874                     media_url = media_el.attrib.get('href')
1875                 if media_url is None:
1876                     media_url = media_el.attrib.get('url')
1877                 if not media_url:
1878                     continue
1879                 manifest_url = (
1880                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1881                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1882                 # If media_url is itself a f4m manifest do the recursive extraction
1883                 # since bitrates in parent manifest (this one) and media_url manifest
1884                 # may differ leading to inability to resolve the format by requested
1885                 # bitrate in f4m downloader
1886                 ext = determine_ext(manifest_url)
1887                 if ext == 'f4m':
1888                     f4m_formats = self._extract_f4m_formats(
1889                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1890                         transform_source=transform_source, fatal=fatal)
1891                     # Sometimes stream-level manifest contains single media entry that
1892                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1893                     # At the same time parent's media entry in set-level manifest may
1894                     # contain it. We will copy it from parent in such cases.
1895                     if len(f4m_formats) == 1:
1896                         f = f4m_formats[0]
1897                         f.update({
1898                             'tbr': f.get('tbr') or tbr,
1899                             'width': f.get('width') or width,
1900                             'height': f.get('height') or height,
1901                             'format_id': f.get('format_id') if not tbr else format_id,
1902                             'vcodec': vcodec,
1903                         })
1904                     formats.extend(f4m_formats)
1905                     continue
1906                 elif ext == 'm3u8':
1907                     formats.extend(self._extract_m3u8_formats(
1908                         manifest_url, video_id, 'mp4', preference=preference,
1909                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1910                     continue
1911             formats.append({
1912                 'format_id': format_id,
1913                 'url': manifest_url,
1914                 'manifest_url': manifest_url,
1915                 'ext': 'flv' if bootstrap_info is not None else None,
1916                 'protocol': 'f4m',
1917                 'tbr': tbr,
1918                 'width': width,
1919                 'height': height,
1920                 'vcodec': vcodec,
1921                 'preference': preference,
1922                 'quality': quality,
1923             })
1924         return formats
1925
1926     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1927         return {
1928             'format_id': join_nonempty(m3u8_id, 'meta'),
1929             'url': m3u8_url,
1930             'ext': ext,
1931             'protocol': 'm3u8',
1932             'preference': preference - 100 if preference else -100,
1933             'quality': quality,
1934             'resolution': 'multiple',
1935             'format_note': 'Quality selection URL',
1936         }
1937
1938     def _report_ignoring_subs(self, name):
1939         self.report_warning(bug_reports_message(
1940             f'Ignoring subtitle tracks found in the {name} manifest; '
1941             'if any subtitle tracks are missing,'
1942         ), only_once=True)
1943
1944     def _extract_m3u8_formats(self, *args, **kwargs):
1945         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1946         if subs:
1947             self._report_ignoring_subs('HLS')
1948         return fmts
1949
1950     def _extract_m3u8_formats_and_subtitles(
1951             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1952             preference=None, quality=None, m3u8_id=None, note=None,
1953             errnote=None, fatal=True, live=False, data=None, headers={},
1954             query={}):
1955
1956         if self.get_param('ignore_no_formats_error'):
1957             fatal = False
1958
1959         if not m3u8_url:
1960             if errnote is not False:
1961                 errnote = errnote or 'Failed to obtain m3u8 URL'
1962                 if fatal:
1963                     raise ExtractorError(errnote, video_id=video_id)
1964                 self.report_warning(f'{errnote}{bug_reports_message()}')
1965             return [], {}
1966
1967         res = self._download_webpage_handle(
1968             m3u8_url, video_id,
1969             note='Downloading m3u8 information' if note is None else note,
1970             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1971             fatal=fatal, data=data, headers=headers, query=query)
1972
1973         if res is False:
1974             return [], {}
1975
1976         m3u8_doc, urlh = res
1977         m3u8_url = urlh.url
1978
1979         return self._parse_m3u8_formats_and_subtitles(
1980             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1981             preference=preference, quality=quality, m3u8_id=m3u8_id,
1982             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1983             headers=headers, query=query, video_id=video_id)
1984
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """
        Parse the text of an HLS playlist into a (formats, subtitles) tuple.

        A media playlist (one containing #EXT-X-TARGETDURATION) is returned
        as is: one format per playlist index, pointing at m3u8_url or, when
        no URL is given, at a data URI built from m3u8_doc.

        For a master playlist, formats are collected from
         * #EXT-X-MEDIA tags of TYPE VIDEO or AUDIO that carry a URI, and
         * every URI line, described by the preceding #EXT-X-STREAM-INF tag,
        while #EXT-X-MEDIA tags of TYPE SUBTITLES that carry a URI fill the
        subtitles dict, keyed by LANGUAGE ('und' when absent).

        With the 'hls_split_discontinuity' param set, each playlist is split
        into one format per discontinuity (this downloads the referenced
        playlists); fatal, data and headers are used for those downloads.
        note, errnote and query are accepted but not referenced in this body.
        """
        formats, subtitles = [], {}
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are used as is, everything else is
            # resolved against the playlist URL
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        # _extract_m3u8_playlist_indices() yields the format_index values to
        # emit for one playlist: 0..N for a playlist with N discontinuity
        # tags when splitting is enabled, otherwise the single index None
        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # groups maps GROUP-ID -> list of parsed #EXT-X-MEDIA attribute dicts;
        # last_stream_inf holds the attributes of the most recent
        # #EXT-X-STREAM-INF tag until the URI line that follows it is consumed
        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Handles one #EXT-X-MEDIA line: records it in its rendition
            # group and appends to subtitles / formats as appropriate
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            # Only VIDEO and AUDIO renditions with their own URI become formats
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        # Second pass: every non-comment, non-blank line is a variant stream
        # URI described by the #EXT-X-STREAM-INF attributes seen before it
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    # No ext requested by the caller: audio only streams get
                    # 'm4a', everything else 'mp4'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Emit an additional plain http format that shares
                        # the metadata of the HLS one
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                # The stream info applies to a single URI line only
                last_stream_inf = {}
        return formats, subtitles
2203
2204     def _extract_m3u8_vod_duration(
2205             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2206
2207         m3u8_vod = self._download_webpage(
2208             m3u8_vod_url, video_id,
2209             note='Downloading m3u8 VOD manifest' if note is None else note,
2210             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2211             fatal=False, data=data, headers=headers, query=query)
2212
2213         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2214
2215     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2216         if '#EXT-X-ENDLIST' not in m3u8_vod:
2217             return None
2218
2219         return int(sum(
2220             float(line[len('#EXTINF:'):].split(',')[0])
2221             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2222
2223     def _extract_mpd_vod_duration(
2224             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2225
2226         mpd_doc = self._download_xml(
2227             mpd_url, video_id,
2228             note='Downloading MPD VOD manifest' if note is None else note,
2229             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2230             fatal=False, data=data, headers=headers, query=query)
2231         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2232             return None
2233         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2234
2235     @staticmethod
2236     def _xpath_ns(path, namespace=None):
2237         if not namespace:
2238             return path
2239         out = []
2240         for c in path.split('/'):
2241             if not c or c == '.':
2242                 out.append(c)
2243             else:
2244                 out.append('{%s}%s' % (namespace, c))
2245         return '/'.join(out)
2246
2247     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2248         if self.get_param('ignore_no_formats_error'):
2249             fatal = False
2250
2251         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2252         if res is False:
2253             assert not fatal
2254             return [], {}
2255         smil, urlh = res
2256
2257         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2258                                                       namespace=self._parse_smil_namespace(smil))
2259
2260     def _extract_smil_formats(self, *args, **kwargs):
2261         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2262         if subs:
2263             self._report_ignoring_subs('SMIL')
2264         return fmts
2265
2266     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2267         res = self._download_smil(smil_url, video_id, fatal=fatal)
2268         if res is False:
2269             return {}
2270
2271         smil, urlh = res
2272         smil_url = urlh.url
2273
2274         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2275
2276     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2277         return self._download_xml_handle(
2278             smil_url, video_id, 'Downloading SMIL file',
2279             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2280
2281     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2282         namespace = self._parse_smil_namespace(smil)
2283
2284         formats, subtitles = self._parse_smil_formats_and_subtitles(
2285             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2286
2287         video_id = os.path.splitext(url_basename(smil_url))[0]
2288         title = None
2289         description = None
2290         upload_date = None
2291         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2292             name = meta.attrib.get('name')
2293             content = meta.attrib.get('content')
2294             if not name or not content:
2295                 continue
2296             if not title and name == 'title':
2297                 title = content
2298             elif not description and name in ('description', 'abstract'):
2299                 description = content
2300             elif not upload_date and name == 'date':
2301                 upload_date = unified_strdate(content)
2302
2303         thumbnails = [{
2304             'id': image.get('type'),
2305             'url': image.get('src'),
2306             'width': int_or_none(image.get('width')),
2307             'height': int_or_none(image.get('height')),
2308         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2309
2310         return {
2311             'id': video_id,
2312             'title': title or video_id,
2313             'description': description,
2314             'upload_date': upload_date,
2315             'thumbnails': thumbnails,
2316             'formats': formats,
2317             'subtitles': subtitles,
2318         }
2319
2320     def _parse_smil_namespace(self, smil):
2321         return self._search_regex(
2322             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2323
2324     def _parse_smil_formats(self, *args, **kwargs):
2325         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2326         if subs:
2327             self._report_ignoring_subs('SMIL')
2328         return fmts
2329
2330     def _parse_smil_formats_and_subtitles(
2331             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2332         base = smil_url
2333         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2334             b = meta.get('base') or meta.get('httpBase')
2335             if b:
2336                 base = b
2337                 break
2338
2339         formats, subtitles = [], {}
2340         rtmp_count = 0
2341         http_count = 0
2342         m3u8_count = 0
2343         imgs_count = 0
2344
2345         srcs = set()
2346         media = itertools.chain.from_iterable(
2347             smil.findall(self._xpath_ns(arg, namespace))
2348             for arg in ['.//video', './/audio', './/media'])
2349         for medium in media:
2350             src = medium.get('src')
2351             if not src or src in srcs:
2352                 continue
2353             srcs.add(src)
2354
2355             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2356             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2357             width = int_or_none(medium.get('width'))
2358             height = int_or_none(medium.get('height'))
2359             proto = medium.get('proto')
2360             ext = medium.get('ext')
2361             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2362                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2363             streamer = medium.get('streamer') or base
2364
2365             if proto == 'rtmp' or streamer.startswith('rtmp'):
2366                 rtmp_count += 1
2367                 formats.append({
2368                     'url': streamer,
2369                     'play_path': src,
2370                     'ext': 'flv',
2371                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2372                     'tbr': bitrate,
2373                     'filesize': filesize,
2374                     'width': width,
2375                     'height': height,
2376                 })
2377                 if transform_rtmp_url:
2378                     streamer, src = transform_rtmp_url(streamer, src)
2379                     formats[-1].update({
2380                         'url': streamer,
2381                         'play_path': src,
2382                     })
2383                 continue
2384
2385             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2386             src_url = src_url.strip()
2387
2388             if proto == 'm3u8' or src_ext == 'm3u8':
2389                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2390                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2391                 self._merge_subtitles(m3u8_subs, target=subtitles)
2392                 if len(m3u8_formats) == 1:
2393                     m3u8_count += 1
2394                     m3u8_formats[0].update({
2395                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2396                         'tbr': bitrate,
2397                         'width': width,
2398                         'height': height,
2399                     })
2400                 formats.extend(m3u8_formats)
2401             elif src_ext == 'f4m':
2402                 f4m_url = src_url
2403                 if not f4m_params:
2404                     f4m_params = {
2405                         'hdcore': '3.2.0',
2406                         'plugin': 'flowplayer-3.2.0.1',
2407                     }
2408                 f4m_url += '&' if '?' in f4m_url else '?'
2409                 f4m_url += urllib.parse.urlencode(f4m_params)
2410                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2411             elif src_ext == 'mpd':
2412                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2413                     src_url, video_id, mpd_id='dash', fatal=False)
2414                 formats.extend(mpd_formats)
2415                 self._merge_subtitles(mpd_subs, target=subtitles)
2416             elif re.search(r'\.ism/[Mm]anifest', src_url):
2417                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2418                     src_url, video_id, ism_id='mss', fatal=False)
2419                 formats.extend(ism_formats)
2420                 self._merge_subtitles(ism_subs, target=subtitles)
2421             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2422                 http_count += 1
2423                 formats.append({
2424                     'url': src_url,
2425                     'ext': ext or src_ext or 'flv',
2426                     'format_id': 'http-%d' % (bitrate or http_count),
2427                     'tbr': bitrate,
2428                     'filesize': filesize,
2429                     'width': width,
2430                     'height': height,
2431                 })
2432
2433         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2434             src = medium.get('src')
2435             if not src or src in srcs:
2436                 continue
2437             srcs.add(src)
2438
2439             imgs_count += 1
2440             formats.append({
2441                 'format_id': 'imagestream-%d' % (imgs_count),
2442                 'url': src,
2443                 'ext': mimetype2ext(medium.get('type')),
2444                 'acodec': 'none',
2445                 'vcodec': 'none',
2446                 'width': int_or_none(medium.get('width')),
2447                 'height': int_or_none(medium.get('height')),
2448                 'format_note': 'SMIL storyboards',
2449             })
2450
2451         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2452         self._merge_subtitles(smil_subs, target=subtitles)
2453
2454         return formats, subtitles
2455
2456     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2457         urls = []
2458         subtitles = {}
2459         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2460             src = textstream.get('src')
2461             if not src or src in urls:
2462                 continue
2463             urls.append(src)
2464             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2465             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2466             subtitles.setdefault(lang, []).append({
2467                 'url': src,
2468                 'ext': ext,
2469             })
2470         return subtitles
2471
2472     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2473         res = self._download_xml_handle(
2474             xspf_url, playlist_id, 'Downloading xpsf playlist',
2475             'Unable to download xspf manifest', fatal=fatal)
2476         if res is False:
2477             return []
2478
2479         xspf, urlh = res
2480         xspf_url = urlh.url
2481
2482         return self._parse_xspf(
2483             xspf, playlist_id, xspf_url=xspf_url,
2484             xspf_base_url=base_url(xspf_url))
2485
2486     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2487         NS_MAP = {
2488             'xspf': 'http://xspf.org/ns/0/',
2489             's1': 'http://static.streamone.nl/player/ns/0',
2490         }
2491
2492         entries = []
2493         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2494             title = xpath_text(
2495                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2496             description = xpath_text(
2497                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2498             thumbnail = xpath_text(
2499                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2500             duration = float_or_none(
2501                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2502
2503             formats = []
2504             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2505                 format_url = urljoin(xspf_base_url, location.text)
2506                 if not format_url:
2507                     continue
2508                 formats.append({
2509                     'url': format_url,
2510                     'manifest_url': xspf_url,
2511                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2512                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2513                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2514                 })
2515
2516             entries.append({
2517                 'id': playlist_id,
2518                 'title': title,
2519                 'description': description,
2520                 'thumbnail': thumbnail,
2521                 'duration': duration,
2522                 'formats': formats,
2523             })
2524         return entries
2525
2526     def _extract_mpd_formats(self, *args, **kwargs):
2527         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2528         if subs:
2529             self._report_ignoring_subs('DASH')
2530         return fmts
2531
2532     def _extract_mpd_formats_and_subtitles(
2533             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2534             fatal=True, data=None, headers={}, query={}):
2535
2536         if self.get_param('ignore_no_formats_error'):
2537             fatal = False
2538
2539         res = self._download_xml_handle(
2540             mpd_url, video_id,
2541             note='Downloading MPD manifest' if note is None else note,
2542             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2543             fatal=fatal, data=data, headers=headers, query=query)
2544         if res is False:
2545             return [], {}
2546         mpd_doc, urlh = res
2547         if mpd_doc is None:
2548             return [], {}
2549
2550         # We could have been redirected to a new url when we retrieved our mpd file.
2551         mpd_url = urlh.url
2552         mpd_base_url = base_url(mpd_url)
2553
2554         return self._parse_mpd_formats_and_subtitles(
2555             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2556
2557     def _parse_mpd_formats(self, *args, **kwargs):
2558         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2559         if subs:
2560             self._report_ignoring_subs('DASH')
2561         return fmts
2562
2563     def _parse_mpd_formats_and_subtitles(
2564             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2565         """
2566         Parse formats from MPD manifest.
2567         References:
2568          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2569             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2570          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2571         """
2572         if not self.get_param('dynamic_mpd', True):
2573             if mpd_doc.get('type') == 'dynamic':
2574                 return [], {}
2575
2576         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2577
2578         def _add_ns(path):
2579             return self._xpath_ns(path, namespace)
2580
2581         def is_drm_protected(element):
2582             return element.find(_add_ns('ContentProtection')) is not None
2583
2584         def extract_multisegment_info(element, ms_parent_info):
2585             ms_info = ms_parent_info.copy()
2586
2587             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2588             # common attributes and elements.  We will only extract relevant
2589             # for us.
2590             def extract_common(source):
2591                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2592                 if segment_timeline is not None:
2593                     s_e = segment_timeline.findall(_add_ns('S'))
2594                     if s_e:
2595                         ms_info['total_number'] = 0
2596                         ms_info['s'] = []
2597                         for s in s_e:
2598                             r = int(s.get('r', 0))
2599                             ms_info['total_number'] += 1 + r
2600                             ms_info['s'].append({
2601                                 't': int(s.get('t', 0)),
2602                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2603                                 'd': int(s.attrib['d']),
2604                                 'r': r,
2605                             })
2606                 start_number = source.get('startNumber')
2607                 if start_number:
2608                     ms_info['start_number'] = int(start_number)
2609                 timescale = source.get('timescale')
2610                 if timescale:
2611                     ms_info['timescale'] = int(timescale)
2612                 segment_duration = source.get('duration')
2613                 if segment_duration:
2614                     ms_info['segment_duration'] = float(segment_duration)
2615
2616             def extract_Initialization(source):
2617                 initialization = source.find(_add_ns('Initialization'))
2618                 if initialization is not None:
2619                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2620
2621             segment_list = element.find(_add_ns('SegmentList'))
2622             if segment_list is not None:
2623                 extract_common(segment_list)
2624                 extract_Initialization(segment_list)
2625                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2626                 if segment_urls_e:
2627                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2628             else:
2629                 segment_template = element.find(_add_ns('SegmentTemplate'))
2630                 if segment_template is not None:
2631                     extract_common(segment_template)
2632                     media = segment_template.get('media')
2633                     if media:
2634                         ms_info['media'] = media
2635                     initialization = segment_template.get('initialization')
2636                     if initialization:
2637                         ms_info['initialization'] = initialization
2638                     else:
2639                         extract_Initialization(segment_template)
2640             return ms_info
2641
2642         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2643         formats, subtitles = [], {}
2644         stream_numbers = collections.defaultdict(int)
2645         for period in mpd_doc.findall(_add_ns('Period')):
2646             period_duration = parse_duration(period.get('duration')) or mpd_duration
2647             period_ms_info = extract_multisegment_info(period, {
2648                 'start_number': 1,
2649                 'timescale': 1,
2650             })
2651             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2652                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2653                 for representation in adaptation_set.findall(_add_ns('Representation')):
2654                     representation_attrib = adaptation_set.attrib.copy()
2655                     representation_attrib.update(representation.attrib)
2656                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2657                     mime_type = representation_attrib['mimeType']
2658                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2659
2660                     codec_str = representation_attrib.get('codecs', '')
2661                     # Some kind of binary subtitle found in some youtube livestreams
2662                     if mime_type == 'application/x-rawcc':
2663                         codecs = {'scodec': codec_str}
2664                     else:
2665                         codecs = parse_codecs(codec_str)
2666                     if content_type not in ('video', 'audio', 'text'):
2667                         if mime_type == 'image/jpeg':
2668                             content_type = mime_type
2669                         elif codecs.get('vcodec', 'none') != 'none':
2670                             content_type = 'video'
2671                         elif codecs.get('acodec', 'none') != 'none':
2672                             content_type = 'audio'
2673                         elif codecs.get('scodec', 'none') != 'none':
2674                             content_type = 'text'
2675                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2676                             content_type = 'text'
2677                         else:
2678                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2679                             continue
2680
2681                     base_url = ''
2682                     for element in (representation, adaptation_set, period, mpd_doc):
2683                         base_url_e = element.find(_add_ns('BaseURL'))
2684                         if try_call(lambda: base_url_e.text) is not None:
2685                             base_url = base_url_e.text + base_url
2686                             if re.match(r'^https?://', base_url):
2687                                 break
2688                     if mpd_base_url and base_url.startswith('/'):
2689                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2690                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2691                         if not mpd_base_url.endswith('/'):
2692                             mpd_base_url += '/'
2693                         base_url = mpd_base_url + base_url
2694                     representation_id = representation_attrib.get('id')
2695                     lang = representation_attrib.get('lang')
2696                     url_el = representation.find(_add_ns('BaseURL'))
2697                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2698                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2699                     if representation_id is not None:
2700                         format_id = representation_id
2701                     else:
2702                         format_id = content_type
2703                     if mpd_id:
2704                         format_id = mpd_id + '-' + format_id
2705                     if content_type in ('video', 'audio'):
2706                         f = {
2707                             'format_id': format_id,
2708                             'manifest_url': mpd_url,
2709                             'ext': mimetype2ext(mime_type),
2710                             'width': int_or_none(representation_attrib.get('width')),
2711                             'height': int_or_none(representation_attrib.get('height')),
2712                             'tbr': float_or_none(bandwidth, 1000),
2713                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2714                             'fps': int_or_none(representation_attrib.get('frameRate')),
2715                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2716                             'format_note': 'DASH %s' % content_type,
2717                             'filesize': filesize,
2718                             'container': mimetype2ext(mime_type) + '_dash',
2719                             **codecs
2720                         }
2721                     elif content_type == 'text':
2722                         f = {
2723                             'ext': mimetype2ext(mime_type),
2724                             'manifest_url': mpd_url,
2725                             'filesize': filesize,
2726                         }
2727                     elif content_type == 'image/jpeg':
2728                         # See test case in VikiIE
2729                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2730                         f = {
2731                             'format_id': format_id,
2732                             'ext': 'mhtml',
2733                             'manifest_url': mpd_url,
2734                             'format_note': 'DASH storyboards (jpeg)',
2735                             'acodec': 'none',
2736                             'vcodec': 'none',
2737                         }
2738                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2739                         f['has_drm'] = True
2740                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2741
2742                     def prepare_template(template_name, identifiers):
2743                         tmpl = representation_ms_info[template_name]
2744                         if representation_id is not None:
2745                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2746                         # First of, % characters outside $...$ templates
2747                         # must be escaped by doubling for proper processing
2748                         # by % operator string formatting used further (see
2749                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2750                         t = ''
2751                         in_template = False
2752                         for c in tmpl:
2753                             t += c
2754                             if c == '$':
2755                                 in_template = not in_template
2756                             elif c == '%' and not in_template:
2757                                 t += c
2758                         # Next, $...$ templates are translated to their
2759                         # %(...) counterparts to be used with % operator
2760                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2761                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2762                         t.replace('$$', '$')
2763                         return t
2764
2765                     # @initialization is a regular template like @media one
2766                     # so it should be handled just the same way (see
2767                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2768                     if 'initialization' in representation_ms_info:
2769                         initialization_template = prepare_template(
2770                             'initialization',
2771                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2772                             # $Time$ shall not be included for @initialization thus
2773                             # only $Bandwidth$ remains
2774                             ('Bandwidth', ))
2775                         representation_ms_info['initialization_url'] = initialization_template % {
2776                             'Bandwidth': bandwidth,
2777                         }
2778
2779                     def location_key(location):
2780                         return 'url' if re.match(r'^https?://', location) else 'path'
2781
2782                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2783
2784                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2785                         media_location_key = location_key(media_template)
2786
2787                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2788                         # can't be used at the same time
2789                         if '%(Number' in media_template and 's' not in representation_ms_info:
2790                             segment_duration = None
2791                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2792                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2793                                 representation_ms_info['total_number'] = int(math.ceil(
2794                                     float_or_none(period_duration, segment_duration, default=0)))
2795                             representation_ms_info['fragments'] = [{
2796                                 media_location_key: media_template % {
2797                                     'Number': segment_number,
2798                                     'Bandwidth': bandwidth,
2799                                 },
2800                                 'duration': segment_duration,
2801                             } for segment_number in range(
2802                                 representation_ms_info['start_number'],
2803                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2804                         else:
2805                             # $Number*$ or $Time$ in media template with S list available
2806                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2807                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2808                             representation_ms_info['fragments'] = []
2809                             segment_time = 0
2810                             segment_d = None
2811                             segment_number = representation_ms_info['start_number']
2812
2813                             def add_segment_url():
2814                                 segment_url = media_template % {
2815                                     'Time': segment_time,
2816                                     'Bandwidth': bandwidth,
2817                                     'Number': segment_number,
2818                                 }
2819                                 representation_ms_info['fragments'].append({
2820                                     media_location_key: segment_url,
2821                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2822                                 })
2823
2824                             for num, s in enumerate(representation_ms_info['s']):
2825                                 segment_time = s.get('t') or segment_time
2826                                 segment_d = s['d']
2827                                 add_segment_url()
2828                                 segment_number += 1
2829                                 for r in range(s.get('r', 0)):
2830                                     segment_time += segment_d
2831                                     add_segment_url()
2832                                     segment_number += 1
2833                                 segment_time += segment_d
2834                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2835                         # No media template,
2836                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2837                         # or any YouTube dashsegments video
2838                         fragments = []
2839                         segment_index = 0
2840                         timescale = representation_ms_info['timescale']
2841                         for s in representation_ms_info['s']:
2842                             duration = float_or_none(s['d'], timescale)
2843                             for r in range(s.get('r', 0) + 1):
2844                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2845                                 fragments.append({
2846                                     location_key(segment_uri): segment_uri,
2847                                     'duration': duration,
2848                                 })
2849                                 segment_index += 1
2850                         representation_ms_info['fragments'] = fragments
2851                     elif 'segment_urls' in representation_ms_info:
2852                         # Segment URLs with no SegmentTimeline
2853                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2854                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2855                         fragments = []
2856                         segment_duration = float_or_none(
2857                             representation_ms_info['segment_duration'],
2858                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2859                         for segment_url in representation_ms_info['segment_urls']:
2860                             fragment = {
2861                                 location_key(segment_url): segment_url,
2862                             }
2863                             if segment_duration:
2864                                 fragment['duration'] = segment_duration
2865                             fragments.append(fragment)
2866                         representation_ms_info['fragments'] = fragments
2867                     # If there is a fragments key available then we correctly recognized fragmented media.
2868                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2869                     # assumption is not necessarily correct since we may simply have no support for
2870                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2871                     if 'fragments' in representation_ms_info:
2872                         f.update({
2873                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2874                             'url': mpd_url or base_url,
2875                             'fragment_base_url': base_url,
2876                             'fragments': [],
2877                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2878                         })
2879                         if 'initialization_url' in representation_ms_info:
2880                             initialization_url = representation_ms_info['initialization_url']
2881                             if not f.get('url'):
2882                                 f['url'] = initialization_url
2883                             f['fragments'].append({location_key(initialization_url): initialization_url})
2884                         f['fragments'].extend(representation_ms_info['fragments'])
2885                         if not period_duration:
2886                             period_duration = try_get(
2887                                 representation_ms_info,
2888                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2889                     else:
2890                         # Assuming direct URL to unfragmented media.
2891                         f['url'] = base_url
2892                     if content_type in ('video', 'audio', 'image/jpeg'):
2893                         f['manifest_stream_number'] = stream_numbers[f['url']]
2894                         stream_numbers[f['url']] += 1
2895                         formats.append(f)
2896                     elif content_type == 'text':
2897                         subtitles.setdefault(lang or 'und', []).append(f)
2898
2899         return formats, subtitles
2900
2901     def _extract_ism_formats(self, *args, **kwargs):
2902         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2903         if subs:
2904             self._report_ignoring_subs('ISM')
2905         return fmts
2906
2907     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2908         if self.get_param('ignore_no_formats_error'):
2909             fatal = False
2910
2911         res = self._download_xml_handle(
2912             ism_url, video_id,
2913             note='Downloading ISM manifest' if note is None else note,
2914             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2915             fatal=fatal, data=data, headers=headers, query=query)
2916         if res is False:
2917             return [], {}
2918         ism_doc, urlh = res
2919         if ism_doc is None:
2920             return [], {}
2921
2922         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2923
2924     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2925         """
2926         Parse formats from ISM manifest.
2927         References:
2928          1. [MS-SSTR]: Smooth Streaming Protocol,
2929             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2930         """
2931         if ism_doc.get('IsLive') == 'TRUE':
2932             return [], {}
2933
2934         duration = int(ism_doc.attrib['Duration'])
2935         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2936
2937         formats = []
2938         subtitles = {}
2939         for stream in ism_doc.findall('StreamIndex'):
2940             stream_type = stream.get('Type')
2941             if stream_type not in ('video', 'audio', 'text'):
2942                 continue
2943             url_pattern = stream.attrib['Url']
2944             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2945             stream_name = stream.get('Name')
2946             stream_language = stream.get('Language', 'und')
2947             for track in stream.findall('QualityLevel'):
2948                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2949                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2950                 # TODO: add support for WVC1 and WMAP
2951                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2952                     self.report_warning('%s is not a supported codec' % fourcc)
2953                     continue
2954                 tbr = int(track.attrib['Bitrate']) // 1000
2955                 # [1] does not mention Width and Height attributes. However,
2956                 # they're often present while MaxWidth and MaxHeight are
2957                 # missing, so should be used as fallbacks
2958                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2959                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2960                 sampling_rate = int_or_none(track.get('SamplingRate'))
2961
2962                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2963                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2964
2965                 fragments = []
2966                 fragment_ctx = {
2967                     'time': 0,
2968                 }
2969                 stream_fragments = stream.findall('c')
2970                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2971                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2972                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2973                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2974                     if not fragment_ctx['duration']:
2975                         try:
2976                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2977                         except IndexError:
2978                             next_fragment_time = duration
2979                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2980                     for _ in range(fragment_repeat):
2981                         fragments.append({
2982                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2983                             'duration': fragment_ctx['duration'] / stream_timescale,
2984                         })
2985                         fragment_ctx['time'] += fragment_ctx['duration']
2986
2987                 if stream_type == 'text':
2988                     subtitles.setdefault(stream_language, []).append({
2989                         'ext': 'ismt',
2990                         'protocol': 'ism',
2991                         'url': ism_url,
2992                         'manifest_url': ism_url,
2993                         'fragments': fragments,
2994                         '_download_params': {
2995                             'stream_type': stream_type,
2996                             'duration': duration,
2997                             'timescale': stream_timescale,
2998                             'fourcc': fourcc,
2999                             'language': stream_language,
3000                             'codec_private_data': track.get('CodecPrivateData'),
3001                         }
3002                     })
3003                 elif stream_type in ('video', 'audio'):
3004                     formats.append({
3005                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3006                         'url': ism_url,
3007                         'manifest_url': ism_url,
3008                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3009                         'width': width,
3010                         'height': height,
3011                         'tbr': tbr,
3012                         'asr': sampling_rate,
3013                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3014                         'acodec': 'none' if stream_type == 'video' else fourcc,
3015                         'protocol': 'ism',
3016                         'fragments': fragments,
3017                         'has_drm': ism_doc.find('Protection') is not None,
3018                         'language': stream_language,
3019                         'audio_channels': int_or_none(track.get('Channels')),
3020                         '_download_params': {
3021                             'stream_type': stream_type,
3022                             'duration': duration,
3023                             'timescale': stream_timescale,
3024                             'width': width or 0,
3025                             'height': height or 0,
3026                             'fourcc': fourcc,
3027                             'language': stream_language,
3028                             'codec_private_data': track.get('CodecPrivateData'),
3029                             'sampling_rate': sampling_rate,
3030                             'channels': int_or_none(track.get('Channels', 2)),
3031                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3032                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3033                         },
3034                     })
3035         return formats, subtitles
3036
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Parse HTML5-style media tags in a webpage into a list of media entries

        Handles <video>/<audio> tags as well as their amp-*, dl8-* and
        dl8-live-* counterparts, including nested <source> and <track> tags.

        @param base_url             URL that relative media URLs are resolved
                                    against; also set as the Referer header
                                    of every extracted format
        @param webpage              HTML to search for media tags
        @param video_id             ID passed on to the manifest extractors
        @param m3u8_id              m3u8_id passed to _extract_m3u8_formats
        @param m3u8_entry_protocol  entry_protocol passed to _extract_m3u8_formats
        @param mpd_id               mpd_id passed to _extract_mpd_formats
        @param preference, quality  passed on to _extract_m3u8_formats
        @returns                    list of dicts with "formats", "subtitles"
                                    and "thumbnail" keys; tags that yield
                                    neither formats nor subtitles are left out
        """
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a "type" attribute (mimetype with optional codecs parameter)
            # into a partial format dict; empty dict if it cannot be parsed
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats). Manifests (m3u8/mpd) are expanded
            # into their formats; anything else becomes a single direct-URL format
            type_info = type_info or {}
            full_url = absolute_url(src)
            # The extension from the type attribute wins over the one in the URL
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no inner content, hence the ''
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        # Then tags with a separate closing tag: (opening tag, tag name, media type, inner content)
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # Source given directly on the media tag itself
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fall back to a resolution parsed from the labels
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # The first label that parses as a bitrate gives tbr
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The direct-URL format is applied last, so its url,
                        # vcodec and ext override what was set above
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # A track without "kind" is handled like a subtitles track
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3162
3163     def _extract_akamai_formats(self, *args, **kwargs):
3164         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3165         if subs:
3166             self._report_ignoring_subs('akamai')
3167         return fmts
3168
3169     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3170         signed = 'hdnea=' in manifest_url
3171         if not signed:
3172             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3173             manifest_url = re.sub(
3174                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3175                 '', manifest_url).strip('?')
3176
3177         formats = []
3178         subtitles = {}
3179
3180         hdcore_sign = 'hdcore=3.7.0'
3181         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3182         hds_host = hosts.get('hds')
3183         if hds_host:
3184             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3185         if 'hdcore=' not in f4m_url:
3186             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3187         f4m_formats = self._extract_f4m_formats(
3188             f4m_url, video_id, f4m_id='hds', fatal=False)
3189         for entry in f4m_formats:
3190             entry.update({'extra_param_to_segment_url': hdcore_sign})
3191         formats.extend(f4m_formats)
3192
3193         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3194         hls_host = hosts.get('hls')
3195         if hls_host:
3196             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3197         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3198             m3u8_url, video_id, 'mp4', 'm3u8_native',
3199             m3u8_id='hls', fatal=False)
3200         formats.extend(m3u8_formats)
3201         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3202
3203         http_host = hosts.get('http')
3204         if http_host and m3u8_formats and not signed:
3205             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3206             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3207             qualities_length = len(qualities)
3208             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3209                 i = 0
3210                 for f in m3u8_formats:
3211                     if f['vcodec'] != 'none':
3212                         for protocol in ('http', 'https'):
3213                             http_f = f.copy()
3214                             del http_f['manifest_url']
3215                             http_url = re.sub(
3216                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3217                             http_f.update({
3218                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3219                                 'url': http_url,
3220                                 'protocol': protocol,
3221                             })
3222                             formats.append(http_f)
3223                         i += 1
3224
3225         return formats, subtitles
3226
3227     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3228         query = urllib.parse.urlparse(url).query
3229         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3230         mobj = re.search(
3231             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3232         url_base = mobj.group('url')
3233         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3234         formats = []
3235
3236         def manifest_url(manifest):
3237             m_url = f'{http_base_url}/{manifest}'
3238             if query:
3239                 m_url += '?%s' % query
3240             return m_url
3241
3242         if 'm3u8' not in skip_protocols:
3243             formats.extend(self._extract_m3u8_formats(
3244                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3245                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3246         if 'f4m' not in skip_protocols:
3247             formats.extend(self._extract_f4m_formats(
3248                 manifest_url('manifest.f4m'),
3249                 video_id, f4m_id='hds', fatal=False))
3250         if 'dash' not in skip_protocols:
3251             formats.extend(self._extract_mpd_formats(
3252                 manifest_url('manifest.mpd'),
3253                 video_id, mpd_id='dash', fatal=False))
3254         if re.search(r'(?:/smil:|\.smil)', url_base):
3255             if 'smil' not in skip_protocols:
3256                 rtmp_formats = self._extract_smil_formats(
3257                     manifest_url('jwplayer.smil'),
3258                     video_id, fatal=False)
3259                 for rtmp_format in rtmp_formats:
3260                     rtsp_format = rtmp_format.copy()
3261                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3262                     del rtsp_format['play_path']
3263                     del rtsp_format['ext']
3264                     rtsp_format.update({
3265                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3266                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3267                         'protocol': 'rtsp',
3268                     })
3269                     formats.extend([rtmp_format, rtsp_format])
3270         else:
3271             for protocol in ('rtmp', 'rtsp'):
3272                 if protocol not in skip_protocols:
3273                     formats.append({
3274                         'url': f'{protocol}:{url_base}',
3275                         'format_id': protocol,
3276                         'protocol': protocol,
3277                     })
3278         return formats
3279
3280     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3281         mobj = re.search(
3282             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3283             webpage)
3284         if mobj:
3285             try:
3286                 jwplayer_data = self._parse_json(mobj.group('options'),
3287                                                  video_id=video_id,
3288                                                  transform_source=transform_source)
3289             except ExtractorError:
3290                 pass
3291             else:
3292                 if isinstance(jwplayer_data, dict):
3293                     return jwplayer_data
3294
3295     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3296         jwplayer_data = self._find_jwplayer_data(
3297             webpage, video_id, transform_source=js_to_json)
3298         return self._parse_jwplayer_data(
3299             jwplayer_data, video_id, *args, **kwargs)
3300
3301     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3302                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3303         entries = []
3304         if not isinstance(jwplayer_data, dict):
3305             return entries
3306
3307         playlist_items = jwplayer_data.get('playlist')
3308         # JWPlayer backward compatibility: single playlist item/flattened playlists
3309         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3310         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3311         if not isinstance(playlist_items, list):
3312             playlist_items = (playlist_items or jwplayer_data, )
3313
3314         for video_data in playlist_items:
3315             if not isinstance(video_data, dict):
3316                 continue
3317             # JWPlayer backward compatibility: flattened sources
3318             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3319             if 'sources' not in video_data:
3320                 video_data['sources'] = [video_data]
3321
3322             this_video_id = video_id or video_data['mediaid']
3323
3324             formats = self._parse_jwplayer_formats(
3325                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3326                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3327
3328             subtitles = {}
3329             tracks = video_data.get('tracks')
3330             if tracks and isinstance(tracks, list):
3331                 for track in tracks:
3332                     if not isinstance(track, dict):
3333                         continue
3334                     track_kind = track.get('kind')
3335                     if not track_kind or not isinstance(track_kind, str):
3336                         continue
3337                     if track_kind.lower() not in ('captions', 'subtitles'):
3338                         continue
3339                     track_url = urljoin(base_url, track.get('file'))
3340                     if not track_url:
3341                         continue
3342                     subtitles.setdefault(track.get('label') or 'en', []).append({
3343                         'url': self._proto_relative_url(track_url)
3344                     })
3345
3346             entry = {
3347                 'id': this_video_id,
3348                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3349                 'description': clean_html(video_data.get('description')),
3350                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3351                 'timestamp': int_or_none(video_data.get('pubdate')),
3352                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3353                 'subtitles': subtitles,
3354                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3355                 'genre': clean_html(video_data.get('genre')),
3356                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3357                 'season_number': int_or_none(video_data.get('season')),
3358                 'episode_number': int_or_none(video_data.get('episode')),
3359                 'release_year': int_or_none(video_data.get('releasedate')),
3360                 'age_limit': int_or_none(video_data.get('age_restriction')),
3361             }
3362             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3363             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3364                 entry.update({
3365                     '_type': 'url_transparent',
3366                     'url': formats[0]['url'],
3367                 })
3368             else:
3369                 entry['formats'] = formats
3370             entries.append(entry)
3371         if len(entries) == 1:
3372             return entries[0]
3373         else:
3374             return self.playlist_result(entries)
3375
3376     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3377                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3378         urls = set()
3379         formats = []
3380         for source in jwplayer_sources_data:
3381             if not isinstance(source, dict):
3382                 continue
3383             source_url = urljoin(
3384                 base_url, self._proto_relative_url(source.get('file')))
3385             if not source_url or source_url in urls:
3386                 continue
3387             urls.add(source_url)
3388             source_type = source.get('type') or ''
3389             ext = mimetype2ext(source_type) or determine_ext(source_url)
3390             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3391                 formats.extend(self._extract_m3u8_formats(
3392                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3393                     m3u8_id=m3u8_id, fatal=False))
3394             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3395                 formats.extend(self._extract_mpd_formats(
3396                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3397             elif ext == 'smil':
3398                 formats.extend(self._extract_smil_formats(
3399                     source_url, video_id, fatal=False))
3400             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3401             elif source_type.startswith('audio') or ext in (
3402                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3403                 formats.append({
3404                     'url': source_url,
3405                     'vcodec': 'none',
3406                     'ext': ext,
3407                 })
3408             else:
3409                 format_id = str_or_none(source.get('label'))
3410                 height = int_or_none(source.get('height'))
3411                 if height is None and format_id:
3412                     # Often no height is provided but there is a label in
3413                     # format like "1080p", "720p SD", or 1080.
3414                     height = parse_resolution(format_id).get('height')
3415                 a_format = {
3416                     'url': source_url,
3417                     'width': int_or_none(source.get('width')),
3418                     'height': height,
3419                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3420                     'filesize': int_or_none(source.get('filesize')),
3421                     'ext': ext,
3422                     'format_id': format_id
3423                 }
3424                 if source_url.startswith('rtmp'):
3425                     a_format['ext'] = 'flv'
3426                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3427                     # of jwplayer.flash.swf
3428                     rtmp_url_parts = re.split(
3429                         r'((?:mp4|mp3|flv):)', source_url, 1)
3430                     if len(rtmp_url_parts) == 3:
3431                         rtmp_url, prefix, play_path = rtmp_url_parts
3432                         a_format.update({
3433                             'url': rtmp_url,
3434                             'play_path': prefix + play_path,
3435                         })
3436                     if rtmp_params:
3437                         a_format.update(rtmp_params)
3438                 formats.append(a_format)
3439         return formats
3440
3441     def _live_title(self, name):
3442         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3443         return name
3444
3445     def _int(self, v, name, fatal=False, **kwargs):
3446         res = int_or_none(v, **kwargs)
3447         if res is None:
3448             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3449             if fatal:
3450                 raise ExtractorError(msg)
3451             else:
3452                 self.report_warning(msg)
3453         return res
3454
3455     def _float(self, v, name, fatal=False, **kwargs):
3456         res = float_or_none(v, **kwargs)
3457         if res is None:
3458             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3459             if fatal:
3460                 raise ExtractorError(msg)
3461             else:
3462                 self.report_warning(msg)
3463         return res
3464
3465     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3466                     path='/', secure=False, discard=False, rest={}, **kwargs):
3467         cookie = http.cookiejar.Cookie(
3468             0, name, value, port, port is not None, domain, True,
3469             domain.startswith('.'), path, True, secure, expire_time,
3470             discard, None, None, rest)
3471         self.cookiejar.set_cookie(cookie)
3472
3473     def _get_cookies(self, url):
3474         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3475         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3476
3477     def _apply_first_set_cookie_header(self, url_handle, cookie):
3478         """
3479         Apply first Set-Cookie header instead of the last. Experimental.
3480
3481         Some sites (e.g. [1-3]) may serve two cookies under the same name
3482         in Set-Cookie header and expect the first (old) one to be set rather
3483         than second (new). However, as of RFC6265 the newer one cookie
3484         should be set into cookie store what actually happens.
3485         We will workaround this issue by resetting the cookie to
3486         the first one manually.
3487         1. https://new.vk.com/
3488         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3489         3. https://learning.oreilly.com/
3490         """
3491         for header, cookies in url_handle.headers.items():
3492             if header.lower() != 'set-cookie':
3493                 continue
3494             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3495             cookie_value = re.search(
3496                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3497             if cookie_value:
3498                 value, domain = cookie_value.groups()
3499                 self._set_cookie(domain, cookie, value)
3500                 break
3501
3502     @classmethod
3503     def get_testcases(cls, include_onlymatching=False):
3504         # Do not look in super classes
3505         t = vars(cls).get('_TEST')
3506         if t:
3507             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3508             tests = [t]
3509         else:
3510             tests = vars(cls).get('_TESTS', [])
3511         for t in tests:
3512             if not include_onlymatching and t.get('only_matching', False):
3513                 continue
3514             t['name'] = cls.ie_key()
3515             yield t
3516         if getattr(cls, '__wrapped__', None):
3517             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3518
3519     @classmethod
3520     def get_webpage_testcases(cls):
3521         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3522         for t in tests:
3523             t['name'] = cls.ie_key()
3524             yield t
3525         if getattr(cls, '__wrapped__', None):
3526             yield from cls.__wrapped__.get_webpage_testcases()
3527
3528     @classproperty(cache=True)
3529     def age_limit(cls):
3530         """Get age limit from the testcases"""
3531         return max(traverse_obj(
3532             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3533             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3534
3535     @classproperty(cache=True)
3536     def _RETURN_TYPE(cls):
3537         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3538         tests = tuple(cls.get_testcases(include_onlymatching=False))
3539         if not tests:
3540             return None
3541         elif not any(k.startswith('playlist') for test in tests for k in test):
3542             return 'video'
3543         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3544             return 'playlist'
3545         return 'any'
3546
3547     @classmethod
3548     def is_single_video(cls, url):
3549         """Returns whether the URL is of a single video, None if unknown"""
3550         if cls.suitable(url):
3551             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3552
3553     @classmethod
3554     def is_suitable(cls, age_limit):
3555         """Test whether the extractor is generally suitable for the given age limit"""
3556         return not age_restricted(cls.age_limit, age_limit)
3557
3558     @classmethod
3559     def description(cls, *, markdown=True, search_examples=None):
3560         """Description of the extractor"""
3561         desc = ''
3562         if cls._NETRC_MACHINE:
3563             if markdown:
3564                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3565             else:
3566                 desc += f' [{cls._NETRC_MACHINE}]'
3567         if cls.IE_DESC is False:
3568             desc += ' [HIDDEN]'
3569         elif cls.IE_DESC:
3570             desc += f' {cls.IE_DESC}'
3571         if cls.SEARCH_KEY:
3572             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3573             if search_examples:
3574                 _COUNTS = ('', '5', '10', 'all')
3575                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3576         if not cls.working():
3577             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3578
3579         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3580         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3581         return f'{name}:{desc}' if desc else name
3582
3583     def extract_subtitles(self, *args, **kwargs):
3584         if (self.get_param('writesubtitles', False)
3585                 or self.get_param('listsubtitles')):
3586             return self._get_subtitles(*args, **kwargs)
3587         return {}
3588
3589     def _get_subtitles(self, *args, **kwargs):
3590         raise NotImplementedError('This method must be implemented by subclasses')
3591
    class CommentsDisabled(Exception):
        """
        Raise in _get_comments if comments are disabled for the video

        The callable returned by extract_comments catches it and reports both
        "comments" and "comment_count" as None.
        """
3594
3595     def extract_comments(self, *args, **kwargs):
3596         if not self.get_param('getcomments'):
3597             return None
3598         generator = self._get_comments(*args, **kwargs)
3599
3600         def extractor():
3601             comments = []
3602             interrupted = True
3603             try:
3604                 while True:
3605                     comments.append(next(generator))
3606             except StopIteration:
3607                 interrupted = False
3608             except KeyboardInterrupt:
3609                 self.to_screen('Interrupted by user')
3610             except self.CommentsDisabled:
3611                 return {'comments': None, 'comment_count': None}
3612             except Exception as e:
3613                 if self.get_param('ignoreerrors') is not True:
3614                     raise
3615                 self._downloader.report_error(e)
3616             comment_count = len(comments)
3617             self.to_screen(f'Extracted {comment_count} comments')
3618             return {
3619                 'comments': comments,
3620                 'comment_count': None if interrupted else comment_count
3621             }
3622         return extractor
3623
    def _get_comments(self, *args, **kwargs):
        """Hook invoked by extract_comments with its arguments; this base
        implementation always raises NotImplementedError."""
        raise NotImplementedError('This method must be implemented by subclasses')
3626
3627     @staticmethod
3628     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3629         """ Merge subtitle items for one language. Items with duplicated URLs/data
3630         will be dropped. """
3631         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3632         ret = list(subtitle_list1)
3633         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3634         return ret
3635
3636     @classmethod
3637     def _merge_subtitles(cls, *dicts, target=None):
3638         """ Merge subtitle dictionaries, language by language. """
3639         if target is None:
3640             target = {}
3641         for d in dicts:
3642             for lang, subs in d.items():
3643                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3644         return target
3645
3646     def extract_automatic_captions(self, *args, **kwargs):
3647         if (self.get_param('writeautomaticsub', False)
3648                 or self.get_param('listsubtitles')):
3649             return self._get_automatic_captions(*args, **kwargs)
3650         return {}
3651
    def _get_automatic_captions(self, *args, **kwargs):
        """Hook invoked by extract_automatic_captions with its arguments; this
        base implementation always raises NotImplementedError."""
        raise NotImplementedError('This method must be implemented by subclasses')
3654
3655     @functools.cached_property
3656     def _cookies_passed(self):
3657         """Whether cookies have been passed to YoutubeDL"""
3658         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3659
3660     def mark_watched(self, *args, **kwargs):
3661         if not self.get_param('mark_watched', False):
3662             return
3663         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3664             self._mark_watched(*args, **kwargs)
3665
    def _mark_watched(self, *args, **kwargs):
        """Hook invoked by mark_watched with its arguments; this base
        implementation always raises NotImplementedError."""
        raise NotImplementedError('This method must be implemented by subclasses')
3668
3669     def geo_verification_headers(self):
3670         headers = {}
3671         geo_verification_proxy = self.get_param('geo_verification_proxy')
3672         if geo_verification_proxy:
3673             headers['Ytdl-request-proxy'] = geo_verification_proxy
3674         return headers
3675
3676     @staticmethod
3677     def _generic_id(url):
3678         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3679
3680     def _generic_title(self, url='', webpage='', *, default=None):
3681         return (self._og_search_title(webpage, default=None)
3682                 or self._html_extract_title(webpage, default=None)
3683                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3684                 or default)
3685
3686     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3687         if not duration:
3688             return
3689         chapter_list = [{
3690             'start_time': start_function(chapter),
3691             'title': title_function(chapter),
3692         } for chapter in chapter_list or []]
3693         if strict:
3694             warn = self.report_warning
3695         else:
3696             warn = self.write_debug
3697             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3698
3699         chapters = [{'start_time': 0}]
3700         for idx, chapter in enumerate(chapter_list):
3701             if chapter['start_time'] is None:
3702                 warn(f'Incomplete chapter {idx}')
3703             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3704                 chapters.append(chapter)
3705             elif chapter not in chapters:
3706                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3707                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3708                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3709         return chapters[1:]
3710
3711     def _extract_chapters_from_description(self, description, duration):
3712         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3713         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3714         return self._extract_chapters_helper(
3715             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3716             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3717             duration=duration, strict=False) or self._extract_chapters_helper(
3718             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3719             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3720             duration=duration, strict=False)
3721
3722     @staticmethod
3723     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3724         all_known = all(map(
3725             lambda x: x is not None,
3726             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3727         return (
3728             'private' if is_private
3729             else 'premium_only' if needs_premium
3730             else 'subscriber_only' if needs_subscription
3731             else 'needs_auth' if needs_auth
3732             else 'unlisted' if is_unlisted
3733             else 'public' if all_known
3734             else None)
3735
3736     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3737         '''
3738         @returns            A list of values for the extractor argument given by "key"
3739                             or "default" if no such key is present
3740         @param default      The default value to return when the key is not present (default: [])
3741         @param casesense    When false, the values are converted to lower case
3742         '''
3743         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3744         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3745         if val is None:
3746             return [] if default is NO_DEFAULT else default
3747         return list(val) if casesense else [x.lower() for x in val]
3748
3749     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3750         if not playlist_id or not video_id:
3751             return not video_id
3752
3753         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3754         if no_playlist is not None:
3755             return not no_playlist
3756
3757         video_id = '' if video_id is True else f' {video_id}'
3758         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3759         if self.get_param('noplaylist'):
3760             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3761             return False
3762         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3763         return True
3764
3765     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3766         RetryManager.report_retry(
3767             err, _count or int(fatal), _retries,
3768             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3769             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3770
    def RetryManager(self, **kwargs):
        """Create a RetryManager from the `extractor_retries` parameter (3 when unset),
        passing `_error_or_warning` as the second positional argument and
        forwarding any keyword arguments."""
        # Inside this body the name RetryManager resolves to the module-level
        # object imported from ..utils, not to this method
        return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3773
3774     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3775         display_id = traverse_obj(info_dict, 'display_id', 'id')
3776         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3777         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3778             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3779
3780     @classmethod
3781     def extract_from_webpage(cls, ydl, url, webpage):
3782         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3783               else ydl.get_info_extractor(cls.ie_key()))
3784         for info in ie._extract_from_webpage(url, webpage) or []:
3785             # url = None since we do not want to set (webpage/original)_url
3786             ydl.add_default_extra_info(info, ie, None)
3787             yield info
3788
3789     @classmethod
3790     def _extract_from_webpage(cls, url, webpage):
3791         for embed_url in orderedSet(
3792                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3793             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3794
3795     @classmethod
3796     def _extract_embed_urls(cls, url, webpage):
3797         """@returns all the embed urls on the webpage"""
3798         if '_EMBED_URL_RE' not in cls.__dict__:
3799             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3800             for idx, regex in enumerate(cls._EMBED_REGEX):
3801                 assert regex.count('(?P<url>') == 1, \
3802                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3803             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3804
3805         for regex in cls._EMBED_URL_RE:
3806             for mobj in regex.finditer(webpage):
3807                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3808                 if cls._VALID_URL is False or cls.suitable(embed_url):
3809                     yield embed_url
3810
    class StopExtraction(Exception):
        # Exception type exposed as an attribute of the extractor class.
        # NOTE(review): nothing in the surrounding methods raises or catches it;
        # confirm the intended semantics against its users before relying on it.
        pass
3813
3814     @classmethod
3815     def _extract_url(cls, webpage):  # TODO: Remove
3816         """Only for compatibility with some older extractors"""
3817         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3818
3819     @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """Register `cls` as a plugin override when it is declared with `plugin_name`.

        Without a truthy `plugin_name`, this only defers to the parent
        implementation with the remaining keyword arguments.
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class right after cls in its MRO is the one being overridden
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the __wrapped__ chain down to a class that does not wrap another one
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind that class's name in its defining module to the new override
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)
3832
3833
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Translate the prefix of the search URL into a number of results and fetch them.

        An empty prefix means one result, 'all' means _MAX_RESULTS, and a
        number is used as given but capped (with a warning) at _MAX_RESULTS.
        """
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            # Not reachable with the _VALID_URL defined above, whose numeric prefix starts at 1
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice needs None, not infinity, to mean "no upper limit"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3877
3878
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose _real_extract always raises UnsupportedError for the given URL."""

    # The pattern '.*' places no restriction on the URL
    _VALID_URL = '.*'
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        raise UnsupportedError(url)
3886
3887
# Maps an overridden extractor class to the list of plugin classes that
# __init_subclass__ registered for it (classes declared with plugin_name)
_PLUGIN_OVERRIDES = collections.defaultdict(list)