import base64
import collections
import getpass
import hashlib
import http.client
import http.cookiejar
import http.cookies
import inspect
import itertools
import json
import math
import netrc
import os
import random
import re
import sys
import time
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..utils import (
    IDENTITY,
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    GeoUtils,
    LenientJSONDecoder,
    RegexNotFoundError,
    RetryManager,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    deprecation_warning,
    determine_ext,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    network_exceptions,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    sanitized_Request,
    smuggle_url,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    truncate_string,
    try_call,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present, the client
                                 should use it directly. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args  Extra arguments for ffmpeg downloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

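    As a minimal, purely illustrative sketch (all values below are
    hypothetical), a single-video result could look like:

        {
            '_type': 'video',
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video.mp4',
                'ext': 'mp4',
                'format_id': 'mp4-720p',
                'width': 1280,
                'height': 720,
            }],
        }
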
    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}, as illustrated in the sketch after
                    this list. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                          Set to "root" to indicate that this is a
                          comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                          favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                          the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

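    As an illustrative sketch of the subtitles structure referenced above
    (the language tag, URL and name here are hypothetical):

        'subtitles': {
            'en': [{
                'ext': 'vtt',
                'url': 'https://example.com/subs/en.vtt',
                'name': 'English',
            }],
        }
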
    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.
    composer:       Composer of the piece

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"

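    A hypothetical sketch of a playlist result (ids, titles and URLs below
    are made up):

        {
            '_type': 'playlist',
            'id': 'channel-123',
            'title': 'Uploads from Example',
            'entries': [
                {'_type': 'url', 'url': 'https://example.com/v/1'},
                {'_type': 'url', 'url': 'https://example.com/v/2'},
            ],
        }
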

    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.

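    For instance, a sketch of a "url" entry (the URL is hypothetical):

        {
            '_type': 'url',
            'url': 'https://example.com/watch?v=abc123',
            'ie_key': 'Generic',
        }
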

    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this should also be added to the list of extractors and
    should define a _VALID_URL regexp and re-define the _real_extract() and
    (optionally) _real_initialize() methods.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

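    As a sketch of such a subclass (the site, URL pattern and login flow are
    entirely hypothetical):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/v/(?P<id>\d+)'
            _NETRC_MACHINE = 'example'

            def _perform_login(self, username, password):
                # Hypothetical login endpoint, for illustration only
                self._download_webpage(
                    'https://example.com/login', None, 'Logging in',
                    data=urllib.parse.urlencode(
                        {'user': username, 'pass': password}).encode())

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }
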
    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    the geo restriction bypass mechanism right away in order to bypass
    geo restriction, provided the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
    _ENABLED = True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    _VALID_URL = None
    _EMBED_REGEX = []

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            None: '',
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

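        For instance, a manual invocation from extractor code could look like
        this (the country codes and IP block here are just an example):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })
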
614 """
615 if not self._x_forwarded_for_ip:
616
617 # Geo bypass mechanism is explicitly disabled by user
618 if not self.get_param('geo_bypass', True):
619 return
620
621 if not geo_bypass_context:
622 geo_bypass_context = {}
623
624 # Backward compatibility: previously _initialize_geo_bypass
625 # expected a list of countries, some 3rd party code may still use
626 # it this way
627 if isinstance(geo_bypass_context, (list, tuple)):
628 geo_bypass_context = {
629 'countries': geo_bypass_context,
630 }
631
632 # The whole point of geo bypass mechanism is to fake IP
633 # as X-Forwarded-For HTTP header based on some IP block or
634 # country code.
635
636 # Path 1: bypassing based on IP block in CIDR notation
637
638 # Explicit IP block specified by user, use it right away
639 # regardless of whether extractor is geo bypassable or not
640 ip_block = self.get_param('geo_bypass_ip_block', None)
641
642 # Otherwise use random IP block from geo bypass context but only
643 # if extractor is known as geo bypassable
644 if not ip_block:
645 ip_blocks = geo_bypass_context.get('ip_blocks')
646 if self._GEO_BYPASS and ip_blocks:
647 ip_block = random.choice(ip_blocks)
648
649 if ip_block:
650 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
651 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
652 return
653
654 # Path 2: bypassing based on country code
655
656 # Explicit country code specified by user, use it right away
657 # regardless of whether extractor is geo bypassable or not
658 country = self.get_param('geo_bypass_country', None)
659
660 # Otherwise use random country code from geo bypass context but
661 # only if extractor is known as geo bypassable
662 if not country:
663 countries = geo_bypass_context.get('countries')
664 if self._GEO_BYPASS and countries:
665 country = random.choice(countries)
666
667 if country:
668 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
669 self._downloader.write_debug(
670 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
671
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
            raise
        except http.client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""
        pass

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""
        pass

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @classproperty
    def IE_NAME(cls):
        return cls.__name__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, urllib.error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            return update_Request(url_or_request, data=data, headers=headers, query=query)
        if query:
            url_or_request = update_url_query(url_or_request, query)
        return sanitized_Request(url_or_request, data, headers or {})

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                         data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, urllib.error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
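
        For instance, a sketch of accepting a 404 response as well (the URL
        here is hypothetical):

            page, urlh = self._download_webpage_handle(
                'https://example.com/maybe-missing', video_id,
                expected_status=404)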
862 """
863
864 # Strip hashes from the URL (#1038)
865 if isinstance(url_or_request, str):
866 url_or_request = url_or_request.partition('#')[0]
867
868 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
869 if urlh is False:
870 assert not fatal
871 return False
872 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
873 return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
                              prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            filename = self._request_dump_filename(urlh.geturl(), video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def __print_error(self, errnote, fatal, video_id, err):
        if fatal:
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
        elif errnote:
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
        try:
            return json.loads(
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

    def __create_download_methods(name, parser, note, errnote, return_value):

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
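
    # As a sketch of how these generated helpers are typically used (the URL
    # and variable names below are hypothetical): the plain variants return
    # the parsed content, while the *_handle variants also return the URL
    # handle, e.g.
    #   data = self._download_json('https://example.com/api/video', video_id)
    #   doc, urlh = self._download_xml_handle(manifest_url, video_id)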

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        """

        R''' # NB: These are unused; should they be deprecated?
        if tries != 1:
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            timeout = 5
        else:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
        '''

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except http.client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=NO_DEFAULT):
        if partial is not NO_DEFAULT:
            self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        msg += format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value, report a warning or raise a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if string is None:
            mobj = None
        elif isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
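
    # A sketch of typical usage (the pattern and field name are hypothetical):
    #   title = self._search_regex(
    #       r'<h1 class="title">([^<]+)</h1>', webpage, 'title', fatal=False)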

    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        if default is NO_DEFAULT:
            default, has_default = {}, False
        else:
            fatal, has_default = False, True

        json_string = self._search_regex(
            rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
        if not json_string:
            return default

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        try:
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
            if fatal:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
            return default

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if isinstance(res, tuple):
            return tuple(map(clean_html, res))
        return clean_html(res)

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available, look in the netrc file using the netrc_machine or
        _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for SMS/phone verification;
        currently this just uses the command-line option.
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return getpass.getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
                       % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]
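
    # For illustration, with prop='title' these regexes would match meta tags
    # like the following (a sketch; the content value is hypothetical):
    #   <meta property="og:title" content="Some video title">
    #   <meta content='Some video title' property='og:title'>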

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, *, fatal=False, **kargs):
        return self._og_search_property('title', html, fatal=fatal, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
        return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    @staticmethod
    def _rta_search(html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18

        # And then there are the jokers who advertise that they use RTA, but actually don't.
        AGE_LIMIT_MARKERS = [
            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
            r'>[^<]*you acknowledge you are at least (\d+) years old',
            r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
        ]

        age_limit = 0
        for marker in AGE_LIMIT_MARKERS:
            mobj = re.search(marker, html)
            if mobj:
                age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
        return age_limit
1408
1409 def _media_rating_search(self, html):
1410 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1411 rating = self._html_search_meta('rating', html)
1412
1413 if not rating:
1414 return None
1415
1416 RATING_TABLE = {
1417 'safe for kids': 0,
1418 'general': 8,
1419 '14 years': 14,
1420 'mature': 17,
1421 'restricted': 19,
1422 }
1423 return RATING_TABLE.get(rating.lower())
1424
1425 def _family_friendly_search(self, html):
1426 # See http://schema.org/VideoObject
1427 family_friendly = self._html_search_meta(
1428 'isFamilyFriendly', html, default=None)
1429
1430 if not family_friendly:
1431 return None
1432
1433 RATING_TABLE = {
1434 '1': 0,
1435 'true': 0,
1436 '0': 18,
1437 'false': 18,
1438 }
1439 return RATING_TABLE.get(family_friendly.lower())
1440
1441 def _twitter_search_player(self, html):
1442 return self._html_search_meta('twitter:player', html,
1443 'twitter card player')
1444
1445 def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1446 """Yield all json ld objects in the html"""
1447 if default is not NO_DEFAULT:
1448 fatal = False
1449 for mobj in re.finditer(JSON_LD_RE, html):
1450 json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1451 for json_ld in variadic(json_ld_item):
1452 if isinstance(json_ld, dict):
1453 yield json_ld
1454
1455 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1456 """Search for a video in any json ld in the html"""
1457 if default is not NO_DEFAULT:
1458 fatal = False
1459 info = self._json_ld(
1460 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1461 video_id, fatal=fatal, expected_type=expected_type)
1462 if info:
1463 return info
1464 if default is not NO_DEFAULT:
1465 return default
1466 elif fatal:
1467 raise RegexNotFoundError('Unable to extract JSON-LD')
1468 else:
1469 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1470 return {}
1471
1472 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1473 if isinstance(json_ld, str):
1474 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1475 if not json_ld:
1476 return {}
1477 info = {}
1478
1479 INTERACTION_TYPE_MAP = {
1480 'CommentAction': 'comment',
1481 'AgreeAction': 'like',
1482 'DisagreeAction': 'dislike',
1483 'LikeAction': 'like',
1484 'DislikeAction': 'dislike',
1485 'ListenAction': 'view',
1486 'WatchAction': 'view',
1487 'ViewAction': 'view',
1488 }
1489
1490 def is_type(e, *expected_types):
1491 type = variadic(traverse_obj(e, '@type'))
1492 return any(x in type for x in expected_types)
1493
1494 def extract_interaction_type(e):
1495 interaction_type = e.get('interactionType')
1496 if isinstance(interaction_type, dict):
1497 interaction_type = interaction_type.get('@type')
1498 return str_or_none(interaction_type)
1499
1500 def extract_interaction_statistic(e):
1501 interaction_statistic = e.get('interactionStatistic')
1502 if isinstance(interaction_statistic, dict):
1503 interaction_statistic = [interaction_statistic]
1504 if not isinstance(interaction_statistic, list):
1505 return
1506 for is_e in interaction_statistic:
1507 if not is_type(is_e, 'InteractionCounter'):
1508 continue
1509 interaction_type = extract_interaction_type(is_e)
1510 if not interaction_type:
1511 continue
1512 # For the interaction count some sites provide a string with
1513 # non-digit characters (e.g. ",") instead of an integer (as per
1514 # spec), so extract the count with the more relaxed str_to_int
1515 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1516 if interaction_count is None:
1517 continue
1518 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1519 if not count_kind:
1520 continue
1521 count_key = '%s_count' % count_kind
1522 if info.get(count_key) is not None:
1523 continue
1524 info[count_key] = interaction_count
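# A quick sketch of the relaxation above (str_to_int is imported at the
# top of this module):
#
#   >>> str_to_int('1,234,567')
#   1234567
#
# so a hypothetical InteractionCounter with interactionType
# '.../WatchAction' and userInteractionCount '1,234,567' ends up as
# info['view_count'] == 1234567.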
1525
1526 def extract_chapter_information(e):
1527 chapters = [{
1528 'title': part.get('name'),
1529 'start_time': part.get('startOffset'),
1530 'end_time': part.get('endOffset'),
1531 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1532 for idx, (last_c, current_c, next_c) in enumerate(zip(
1533 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1534 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1535 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1536 if None in current_c.values():
1537 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1538 return
1539 if chapters:
1540 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1541 info['chapters'] = chapters
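# A sketch of the back-filling above, assuming hypothetical Clip parts
# named 'Intro'/'Main'/'Outro' with startOffsets 0/60/120, no endOffsets
# and info['duration'] == 180: each end_time is filled from the next
# chapter's start_time and the final chapter is closed with the duration,
# yielding [0-60], [60-120] and [120-180].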
1542
1543 def extract_video_object(e):
1544 author = e.get('author')
1545 info.update({
1546 'url': url_or_none(e.get('contentUrl')),
1547 'ext': mimetype2ext(e.get('encodingFormat')),
1548 'title': unescapeHTML(e.get('name')),
1549 'description': unescapeHTML(e.get('description')),
1550 'thumbnails': [{'url': unescapeHTML(url)}
1551 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1552 if url_or_none(url)],
1553 'duration': parse_duration(e.get('duration')),
1554 'timestamp': unified_timestamp(e.get('uploadDate')),
1555 # author can be an instance of the 'Organization' or 'Person' types;
1556 # both can have a 'name' property (inherited from the 'Thing' type). [1]
1557 # However, some websites use the 'Text' type instead.
1558 # 1. https://schema.org/VideoObject
1559 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1560 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1561 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1562 'tbr': int_or_none(e.get('bitrate')),
1563 'width': int_or_none(e.get('width')),
1564 'height': int_or_none(e.get('height')),
1565 'view_count': int_or_none(e.get('interactionCount')),
1566 'tags': try_call(lambda: e.get('keywords').split(',')),
1567 })
1568 if is_type(e, 'AudioObject'):
1569 info.update({
1570 'vcodec': 'none',
1571 'abr': int_or_none(e.get('bitrate')),
1572 })
1573 extract_interaction_statistic(e)
1574 extract_chapter_information(e)
1575
1576 def traverse_json_ld(json_ld, at_top_level=True):
1577 for e in variadic(json_ld):
1578 if not isinstance(e, dict):
1579 continue
1580 if at_top_level and '@context' not in e:
1581 continue
1582 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1583 traverse_json_ld(e['@graph'], at_top_level=False)
1584 continue
1585 if expected_type is not None and not is_type(e, expected_type):
1586 continue
1587 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1588 if rating is not None:
1589 info['average_rating'] = rating
1590 if is_type(e, 'TVEpisode', 'Episode'):
1591 episode_name = unescapeHTML(e.get('name'))
1592 info.update({
1593 'episode': episode_name,
1594 'episode_number': int_or_none(e.get('episodeNumber')),
1595 'description': unescapeHTML(e.get('description')),
1596 })
1597 if not info.get('title') and episode_name:
1598 info['title'] = episode_name
1599 part_of_season = e.get('partOfSeason')
1600 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1601 info.update({
1602 'season': unescapeHTML(part_of_season.get('name')),
1603 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1604 })
1605 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1606 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1607 info['series'] = unescapeHTML(part_of_series.get('name'))
1608 elif is_type(e, 'Movie'):
1609 info.update({
1610 'title': unescapeHTML(e.get('name')),
1611 'description': unescapeHTML(e.get('description')),
1612 'duration': parse_duration(e.get('duration')),
1613 'timestamp': unified_timestamp(e.get('dateCreated')),
1614 })
1615 elif is_type(e, 'Article', 'NewsArticle'):
1616 info.update({
1617 'timestamp': parse_iso8601(e.get('datePublished')),
1618 'title': unescapeHTML(e.get('headline')),
1619 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1620 })
1621 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1622 extract_video_object(e['video'][0])
1623 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1624 extract_video_object(e['subjectOf'][0])
1625 elif is_type(e, 'VideoObject', 'AudioObject'):
1626 extract_video_object(e)
1627 if expected_type is None:
1628 continue
1629 else:
1630 break
1631 video = e.get('video')
1632 if is_type(video, 'VideoObject'):
1633 extract_video_object(video)
1634 if expected_type is None:
1635 continue
1636 else:
1637 break
1638
1639 traverse_json_ld(json_ld)
1640 return filter_dict(info)
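# A rough, illustrative example (hypothetical JSON-LD; not the output of
# any real site) of what the traversal above extracts:
#
#   {"@context": "https://schema.org", "@type": "VideoObject",
#    "name": "Some title", "duration": "PT1M30S",
#    "uploadDate": "2020-01-01T00:00:00Z",
#    "interactionStatistic": {"@type": "InteractionCounter",
#        "interactionType": "https://schema.org/WatchAction",
#        "userInteractionCount": "1,000"}}
#
# yields roughly:
#
#   {'title': 'Some title', 'duration': 90.0, 'timestamp': 1577836800,
#    'view_count': 1000}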
1641
1642 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1643 return self._parse_json(
1644 self._search_regex(
1645 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1646 webpage, 'next.js data', fatal=fatal, **kw),
1647 video_id, transform_source=transform_source, fatal=fatal)
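# The script tag this targets typically looks like (hypothetical payload):
#   <script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{...}}}</script>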
1648
1649 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1650 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1651 rectx = re.escape(context_name)
1652 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1653 js, arg_keys, arg_vals = self._search_regex(
1654 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1655 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1656 default=NO_DEFAULT if fatal else (None, None, None))
1657 if js is None:
1658 return {}
1659
1660 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1661
1662 for key, val in args.items():
1663 if val in ('undefined', 'void 0'):
1664 args[key] = 'null'
1665
1666 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1667 return traverse_obj(ret, traverse) or {}
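# A sketch of the Nuxt pattern this unwraps (hypothetical page):
#
#   <script>window.__NUXT__=(function(a,b){return {data:[{id:a,title:b}]};}(1,"x"))</script>
#
# arg_keys 'a,b' and arg_vals '1,"x"' become vars {'a': '1', 'b': '"x"'},
# js_to_json substitutes them into the returned object literal, and the
# default traverse ('data', 0) yields {'id': 1, 'title': 'x'}.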
1668
1669 @staticmethod
1670 def _hidden_inputs(html):
1671 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1672 hidden_inputs = {}
1673 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1674 attrs = extract_attributes(input)
1675 if not attrs:
1676 continue
1677 if attrs.get('type') not in ('hidden', 'submit'):
1678 continue
1679 name = attrs.get('name') or attrs.get('id')
1680 value = attrs.get('value')
1681 if name and value is not None:
1682 hidden_inputs[name] = value
1683 return hidden_inputs
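# For instance (hypothetical form field):
#
#   >>> InfoExtractor._hidden_inputs('<input type="hidden" name="csrf" value="abc123">')
#   {'csrf': 'abc123'}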
1684
1685 def _form_hidden_inputs(self, form_id, html):
1686 form = self._search_regex(
1687 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1688 html, '%s form' % form_id, group='form')
1689 return self._hidden_inputs(form)
1690
1691 @classproperty(cache=True)
1692 def FormatSort(cls):
1693 class FormatSort(FormatSorter):
1694 def __init__(ie, *args, **kwargs):
1695 super().__init__(ie._downloader, *args, **kwargs)
1696
1697 deprecation_warning(
1698 'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1699 'Use yt_dlp.utils.FormatSorter instead')
1700 return FormatSort
1701
1702 def _sort_formats(self, formats, field_preference=[]):
1703 if not field_preference:
1704 self._downloader.deprecation_warning(
1705 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1706 return
1707 self._downloader.deprecation_warning(
1708 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1709 'Return _format_sort_fields in the info_dict instead')
1710 if formats:
1711 formats[0]['__sort_fields'] = field_preference
1712
1713 def _check_formats(self, formats, video_id):
1714 if formats:
1715 formats[:] = filter(
1716 lambda f: self._is_valid_url(
1717 f['url'], video_id,
1718 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1719 formats)
1720
1721 @staticmethod
1722 def _remove_duplicate_formats(formats):
1723 format_urls = set()
1724 unique_formats = []
1725 for f in formats:
1726 if f['url'] not in format_urls:
1727 format_urls.add(f['url'])
1728 unique_formats.append(f)
1729 formats[:] = unique_formats
1730
1731 def _is_valid_url(self, url, video_id, item='video', headers={}):
1732 url = self._proto_relative_url(url, scheme='http:')
1733 # For now assume non HTTP(S) URLs always valid
1734 if not (url.startswith('http://') or url.startswith('https://')):
1735 return True
1736 try:
1737 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1738 return True
1739 except ExtractorError as e:
1740 self.to_screen(
1741 '%s: %s URL is invalid, skipping: %s'
1742 % (video_id, item, error_to_compat_str(e.cause)))
1743 return False
1744
1745 def http_scheme(self):
1746 """ Either "http:" or "https:", depending on the user's preferences """
1747 return (
1748 'http:'
1749 if self.get_param('prefer_insecure', False)
1750 else 'https:')
1751
1752 def _proto_relative_url(self, url, scheme=None):
1753 scheme = scheme or self.http_scheme()
1754 assert scheme.endswith(':')
1755 return sanitize_url(url, scheme=scheme[:-1])
1756
1757 def _sleep(self, timeout, video_id, msg_template=None):
1758 if msg_template is None:
1759 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1760 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1761 self.to_screen(msg)
1762 time.sleep(timeout)
1763
1764 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1765 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1766 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1767 if self.get_param('ignore_no_formats_error'):
1768 fatal = False
1769
1770 res = self._download_xml_handle(
1771 manifest_url, video_id, 'Downloading f4m manifest',
1772 'Unable to download f4m manifest',
1773 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1774 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1775 transform_source=transform_source,
1776 fatal=fatal, data=data, headers=headers, query=query)
1777 if res is False:
1778 return []
1779
1780 manifest, urlh = res
1781 manifest_url = urlh.geturl()
1782
1783 return self._parse_f4m_formats(
1784 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1785 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1786
1787 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1788 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1789 fatal=True, m3u8_id=None):
1790 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1791 return []
1792
1793 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1794 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1795 if akamai_pv is not None and ';' in akamai_pv.text:
1796 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1797 if playerVerificationChallenge.strip() != '':
1798 return []
1799
1800 formats = []
1801 manifest_version = '1.0'
1802 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1803 if not media_nodes:
1804 manifest_version = '2.0'
1805 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1806 # Remove unsupported DRM protected media from final formats
1807 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1808 media_nodes = remove_encrypted_media(media_nodes)
1809 if not media_nodes:
1810 return formats
1811
1812 manifest_base_url = get_base_url(manifest)
1813
1814 bootstrap_info = xpath_element(
1815 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1816 'bootstrap info', default=None)
1817
1818 vcodec = None
1819 mime_type = xpath_text(
1820 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1821 'base URL', default=None)
1822 if mime_type and mime_type.startswith('audio/'):
1823 vcodec = 'none'
1824
1825 for i, media_el in enumerate(media_nodes):
1826 tbr = int_or_none(media_el.attrib.get('bitrate'))
1827 width = int_or_none(media_el.attrib.get('width'))
1828 height = int_or_none(media_el.attrib.get('height'))
1829 format_id = join_nonempty(f4m_id, tbr or i)
1830 # If <bootstrapInfo> is present, the specified f4m is a
1831 # stream-level manifest, and only set-level manifests may refer to
1832 # external resources. See section 11.4 and section 4 of F4M spec
1833 if bootstrap_info is None:
1834 media_url = None
1835 # @href is introduced in 2.0, see section 11.6 of F4M spec
1836 if manifest_version == '2.0':
1837 media_url = media_el.attrib.get('href')
1838 if media_url is None:
1839 media_url = media_el.attrib.get('url')
1840 if not media_url:
1841 continue
1842 manifest_url = (
1843 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1844 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1845 # If media_url is itself an f4m manifest, do the recursive extraction,
1846 # since the bitrates in the parent manifest (this one) and the media_url
1847 # manifest may differ, making it impossible for the f4m downloader to
1848 # resolve the format by the requested bitrate
1849 ext = determine_ext(manifest_url)
1850 if ext == 'f4m':
1851 f4m_formats = self._extract_f4m_formats(
1852 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1853 transform_source=transform_source, fatal=fatal)
1854 # Sometimes a stream-level manifest contains a single media entry that
1855 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1856 # At the same time, the parent's media entry in the set-level manifest
1857 # may contain it. We will copy it from the parent in such cases.
1858 if len(f4m_formats) == 1:
1859 f = f4m_formats[0]
1860 f.update({
1861 'tbr': f.get('tbr') or tbr,
1862 'width': f.get('width') or width,
1863 'height': f.get('height') or height,
1864 'format_id': f.get('format_id') if not tbr else format_id,
1865 'vcodec': vcodec,
1866 })
1867 formats.extend(f4m_formats)
1868 continue
1869 elif ext == 'm3u8':
1870 formats.extend(self._extract_m3u8_formats(
1871 manifest_url, video_id, 'mp4', preference=preference,
1872 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1873 continue
1874 formats.append({
1875 'format_id': format_id,
1876 'url': manifest_url,
1877 'manifest_url': manifest_url,
1878 'ext': 'flv' if bootstrap_info is not None else None,
1879 'protocol': 'f4m',
1880 'tbr': tbr,
1881 'width': width,
1882 'height': height,
1883 'vcodec': vcodec,
1884 'preference': preference,
1885 'quality': quality,
1886 })
1887 return formats
1888
1889 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1890 return {
1891 'format_id': join_nonempty(m3u8_id, 'meta'),
1892 'url': m3u8_url,
1893 'ext': ext,
1894 'protocol': 'm3u8',
1895 'preference': preference - 100 if preference else -100,
1896 'quality': quality,
1897 'resolution': 'multiple',
1898 'format_note': 'Quality selection URL',
1899 }
1900
1901 def _report_ignoring_subs(self, name):
1902 self.report_warning(bug_reports_message(
1903 f'Ignoring subtitle tracks found in the {name} manifest; '
1904 'if any subtitle tracks are missing,'
1905 ), only_once=True)
1906
1907 def _extract_m3u8_formats(self, *args, **kwargs):
1908 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1909 if subs:
1910 self._report_ignoring_subs('HLS')
1911 return fmts
1912
1913 def _extract_m3u8_formats_and_subtitles(
1914 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1915 preference=None, quality=None, m3u8_id=None, note=None,
1916 errnote=None, fatal=True, live=False, data=None, headers={},
1917 query={}):
1918
1919 if self.get_param('ignore_no_formats_error'):
1920 fatal = False
1921
1922 if not m3u8_url:
1923 if errnote is not False:
1924 errnote = errnote or 'Failed to obtain m3u8 URL'
1925 if fatal:
1926 raise ExtractorError(errnote, video_id=video_id)
1927 self.report_warning(f'{errnote}{bug_reports_message()}')
1928 return [], {}
1929
1930 res = self._download_webpage_handle(
1931 m3u8_url, video_id,
1932 note='Downloading m3u8 information' if note is None else note,
1933 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1934 fatal=fatal, data=data, headers=headers, query=query)
1935
1936 if res is False:
1937 return [], {}
1938
1939 m3u8_doc, urlh = res
1940 m3u8_url = urlh.geturl()
1941
1942 return self._parse_m3u8_formats_and_subtitles(
1943 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1944 preference=preference, quality=quality, m3u8_id=m3u8_id,
1945 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1946 headers=headers, query=query, video_id=video_id)
1947
1948 def _parse_m3u8_formats_and_subtitles(
1949 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1950 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1951 errnote=None, fatal=True, data=None, headers={}, query={},
1952 video_id=None):
1953 formats, subtitles = [], {}
1954
1955 has_drm = re.search('|'.join([
1956 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
1957 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
1958 ]), m3u8_doc)
1959
1960 def format_url(url):
1961 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1962
1963 if self.get_param('hls_split_discontinuity', False):
1964 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1965 if not m3u8_doc:
1966 if not manifest_url:
1967 return []
1968 m3u8_doc = self._download_webpage(
1969 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1970 note=False, errnote='Failed to download m3u8 playlist information')
1971 if m3u8_doc is False:
1972 return []
1973 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
1974
1975 else:
1976 def _extract_m3u8_playlist_indices(*args, **kwargs):
1977 return [None]
1978
1979 # References:
1980 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1981 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1982 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1983
1984 # We should try extracting formats only from master playlists [1, 4.3.4],
1985 # i.e. playlists that describe available qualities. On the other hand
1986 # media playlists [1, 4.3.3] should be returned as is since they contain
1987 # just the media without qualities renditions.
1988 # Fortunately, master playlist can be easily distinguished from media
1989 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1990 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1991 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1992 # media playlist and MUST NOT appear in master playlist thus we can
1993 # clearly detect media playlist with this criterion.
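# For example (hypothetical single-rendition manifests):
#   media playlist:  '#EXTM3U\n#EXT-X-TARGETDURATION:10\n#EXTINF:9.9,\nseg0.ts'
#   master playlist: '#EXTM3U\n#EXT-X-STREAM-INF:BANDWIDTH=1280000\nlow/index.m3u8'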
1994
1995 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1996 formats = [{
1997 'format_id': join_nonempty(m3u8_id, idx),
1998 'format_index': idx,
1999 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2000 'ext': ext,
2001 'protocol': entry_protocol,
2002 'preference': preference,
2003 'quality': quality,
2004 'has_drm': has_drm,
2005 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2006
2007 return formats, subtitles
2008
2009 groups = {}
2010 last_stream_inf = {}
2011
2012 def extract_media(x_media_line):
2013 media = parse_m3u8_attributes(x_media_line)
2014 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2015 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2016 if not (media_type and group_id and name):
2017 return
2018 groups.setdefault(group_id, []).append(media)
2019 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2020 if media_type == 'SUBTITLES':
2021 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2022 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2023 # However, lack of URI has been spotted in the wild.
2024 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2025 if not media.get('URI'):
2026 return
2027 url = format_url(media['URI'])
2028 sub_info = {
2029 'url': url,
2030 'ext': determine_ext(url),
2031 }
2032 if sub_info['ext'] == 'm3u8':
2033 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2034 # files may contain is WebVTT:
2035 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2036 sub_info['ext'] = 'vtt'
2037 sub_info['protocol'] = 'm3u8_native'
2038 lang = media.get('LANGUAGE') or 'und'
2039 subtitles.setdefault(lang, []).append(sub_info)
2040 if media_type not in ('VIDEO', 'AUDIO'):
2041 return
2042 media_url = media.get('URI')
2043 if media_url:
2044 manifest_url = format_url(media_url)
2045 formats.extend({
2046 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2047 'format_note': name,
2048 'format_index': idx,
2049 'url': manifest_url,
2050 'manifest_url': m3u8_url,
2051 'language': media.get('LANGUAGE'),
2052 'ext': ext,
2053 'protocol': entry_protocol,
2054 'preference': preference,
2055 'quality': quality,
2056 'vcodec': 'none' if media_type == 'AUDIO' else None,
2057 } for idx in _extract_m3u8_playlist_indices(manifest_url))
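# A sketch of what extract_media() consumes, e.g. a hypothetical subtitle
# rendition:
#   #EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English",LANGUAGE="en",URI="subs/en.m3u8"
# adds {'url': <subs/en.m3u8 resolved against m3u8_url>, 'ext': 'vtt',
# 'protocol': 'm3u8_native'} to subtitles['en'].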
2058
2059 def build_stream_name():
2060 # Although the specification does not mention a NAME attribute for
2061 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2062 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2063 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2064 stream_name = last_stream_inf.get('NAME')
2065 if stream_name:
2066 return stream_name
2067 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2068 # from the corresponding rendition group
2069 stream_group_id = last_stream_inf.get('VIDEO')
2070 if not stream_group_id:
2071 return
2072 stream_group = groups.get(stream_group_id)
2073 if not stream_group:
2074 return stream_group_id
2075 rendition = stream_group[0]
2076 return rendition.get('NAME') or stream_group_id
2077
2078 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2079 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2080 # precede EXT-X-MEDIA tags in HLS manifests such as [3].
2081 for line in m3u8_doc.splitlines():
2082 if line.startswith('#EXT-X-MEDIA:'):
2083 extract_media(line)
2084
2085 for line in m3u8_doc.splitlines():
2086 if line.startswith('#EXT-X-STREAM-INF:'):
2087 last_stream_inf = parse_m3u8_attributes(line)
2088 elif line.startswith('#') or not line.strip():
2089 continue
2090 else:
2091 tbr = float_or_none(
2092 last_stream_inf.get('AVERAGE-BANDWIDTH')
2093 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2094 manifest_url = format_url(line.strip())
2095
2096 for idx in _extract_m3u8_playlist_indices(manifest_url):
2097 format_id = [m3u8_id, None, idx]
2098 # The bandwidth of live streams may differ over time, thus making
2099 # format_id unpredictable, so it's better to keep the provided
2100 # format_id intact.
2101 if not live:
2102 stream_name = build_stream_name()
2103 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2104 f = {
2105 'format_id': join_nonempty(*format_id),
2106 'format_index': idx,
2107 'url': manifest_url,
2108 'manifest_url': m3u8_url,
2109 'tbr': tbr,
2110 'ext': ext,
2111 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2112 'protocol': entry_protocol,
2113 'preference': preference,
2114 'quality': quality,
2115 }
2116 resolution = last_stream_inf.get('RESOLUTION')
2117 if resolution:
2118 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2119 if mobj:
2120 f['width'] = int(mobj.group('width'))
2121 f['height'] = int(mobj.group('height'))
2122 # Unified Streaming Platform
2123 mobj = re.search(
2124 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2125 if mobj:
2126 abr, vbr = mobj.groups()
2127 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2128 f.update({
2129 'vbr': vbr,
2130 'abr': abr,
2131 })
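# E.g. a hypothetical USP URL containing 'audio=128000-video=2400000'
# yields abr == 128.0 and vbr == 2400.0 (kbit/s).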
2132 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2133 f.update(codecs)
2134 audio_group_id = last_stream_inf.get('AUDIO')
2135 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2136 # references a rendition group MUST have a CODECS attribute.
2137 # However, this is not always respected. E.g. [2]
2138 # contains EXT-X-STREAM-INF tag which references AUDIO
2139 # rendition group but does not have CODECS and despite
2140 # referencing an audio group it represents a complete
2141 # (with audio and video) format. So, for such cases we will
2142 # ignore references to rendition groups and treat them
2143 # as complete formats.
2144 if audio_group_id and codecs and f.get('vcodec') != 'none':
2145 audio_group = groups.get(audio_group_id)
2146 if audio_group and audio_group[0].get('URI'):
2147 # TODO: update acodec for audio only formats with
2148 # the same GROUP-ID
2149 f['acodec'] = 'none'
2150 if not f.get('ext'):
2151 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2152 formats.append(f)
2153
2154 # for DailyMotion
2155 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2156 if progressive_uri:
2157 http_f = f.copy()
2158 del http_f['manifest_url']
2159 http_f.update({
2160 'format_id': f['format_id'].replace('hls-', 'http-'),
2161 'protocol': 'http',
2162 'url': progressive_uri,
2163 })
2164 formats.append(http_f)
2165
2166 last_stream_inf = {}
2167 return formats, subtitles
2168
2169 def _extract_m3u8_vod_duration(
2170 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2171
2172 m3u8_vod = self._download_webpage(
2173 m3u8_vod_url, video_id,
2174 note='Downloading m3u8 VOD manifest' if note is None else note,
2175 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2176 fatal=False, data=data, headers=headers, query=query)
2177
2178 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2179
2180 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2181 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2182 return None
2183
2184 return int(sum(
2185 float(line[len('#EXTINF:'):].split(',')[0])
2186 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
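# A worked example: a hypothetical VOD playlist containing
# '#EXT-X-PLAYLIST-TYPE:VOD' and EXTINF durations 4.0, 4.0 and 1.5
# sums to 9.5, so this returns int(9.5) == 9 (seconds).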
2187
2188 @staticmethod
2189 def _xpath_ns(path, namespace=None):
2190 if not namespace:
2191 return path
2192 out = []
2193 for c in path.split('/'):
2194 if not c or c == '.':
2195 out.append(c)
2196 else:
2197 out.append('{%s}%s' % (namespace, c))
2198 return '/'.join(out)
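# For example:
#
#   >>> InfoExtractor._xpath_ns('./head/meta', 'http://www.w3.org/2005/SMIL21/Language')
#   './{http://www.w3.org/2005/SMIL21/Language}head/{http://www.w3.org/2005/SMIL21/Language}meta'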
2199
2200 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2201 if self.get_param('ignore_no_formats_error'):
2202 fatal = False
2203
2204 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2205 if res is False:
2206 assert not fatal
2207 return [], {}
2208
2209 smil, urlh = res
2210 smil_url = urlh.geturl()
2211
2212 namespace = self._parse_smil_namespace(smil)
2213
2214 fmts = self._parse_smil_formats(
2215 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2216 subs = self._parse_smil_subtitles(
2217 smil, namespace=namespace)
2218
2219 return fmts, subs
2220
2221 def _extract_smil_formats(self, *args, **kwargs):
2222 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2223 if subs:
2224 self._report_ignoring_subs('SMIL')
2225 return fmts
2226
2227 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2228 res = self._download_smil(smil_url, video_id, fatal=fatal)
2229 if res is False:
2230 return {}
2231
2232 smil, urlh = res
2233 smil_url = urlh.geturl()
2234
2235 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2236
2237 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2238 return self._download_xml_handle(
2239 smil_url, video_id, 'Downloading SMIL file',
2240 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2241
2242 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2243 namespace = self._parse_smil_namespace(smil)
2244
2245 formats = self._parse_smil_formats(
2246 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2247 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2248
2249 video_id = os.path.splitext(url_basename(smil_url))[0]
2250 title = None
2251 description = None
2252 upload_date = None
2253 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2254 name = meta.attrib.get('name')
2255 content = meta.attrib.get('content')
2256 if not name or not content:
2257 continue
2258 if not title and name == 'title':
2259 title = content
2260 elif not description and name in ('description', 'abstract'):
2261 description = content
2262 elif not upload_date and name == 'date':
2263 upload_date = unified_strdate(content)
2264
2265 thumbnails = [{
2266 'id': image.get('type'),
2267 'url': image.get('src'),
2268 'width': int_or_none(image.get('width')),
2269 'height': int_or_none(image.get('height')),
2270 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2271
2272 return {
2273 'id': video_id,
2274 'title': title or video_id,
2275 'description': description,
2276 'upload_date': upload_date,
2277 'thumbnails': thumbnails,
2278 'formats': formats,
2279 'subtitles': subtitles,
2280 }
2281
2282 def _parse_smil_namespace(self, smil):
2283 return self._search_regex(
2284 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2285
2286 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2287 base = smil_url
2288 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2289 b = meta.get('base') or meta.get('httpBase')
2290 if b:
2291 base = b
2292 break
2293
2294 formats = []
2295 rtmp_count = 0
2296 http_count = 0
2297 m3u8_count = 0
2298 imgs_count = 0
2299
2300 srcs = set()
2301 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2302 for medium in media:
2303 src = medium.get('src')
2304 if not src or src in srcs:
2305 continue
2306 srcs.add(src)
2307
2308 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2309 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2310 width = int_or_none(medium.get('width'))
2311 height = int_or_none(medium.get('height'))
2312 proto = medium.get('proto')
2313 ext = medium.get('ext')
2314 src_ext = determine_ext(src)
2315 streamer = medium.get('streamer') or base
2316
2317 if proto == 'rtmp' or streamer.startswith('rtmp'):
2318 rtmp_count += 1
2319 formats.append({
2320 'url': streamer,
2321 'play_path': src,
2322 'ext': 'flv',
2323 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2324 'tbr': bitrate,
2325 'filesize': filesize,
2326 'width': width,
2327 'height': height,
2328 })
2329 if transform_rtmp_url:
2330 streamer, src = transform_rtmp_url(streamer, src)
2331 formats[-1].update({
2332 'url': streamer,
2333 'play_path': src,
2334 })
2335 continue
2336
2337 src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2338 src_url = src_url.strip()
2339
2340 if proto == 'm3u8' or src_ext == 'm3u8':
2341 m3u8_formats = self._extract_m3u8_formats(
2342 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2343 if len(m3u8_formats) == 1:
2344 m3u8_count += 1
2345 m3u8_formats[0].update({
2346 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2347 'tbr': bitrate,
2348 'width': width,
2349 'height': height,
2350 })
2351 formats.extend(m3u8_formats)
2352 elif src_ext == 'f4m':
2353 f4m_url = src_url
2354 if not f4m_params:
2355 f4m_params = {
2356 'hdcore': '3.2.0',
2357 'plugin': 'flowplayer-3.2.0.1',
2358 }
2359 f4m_url += '&' if '?' in f4m_url else '?'
2360 f4m_url += urllib.parse.urlencode(f4m_params)
2361 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2362 elif src_ext == 'mpd':
2363 formats.extend(self._extract_mpd_formats(
2364 src_url, video_id, mpd_id='dash', fatal=False))
2365 elif re.search(r'\.ism/[Mm]anifest', src_url):
2366 formats.extend(self._extract_ism_formats(
2367 src_url, video_id, ism_id='mss', fatal=False))
2368 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2369 http_count += 1
2370 formats.append({
2371 'url': src_url,
2372 'ext': ext or src_ext or 'flv',
2373 'format_id': 'http-%d' % (bitrate or http_count),
2374 'tbr': bitrate,
2375 'filesize': filesize,
2376 'width': width,
2377 'height': height,
2378 })
2379
2380 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2381 src = medium.get('src')
2382 if not src or src in srcs:
2383 continue
2384 srcs.add(src)
2385
2386 imgs_count += 1
2387 formats.append({
2388 'format_id': 'imagestream-%d' % (imgs_count),
2389 'url': src,
2390 'ext': mimetype2ext(medium.get('type')),
2391 'acodec': 'none',
2392 'vcodec': 'none',
2393 'width': int_or_none(medium.get('width')),
2394 'height': int_or_none(medium.get('height')),
2395 'format_note': 'SMIL storyboards',
2396 })
2397
2398 return formats
2399
2400 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2401 urls = []
2402 subtitles = {}
2403 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2404 src = textstream.get('src')
2405 if not src or src in urls:
2406 continue
2407 urls.append(src)
2408 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2409 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2410 subtitles.setdefault(lang, []).append({
2411 'url': src,
2412 'ext': ext,
2413 })
2414 return subtitles
2415
2416 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2417 res = self._download_xml_handle(
2418 xspf_url, playlist_id, 'Downloading xspf playlist',
2419 'Unable to download xspf manifest', fatal=fatal)
2420 if res is False:
2421 return []
2422
2423 xspf, urlh = res
2424 xspf_url = urlh.geturl()
2425
2426 return self._parse_xspf(
2427 xspf, playlist_id, xspf_url=xspf_url,
2428 xspf_base_url=base_url(xspf_url))
2429
2430 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2431 NS_MAP = {
2432 'xspf': 'http://xspf.org/ns/0/',
2433 's1': 'http://static.streamone.nl/player/ns/0',
2434 }
2435
2436 entries = []
2437 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2438 title = xpath_text(
2439 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2440 description = xpath_text(
2441 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2442 thumbnail = xpath_text(
2443 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2444 duration = float_or_none(
2445 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2446
2447 formats = []
2448 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2449 format_url = urljoin(xspf_base_url, location.text)
2450 if not format_url:
2451 continue
2452 formats.append({
2453 'url': format_url,
2454 'manifest_url': xspf_url,
2455 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2456 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2457 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2458 })
2459
2460 entries.append({
2461 'id': playlist_id,
2462 'title': title,
2463 'description': description,
2464 'thumbnail': thumbnail,
2465 'duration': duration,
2466 'formats': formats,
2467 })
2468 return entries
2469
2470 def _extract_mpd_formats(self, *args, **kwargs):
2471 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2472 if subs:
2473 self._report_ignoring_subs('DASH')
2474 return fmts
2475
2476 def _extract_mpd_formats_and_subtitles(
2477 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2478 fatal=True, data=None, headers={}, query={}):
2479
2480 if self.get_param('ignore_no_formats_error'):
2481 fatal = False
2482
2483 res = self._download_xml_handle(
2484 mpd_url, video_id,
2485 note='Downloading MPD manifest' if note is None else note,
2486 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2487 fatal=fatal, data=data, headers=headers, query=query)
2488 if res is False:
2489 return [], {}
2490 mpd_doc, urlh = res
2491 if mpd_doc is None:
2492 return [], {}
2493
2494 # We could have been redirected to a new url when we retrieved our mpd file.
2495 mpd_url = urlh.geturl()
2496 mpd_base_url = base_url(mpd_url)
2497
2498 return self._parse_mpd_formats_and_subtitles(
2499 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2500
2501 def _parse_mpd_formats(self, *args, **kwargs):
2502 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2503 if subs:
2504 self._report_ignoring_subs('DASH')
2505 return fmts
2506
2507 def _parse_mpd_formats_and_subtitles(
2508 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2509 """
2510 Parse formats from MPD manifest.
2511 References:
2512 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2513 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2514 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2515 """
2516 if not self.get_param('dynamic_mpd', True):
2517 if mpd_doc.get('type') == 'dynamic':
2518 return [], {}
2519
2520 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2521
2522 def _add_ns(path):
2523 return self._xpath_ns(path, namespace)
2524
2525 def is_drm_protected(element):
2526 return element.find(_add_ns('ContentProtection')) is not None
2527
2528 def extract_multisegment_info(element, ms_parent_info):
2529 ms_info = ms_parent_info.copy()
2530
2531 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2532 # common attributes and elements. We will only extract those
2533 # relevant for us.
2534 def extract_common(source):
2535 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2536 if segment_timeline is not None:
2537 s_e = segment_timeline.findall(_add_ns('S'))
2538 if s_e:
2539 ms_info['total_number'] = 0
2540 ms_info['s'] = []
2541 for s in s_e:
2542 r = int(s.get('r', 0))
2543 ms_info['total_number'] += 1 + r
2544 ms_info['s'].append({
2545 't': int(s.get('t', 0)),
2546 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2547 'd': int(s.attrib['d']),
2548 'r': r,
2549 })
2550 start_number = source.get('startNumber')
2551 if start_number:
2552 ms_info['start_number'] = int(start_number)
2553 timescale = source.get('timescale')
2554 if timescale:
2555 ms_info['timescale'] = int(timescale)
2556 segment_duration = source.get('duration')
2557 if segment_duration:
2558 ms_info['segment_duration'] = float(segment_duration)
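# E.g. a hypothetical <SegmentTimeline> holding <S t="0" d="90000" r="2"/>
# with timescale 90000 describes three one-second segments ('r' counts
# extra repeats), so total_number becomes 3.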
2559
2560 def extract_Initialization(source):
2561 initialization = source.find(_add_ns('Initialization'))
2562 if initialization is not None:
2563 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2564
2565 segment_list = element.find(_add_ns('SegmentList'))
2566 if segment_list is not None:
2567 extract_common(segment_list)
2568 extract_Initialization(segment_list)
2569 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2570 if segment_urls_e:
2571 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2572 else:
2573 segment_template = element.find(_add_ns('SegmentTemplate'))
2574 if segment_template is not None:
2575 extract_common(segment_template)
2576 media = segment_template.get('media')
2577 if media:
2578 ms_info['media'] = media
2579 initialization = segment_template.get('initialization')
2580 if initialization:
2581 ms_info['initialization'] = initialization
2582 else:
2583 extract_Initialization(segment_template)
2584 return ms_info
2585
2586 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2587 formats, subtitles = [], {}
2588 stream_numbers = collections.defaultdict(int)
2589 for period in mpd_doc.findall(_add_ns('Period')):
2590 period_duration = parse_duration(period.get('duration')) or mpd_duration
2591 period_ms_info = extract_multisegment_info(period, {
2592 'start_number': 1,
2593 'timescale': 1,
2594 })
2595 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2596 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2597 for representation in adaptation_set.findall(_add_ns('Representation')):
2598 representation_attrib = adaptation_set.attrib.copy()
2599 representation_attrib.update(representation.attrib)
2600 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2601 mime_type = representation_attrib['mimeType']
2602 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2603
2604 codec_str = representation_attrib.get('codecs', '')
2605 # Some kind of binary subtitle found in some youtube livestreams
2606 if mime_type == 'application/x-rawcc':
2607 codecs = {'scodec': codec_str}
2608 else:
2609 codecs = parse_codecs(codec_str)
2610 if content_type not in ('video', 'audio', 'text'):
2611 if mime_type == 'image/jpeg':
2612 content_type = mime_type
2613 elif codecs.get('vcodec', 'none') != 'none':
2614 content_type = 'video'
2615 elif codecs.get('acodec', 'none') != 'none':
2616 content_type = 'audio'
2617 elif codecs.get('scodec', 'none') != 'none':
2618 content_type = 'text'
2619 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2620 content_type = 'text'
2621 else:
2622 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2623 continue
2624
2625 base_url = ''
2626 for element in (representation, adaptation_set, period, mpd_doc):
2627 base_url_e = element.find(_add_ns('BaseURL'))
2628 if try_call(lambda: base_url_e.text) is not None:
2629 base_url = base_url_e.text + base_url
2630 if re.match(r'^https?://', base_url):
2631 break
2632 if mpd_base_url and base_url.startswith('/'):
2633 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2634 elif mpd_base_url and not re.match(r'^https?://', base_url):
2635 if not mpd_base_url.endswith('/'):
2636 mpd_base_url += '/'
2637 base_url = mpd_base_url + base_url
2638 representation_id = representation_attrib.get('id')
2639 lang = representation_attrib.get('lang')
2640 url_el = representation.find(_add_ns('BaseURL'))
2641 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2642 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2643 if representation_id is not None:
2644 format_id = representation_id
2645 else:
2646 format_id = content_type
2647 if mpd_id:
2648 format_id = mpd_id + '-' + format_id
2649 if content_type in ('video', 'audio'):
2650 f = {
2651 'format_id': format_id,
2652 'manifest_url': mpd_url,
2653 'ext': mimetype2ext(mime_type),
2654 'width': int_or_none(representation_attrib.get('width')),
2655 'height': int_or_none(representation_attrib.get('height')),
2656 'tbr': float_or_none(bandwidth, 1000),
2657 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2658 'fps': int_or_none(representation_attrib.get('frameRate')),
2659 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2660 'format_note': 'DASH %s' % content_type,
2661 'filesize': filesize,
2662 'container': mimetype2ext(mime_type) + '_dash',
2663 **codecs
2664 }
2665 elif content_type == 'text':
2666 f = {
2667 'ext': mimetype2ext(mime_type),
2668 'manifest_url': mpd_url,
2669 'filesize': filesize,
2670 }
2671 elif content_type == 'image/jpeg':
2672 # See test case in VikiIE
2673 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2674 f = {
2675 'format_id': format_id,
2676 'ext': 'mhtml',
2677 'manifest_url': mpd_url,
2678 'format_note': 'DASH storyboards (jpeg)',
2679 'acodec': 'none',
2680 'vcodec': 'none',
2681 }
2682 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2683 f['has_drm'] = True
2684 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2685
2686 def prepare_template(template_name, identifiers):
2687 tmpl = representation_ms_info[template_name]
2688 if representation_id is not None:
2689 tmpl = tmpl.replace('$RepresentationID$', representation_id)
2690 # First off, % characters outside $...$ templates
2691 # must be escaped by doubling for proper processing
2692 # by the % operator string formatting used further on (see
2693 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2694 t = ''
2695 in_template = False
2696 for c in tmpl:
2697 t += c
2698 if c == '$':
2699 in_template = not in_template
2700 elif c == '%' and not in_template:
2701 t += c
2702 # Next, $...$ templates are translated to their
2703 # %(...) counterparts to be used with % operator
2704 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2705 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2706 t = t.replace('$$', '$')  # '$$' escapes a literal '$' (see [1, 5.3.9.4.4, Table 16])
2707 return t
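# E.g. a hypothetical @media template 'seg-$Number%05d$.m4s' becomes
# 'seg-%(Number)05d.m4s', so ('seg-%(Number)05d.m4s' % {'Number': 3})
# gives 'seg-00003.m4s'; a plain '$Number$' becomes '%(Number)d'.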
2708
2709 # @initialization is a regular template like the @media one,
2710 # so it should be handled just the same way (see
2711 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2712 if 'initialization' in representation_ms_info:
2713 initialization_template = prepare_template(
2714 'initialization',
2715 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2716 # $Time$ shall not be included for @initialization thus
2717 # only $Bandwidth$ remains
2718 ('Bandwidth', ))
2719 representation_ms_info['initialization_url'] = initialization_template % {
2720 'Bandwidth': bandwidth,
2721 }
2722
2723 def location_key(location):
2724 return 'url' if re.match(r'^https?://', location) else 'path'
2725
2726 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2727
2728 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2729 media_location_key = location_key(media_template)
2730
2731 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2732 # can't be used at the same time
2733 if '%(Number' in media_template and 's' not in representation_ms_info:
2734 segment_duration = None
2735 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2736 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2737 representation_ms_info['total_number'] = int(math.ceil(
2738 float_or_none(period_duration, segment_duration, default=0)))
2739 representation_ms_info['fragments'] = [{
2740 media_location_key: media_template % {
2741 'Number': segment_number,
2742 'Bandwidth': bandwidth,
2743 },
2744 'duration': segment_duration,
2745 } for segment_number in range(
2746 representation_ms_info['start_number'],
2747 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2748 else:
2749 # $Number*$ or $Time$ in media template with S list available
2750 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2751 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2752 representation_ms_info['fragments'] = []
2753 segment_time = 0
2754 segment_d = None
2755 segment_number = representation_ms_info['start_number']
2756
2757 def add_segment_url():
2758 segment_url = media_template % {
2759 'Time': segment_time,
2760 'Bandwidth': bandwidth,
2761 'Number': segment_number,
2762 }
2763 representation_ms_info['fragments'].append({
2764 media_location_key: segment_url,
2765 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2766 })
2767
2768 for num, s in enumerate(representation_ms_info['s']):
2769 segment_time = s.get('t') or segment_time
2770 segment_d = s['d']
2771 add_segment_url()
2772 segment_number += 1
2773 for r in range(s.get('r', 0)):
2774 segment_time += segment_d
2775 add_segment_url()
2776 segment_number += 1
2777 segment_time += segment_d
2778 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2779 # No media template,
2780 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2781 # or any YouTube dashsegments video
2782 fragments = []
2783 segment_index = 0
2784 timescale = representation_ms_info['timescale']
2785 for s in representation_ms_info['s']:
2786 duration = float_or_none(s['d'], timescale)
2787 for r in range(s.get('r', 0) + 1):
2788 segment_uri = representation_ms_info['segment_urls'][segment_index]
2789 fragments.append({
2790 location_key(segment_uri): segment_uri,
2791 'duration': duration,
2792 })
2793 segment_index += 1
2794 representation_ms_info['fragments'] = fragments
2795 elif 'segment_urls' in representation_ms_info:
2796 # Segment URLs with no SegmentTimeline
2797 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2798 # https://github.com/ytdl-org/youtube-dl/pull/14844
2799 fragments = []
2800 segment_duration = float_or_none(
2801 representation_ms_info['segment_duration'],
2802 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2803 for segment_url in representation_ms_info['segment_urls']:
2804 fragment = {
2805 location_key(segment_url): segment_url,
2806 }
2807 if segment_duration:
2808 fragment['duration'] = segment_duration
2809 fragments.append(fragment)
2810 representation_ms_info['fragments'] = fragments
2811 # If there is a fragments key available, then we correctly recognized fragmented media.
2812 # Otherwise we will assume unfragmented media with direct access. Technically, this
2813 # assumption is not necessarily correct since we may simply have no support for
2814 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2815 if 'fragments' in representation_ms_info:
2816 f.update({
2817 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2818 'url': mpd_url or base_url,
2819 'fragment_base_url': base_url,
2820 'fragments': [],
2821 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2822 })
2823 if 'initialization_url' in representation_ms_info:
2824 initialization_url = representation_ms_info['initialization_url']
2825 if not f.get('url'):
2826 f['url'] = initialization_url
2827 f['fragments'].append({location_key(initialization_url): initialization_url})
2828 f['fragments'].extend(representation_ms_info['fragments'])
2829 if not period_duration:
2830 period_duration = try_get(
2831 representation_ms_info,
2832 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2833 else:
2834 # Assuming direct URL to unfragmented media.
2835 f['url'] = base_url
2836 if content_type in ('video', 'audio', 'image/jpeg'):
2837 f['manifest_stream_number'] = stream_numbers[f['url']]
2838 stream_numbers[f['url']] += 1
2839 formats.append(f)
2840 elif content_type == 'text':
2841 subtitles.setdefault(lang or 'und', []).append(f)
2842
2843 return formats, subtitles
2844
2845 def _extract_ism_formats(self, *args, **kwargs):
2846 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2847 if subs:
2848 self._report_ignoring_subs('ISM')
2849 return fmts
2850
2851 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2852 if self.get_param('ignore_no_formats_error'):
2853 fatal = False
2854
2855 res = self._download_xml_handle(
2856 ism_url, video_id,
2857 note='Downloading ISM manifest' if note is None else note,
2858 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2859 fatal=fatal, data=data, headers=headers, query=query)
2860 if res is False:
2861 return [], {}
2862 ism_doc, urlh = res
2863 if ism_doc is None:
2864 return [], {}
2865
2866 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2867
2868 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2869 """
2870 Parse formats from ISM manifest.
2871 References:
2872 1. [MS-SSTR]: Smooth Streaming Protocol,
2873 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2874 """
2875 if ism_doc.get('IsLive') == 'TRUE':
2876 return [], {}
2877
2878 duration = int(ism_doc.attrib['Duration'])
2879 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2880
2881 formats = []
2882 subtitles = {}
2883 for stream in ism_doc.findall('StreamIndex'):
2884 stream_type = stream.get('Type')
2885 if stream_type not in ('video', 'audio', 'text'):
2886 continue
2887 url_pattern = stream.attrib['Url']
2888 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2889 stream_name = stream.get('Name')
2890 stream_language = stream.get('Language', 'und')
2891 for track in stream.findall('QualityLevel'):
2892 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2893 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2894 # TODO: add support for WVC1 and WMAP
2895 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2896 self.report_warning(f'{fourcc} is not a supported codec')
2897 continue
2898 tbr = int(track.attrib['Bitrate']) // 1000
2899 # [1] does not mention Width and Height attributes. However,
2900 # they're often present while MaxWidth and MaxHeight are
2901 # missing, so they should be used as fallbacks
2902 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2903 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2904 sampling_rate = int_or_none(track.get('SamplingRate'))
2905
2906 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2907 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2908
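# Sketch of the timing expansion below (numbers illustrative): a chunk such as
# <c t="0" d="20000000" r="2"/> with stream_timescale=10000000 expands to two
# 2.0-second fragments; when 'd' is absent, the duration is inferred from the
# next chunk's 't' (or from the total duration for the last chunk).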
2909 fragments = []
2910 fragment_ctx = {
2911 'time': 0,
2912 }
2913 stream_fragments = stream.findall('c')
2914 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2915 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2916 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2917 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2918 if not fragment_ctx['duration']:
2919 try:
2920 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2921 except IndexError:
2922 next_fragment_time = duration
2923 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2924 for _ in range(fragment_repeat):
2925 fragments.append({
2926 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2927 'duration': fragment_ctx['duration'] / stream_timescale,
2928 })
2929 fragment_ctx['time'] += fragment_ctx['duration']
2930
2931 if stream_type == 'text':
2932 subtitles.setdefault(stream_language, []).append({
2933 'ext': 'ismt',
2934 'protocol': 'ism',
2935 'url': ism_url,
2936 'manifest_url': ism_url,
2937 'fragments': fragments,
2938 '_download_params': {
2939 'stream_type': stream_type,
2940 'duration': duration,
2941 'timescale': stream_timescale,
2942 'fourcc': fourcc,
2943 'language': stream_language,
2944 'codec_private_data': track.get('CodecPrivateData'),
2945 }
2946 })
2947 elif stream_type in ('video', 'audio'):
2948 formats.append({
2949 'format_id': join_nonempty(ism_id, stream_name, tbr),
2950 'url': ism_url,
2951 'manifest_url': ism_url,
2952 'ext': 'ismv' if stream_type == 'video' else 'isma',
2953 'width': width,
2954 'height': height,
2955 'tbr': tbr,
2956 'asr': sampling_rate,
2957 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2958 'acodec': 'none' if stream_type == 'video' else fourcc,
2959 'protocol': 'ism',
2960 'fragments': fragments,
2961 'has_drm': ism_doc.find('Protection') is not None,
2962 '_download_params': {
2963 'stream_type': stream_type,
2964 'duration': duration,
2965 'timescale': stream_timescale,
2966 'width': width or 0,
2967 'height': height or 0,
2968 'fourcc': fourcc,
2969 'language': stream_language,
2970 'codec_private_data': track.get('CodecPrivateData'),
2971 'sampling_rate': sampling_rate,
2972 'channels': int_or_none(track.get('Channels', 2)),
2973 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2974 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2975 },
2976 })
2977 return formats, subtitles
2978
2979 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
2980 def absolute_url(item_url):
2981 return urljoin(base_url, item_url)
2982
2983 def parse_content_type(content_type):
2984 if not content_type:
2985 return {}
2986 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2987 if ctr:
2988 mimetype, codecs = ctr.groups()
2989 f = parse_codecs(codecs)
2990 f['ext'] = mimetype2ext(mimetype)
2991 return f
2992 return {}
2993
2994 def _media_formats(src, cur_media_type, type_info=None):
2995 type_info = type_info or {}
2996 full_url = absolute_url(src)
2997 ext = type_info.get('ext') or determine_ext(full_url)
2998 if ext == 'm3u8':
2999 is_plain_url = False
3000 formats = self._extract_m3u8_formats(
3001 full_url, video_id, ext='mp4',
3002 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3003 preference=preference, quality=quality, fatal=False)
3004 elif ext == 'mpd':
3005 is_plain_url = False
3006 formats = self._extract_mpd_formats(
3007 full_url, video_id, mpd_id=mpd_id, fatal=False)
3008 else:
3009 is_plain_url = True
3010 formats = [{
3011 'url': full_url,
3012 'vcodec': 'none' if cur_media_type == 'audio' else None,
3013 'ext': ext,
3014 }]
3015 return is_plain_url, formats
3016
3017 entries = []
3018 # amp-video and amp-audio are very similar to their HTML5 counterparts
3019 # so we will include them right here (see
3020 # https://www.ampproject.org/docs/reference/components/amp-video)
3021 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3022 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3023 media_tags = [(media_tag, media_tag_name, media_type, '')
3024 for media_tag, media_tag_name, media_type
3025 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3026 media_tags.extend(re.findall(
3027 # We only allow video|audio followed by a whitespace or '>'.
3028 # Allowing more characters may cause a significant slowdown (see
3029 # https://github.com/ytdl-org/youtube-dl/issues/11979,
3030 # e.g. http://www.porntrex.com/maps/videositemap.xml).
3031 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3032 for media_tag, _, media_type, media_content in media_tags:
3033 media_info = {
3034 'formats': [],
3035 'subtitles': {},
3036 }
3037 media_attributes = extract_attributes(media_tag)
3038 src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3039 if src:
3040 f = parse_content_type(media_attributes.get('type'))
3041 _, formats = _media_formats(src, media_type, f)
3042 media_info['formats'].extend(formats)
3043 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3044 if media_content:
3045 for source_tag in re.findall(r'<source[^>]+>', media_content):
3046 s_attr = extract_attributes(source_tag)
3047 # data-video-src and data-src are non-standard but seen
3048 # several times in the wild
3049 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3050 if not src:
3051 continue
3052 f = parse_content_type(s_attr.get('type'))
3053 is_plain_url, formats = _media_formats(src, media_type, f)
3054 if is_plain_url:
3055 # width, height, res, label and title attributes are
3056 # all non-standard but seen several times in the wild
3057 labels = [
3058 s_attr.get(lbl)
3059 for lbl in ('label', 'title')
3060 if str_or_none(s_attr.get(lbl))
3061 ]
3062 width = int_or_none(s_attr.get('width'))
3063 height = (int_or_none(s_attr.get('height'))
3064 or int_or_none(s_attr.get('res')))
3065 if not width or not height:
3066 for lbl in labels:
3067 resolution = parse_resolution(lbl)
3068 if not resolution:
3069 continue
3070 width = width or resolution.get('width')
3071 height = height or resolution.get('height')
3072 for lbl in labels:
3073 tbr = parse_bitrate(lbl)
3074 if tbr:
3075 break
3076 else:
3077 tbr = None
3078 f.update({
3079 'width': width,
3080 'height': height,
3081 'tbr': tbr,
3082 'format_id': s_attr.get('label') or s_attr.get('title'),
3083 })
3084 f.update(formats[0])
3085 media_info['formats'].append(f)
3086 else:
3087 media_info['formats'].extend(formats)
3088 for track_tag in re.findall(r'<track[^>]+>', media_content):
3089 track_attributes = extract_attributes(track_tag)
3090 kind = track_attributes.get('kind')
3091 if not kind or kind in ('subtitles', 'captions'):
3092 src = strip_or_none(track_attributes.get('src'))
3093 if not src:
3094 continue
3095 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3096 media_info['subtitles'].setdefault(lang, []).append({
3097 'url': absolute_url(src),
3098 })
3099 for f in media_info['formats']:
3100 f.setdefault('http_headers', {})['Referer'] = base_url
3101 if media_info['formats'] or media_info['subtitles']:
3102 entries.append(media_info)
3103 return entries
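# Hypothetical usage sketch (markup illustrative): for a page containing
# <video poster="/thumb.jpg"><source src="/clip.mp4" label="720p"></video>,
#   entries = self._parse_html5_media_entries(url, webpage, video_id)
# yields [{'formats': [...], 'subtitles': {}, 'thumbnail': '...'}].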
3104
3105 def _extract_akamai_formats(self, *args, **kwargs):
3106 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3107 if subs:
3108 self._report_ignoring_subs('akamai')
3109 return fmts
3110
3111 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3112 signed = 'hdnea=' in manifest_url
3113 if not signed:
3114 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3115 manifest_url = re.sub(
3116 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3117 '', manifest_url).strip('?')
3118
3119 formats = []
3120 subtitles = {}
3121
3122 hdcore_sign = 'hdcore=3.7.0'
3123 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3124 hds_host = hosts.get('hds')
3125 if hds_host:
3126 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3127 if 'hdcore=' not in f4m_url:
3128 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3129 f4m_formats = self._extract_f4m_formats(
3130 f4m_url, video_id, f4m_id='hds', fatal=False)
3131 for entry in f4m_formats:
3132 entry.update({'extra_param_to_segment_url': hdcore_sign})
3133 formats.extend(f4m_formats)
3134
3135 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3136 hls_host = hosts.get('hls')
3137 if hls_host:
3138 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3139 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3140 m3u8_url, video_id, 'mp4', 'm3u8_native',
3141 m3u8_id='hls', fatal=False)
3142 formats.extend(m3u8_formats)
3143 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3144
3145 http_host = hosts.get('http')
3146 if http_host and m3u8_formats and not signed:
3147 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3148 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3149 qualities_length = len(qualities)
3150 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3151 i = 0
3152 for f in m3u8_formats:
3153 if f['vcodec'] != 'none':
3154 for protocol in ('http', 'https'):
3155 http_f = f.copy()
3156 del http_f['manifest_url']
3157 http_url = re.sub(
3158 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3159 http_f.update({
3160 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3161 'url': http_url,
3162 'protocol': protocol,
3163 })
3164 formats.append(http_f)
3165 i += 1
3166
3167 return formats, subtitles
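# Hypothetical usage sketch -- the host names and path are illustrative only:
#   formats, subtitles = self._extract_akamai_formats_and_subtitles(
#       'https://example-vh.akamaihd.net/i/clips/video_,400,800,.mp4.csmil/master.m3u8',
#       video_id, hosts={'http': 'cdn.example.com', 'hls': 'hls.example.com'})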
3168
3169 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3170 query = urllib.parse.urlparse(url).query
3171 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3172 mobj = re.search(
3173 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3174 url_base = mobj.group('url')
3175 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3176 formats = []
3177
3178 def manifest_url(manifest):
3179 m_url = f'{http_base_url}/{manifest}'
3180 if query:
3181 m_url += '?%s' % query
3182 return m_url
3183
3184 if 'm3u8' not in skip_protocols:
3185 formats.extend(self._extract_m3u8_formats(
3186 manifest_url('playlist.m3u8'), video_id, 'mp4',
3187 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3188 if 'f4m' not in skip_protocols:
3189 formats.extend(self._extract_f4m_formats(
3190 manifest_url('manifest.f4m'),
3191 video_id, f4m_id='hds', fatal=False))
3192 if 'dash' not in skip_protocols:
3193 formats.extend(self._extract_mpd_formats(
3194 manifest_url('manifest.mpd'),
3195 video_id, mpd_id='dash', fatal=False))
3196 if re.search(r'(?:/smil:|\.smil)', url_base):
3197 if 'smil' not in skip_protocols:
3198 rtmp_formats = self._extract_smil_formats(
3199 manifest_url('jwplayer.smil'),
3200 video_id, fatal=False)
3201 for rtmp_format in rtmp_formats:
3202 rtsp_format = rtmp_format.copy()
3203 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3204 del rtsp_format['play_path']
3205 del rtsp_format['ext']
3206 rtsp_format.update({
3207 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3208 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3209 'protocol': 'rtsp',
3210 })
3211 formats.extend([rtmp_format, rtsp_format])
3212 else:
3213 for protocol in ('rtmp', 'rtsp'):
3214 if protocol not in skip_protocols:
3215 formats.append({
3216 'url': f'{protocol}:{url_base}',
3217 'format_id': protocol,
3218 'protocol': protocol,
3219 })
3220 return formats
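# Hypothetical usage sketch (URL illustrative): given a Wowza endpoint,
#   formats = self._extract_wowza_formats(
#       'https://example.com/vod/mp4:clip.mp4/playlist.m3u8', video_id,
#       skip_protocols=['dash', 'rtmp', 'rtsp'])
# probes only the HLS and HDS variants of the same stream.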
3221
3222 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3223 mobj = re.search(
3224 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3225 webpage)
3226 if mobj:
3227 try:
3228 jwplayer_data = self._parse_json(mobj.group('options'),
3229 video_id=video_id,
3230 transform_source=transform_source)
3231 except ExtractorError:
3232 pass
3233 else:
3234 if isinstance(jwplayer_data, dict):
3235 return jwplayer_data
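# The regex above matches inline player setups such as (illustrative):
#   <script>jwplayer("player").setup({"playlist": [{"sources": [...]}]});</script>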
3236
3237 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3238 jwplayer_data = self._find_jwplayer_data(
3239 webpage, video_id, transform_source=js_to_json)
3240 return self._parse_jwplayer_data(
3241 jwplayer_data, video_id, *args, **kwargs)
3242
3243 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3244 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3245 entries = []
3246 if not isinstance(jwplayer_data, dict):
3247 return entries
3248
3249 playlist_items = jwplayer_data.get('playlist')
3250 # JWPlayer backward compatibility: single playlist item/flattened playlists
3251 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3252 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3253 if not isinstance(playlist_items, list):
3254 playlist_items = (playlist_items or jwplayer_data, )
3255
3256 for video_data in playlist_items:
3257 if not isinstance(video_data, dict):
3258 continue
3259 # JWPlayer backward compatibility: flattened sources
3260 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3261 if 'sources' not in video_data:
3262 video_data['sources'] = [video_data]
3263
3264 this_video_id = video_id or video_data['mediaid']
3265
3266 formats = self._parse_jwplayer_formats(
3267 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3268 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3269
3270 subtitles = {}
3271 tracks = video_data.get('tracks')
3272 if tracks and isinstance(tracks, list):
3273 for track in tracks:
3274 if not isinstance(track, dict):
3275 continue
3276 track_kind = track.get('kind')
3277 if not track_kind or not isinstance(track_kind, str):
3278 continue
3279 if track_kind.lower() not in ('captions', 'subtitles'):
3280 continue
3281 track_url = urljoin(base_url, track.get('file'))
3282 if not track_url:
3283 continue
3284 subtitles.setdefault(track.get('label') or 'en', []).append({
3285 'url': self._proto_relative_url(track_url)
3286 })
3287
3288 entry = {
3289 'id': this_video_id,
3290 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3291 'description': clean_html(video_data.get('description')),
3292 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3293 'timestamp': int_or_none(video_data.get('pubdate')),
3294 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3295 'subtitles': subtitles,
3296 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
3297 'genre': clean_html(video_data.get('genre')),
3298 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3299 'season_number': int_or_none(video_data.get('season')),
3300 'episode_number': int_or_none(video_data.get('episode')),
3301 'release_year': int_or_none(video_data.get('releasedate')),
3302 'age_limit': int_or_none(video_data.get('age_restriction')),
3303 }
3304 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3305 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3306 entry.update({
3307 '_type': 'url_transparent',
3308 'url': formats[0]['url'],
3309 })
3310 else:
3311 entry['formats'] = formats
3312 entries.append(entry)
3313 if len(entries) == 1:
3314 return entries[0]
3315 else:
3316 return self.playlist_result(entries)
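# Hedged sketch of the jwplayer_data shape consumed above (values illustrative):
#   {'playlist': [{'mediaid': 'x1', 'title': '...', 'image': '/poster.jpg',
#                  'sources': [{'file': '/video.mp4', 'label': '720p'}],
#                  'tracks': [{'kind': 'captions', 'file': '/subs.vtt', 'label': 'en'}]}]}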
3317
3318 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3319 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3320 urls = set()
3321 formats = []
3322 for source in jwplayer_sources_data:
3323 if not isinstance(source, dict):
3324 continue
3325 source_url = urljoin(
3326 base_url, self._proto_relative_url(source.get('file')))
3327 if not source_url or source_url in urls:
3328 continue
3329 urls.add(source_url)
3330 source_type = source.get('type') or ''
3331 ext = mimetype2ext(source_type) or determine_ext(source_url)
3332 if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3333 formats.extend(self._extract_m3u8_formats(
3334 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3335 m3u8_id=m3u8_id, fatal=False))
3336 elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3337 formats.extend(self._extract_mpd_formats(
3338 source_url, video_id, mpd_id=mpd_id, fatal=False))
3339 elif ext == 'smil':
3340 formats.extend(self._extract_smil_formats(
3341 source_url, video_id, fatal=False))
3342 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3343 elif source_type.startswith('audio') or ext in (
3344 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3345 formats.append({
3346 'url': source_url,
3347 'vcodec': 'none',
3348 'ext': ext,
3349 })
3350 else:
3351 format_id = str_or_none(source.get('label'))
3352 height = int_or_none(source.get('height'))
3353 if height is None and format_id:
3354 # Often no height is provided but there is a label in
3355 # a format like "1080p", "720p SD", or 1080.
3356 height = parse_resolution(format_id).get('height')
3357 a_format = {
3358 'url': source_url,
3359 'width': int_or_none(source.get('width')),
3360 'height': height,
3361 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3362 'filesize': int_or_none(source.get('filesize')),
3363 'ext': ext,
3364 'format_id': format_id
3365 }
3366 if source_url.startswith('rtmp'):
3367 a_format['ext'] = 'flv'
3368 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3369 # of jwplayer.flash.swf
3370 rtmp_url_parts = re.split(
3371 r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3372 if len(rtmp_url_parts) == 3:
3373 rtmp_url, prefix, play_path = rtmp_url_parts
3374 a_format.update({
3375 'url': rtmp_url,
3376 'play_path': prefix + play_path,
3377 })
3378 if rtmp_params:
3379 a_format.update(rtmp_params)
3380 formats.append(a_format)
3381 return formats
3382
3383 def _live_title(self, name):
3384 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3385 return name
3386
3387 def _int(self, v, name, fatal=False, **kwargs):
3388 res = int_or_none(v, **kwargs)
3389 if res is None:
3390 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3391 if fatal:
3392 raise ExtractorError(msg)
3393 else:
3394 self.report_warning(msg)
3395 return res
3396
3397 def _float(self, v, name, fatal=False, **kwargs):
3398 res = float_or_none(v, **kwargs)
3399 if res is None:
3400 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3401 if fatal:
3402 raise ExtractorError(msg)
3403 else:
3404 self.report_warning(msg)
3405 return res
3406
3407 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3408 path='/', secure=False, discard=False, rest={}, **kwargs):
3409 cookie = http.cookiejar.Cookie(
3410 0, name, value, port, port is not None, domain, True,
3411 domain.startswith('.'), path, True, secure, expire_time,
3412 discard, None, None, rest)
3413 self.cookiejar.set_cookie(cookie)
3414
3415 def _get_cookies(self, url):
3416 """ Return a http.cookies.SimpleCookie with the cookies for the url """
3417 return LenientSimpleCookie(self._downloader._calc_cookies(url))
3418
3419 def _apply_first_set_cookie_header(self, url_handle, cookie):
3420 """
3421 Apply first Set-Cookie header instead of the last. Experimental.
3422
3423 Some sites (e.g. [1-3]) may serve two cookies under the same name
3424 in the Set-Cookie header and expect the first (old) one to be set
3425 rather than the second (new) one. However, per RFC 6265, the newer
3426 cookie should be the one stored in the cookie store, which is what
3427 actually happens. We work around this issue by manually resetting
3428 the cookie to the first one.
3429 1. https://new.vk.com/
3430 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3431 3. https://learning.oreilly.com/
3432 """
3433 for header, cookies in url_handle.headers.items():
3434 if header.lower() != 'set-cookie':
3435 continue
3436 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3437 cookie_value = re.search(
3438 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3439 if cookie_value:
3440 value, domain = cookie_value.groups()
3441 self._set_cookie(domain, cookie, value)
3442 break
3443
3444 @classmethod
3445 def get_testcases(cls, include_onlymatching=False):
3446 # Do not look in super classes
3447 t = vars(cls).get('_TEST')
3448 if t:
3449 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3450 tests = [t]
3451 else:
3452 tests = vars(cls).get('_TESTS', [])
3453 for t in tests:
3454 if not include_onlymatching and t.get('only_matching', False):
3455 continue
3456 t['name'] = cls.ie_key()
3457 yield t
3458 if getattr(cls, '__wrapped__', None):
3459 yield from cls.__wrapped__.get_testcases(include_onlymatching)
3460
3461 @classmethod
3462 def get_webpage_testcases(cls):
3463 tests = vars(cls).get('_WEBPAGE_TESTS', [])
3464 for t in tests:
3465 t['name'] = cls.ie_key()
3466 yield t
3467 if getattr(cls, '__wrapped__', None):
3468 yield from cls.__wrapped__.get_webpage_testcases()
3469
3470 @classproperty(cache=True)
3471 def age_limit(cls):
3472 """Get age limit from the testcases"""
3473 return max(traverse_obj(
3474 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3475 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3476
3477 @classproperty(cache=True)
3478 def _RETURN_TYPE(cls):
3479 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3480 tests = tuple(cls.get_testcases(include_onlymatching=False))
3481 if not tests:
3482 return None
3483 elif not any(k.startswith('playlist') for test in tests for k in test):
3484 return 'video'
3485 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3486 return 'playlist'
3487 return 'any'
3488
3489 @classmethod
3490 def is_single_video(cls, url):
3491 """Returns whether the URL is of a single video, None if unknown"""
3492 assert cls.suitable(url), 'The URL must be suitable for the extractor'
3493 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3494
3495 @classmethod
3496 def is_suitable(cls, age_limit):
3497 """Test whether the extractor is generally suitable for the given age limit"""
3498 return not age_restricted(cls.age_limit, age_limit)
3499
3500 @classmethod
3501 def description(cls, *, markdown=True, search_examples=None):
3502 """Description of the extractor"""
3503 desc = ''
3504 if cls._NETRC_MACHINE:
3505 if markdown:
3506 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3507 else:
3508 desc += f' [{cls._NETRC_MACHINE}]'
3509 if cls.IE_DESC is False:
3510 desc += ' [HIDDEN]'
3511 elif cls.IE_DESC:
3512 desc += f' {cls.IE_DESC}'
3513 if cls.SEARCH_KEY:
3514 desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3515 if search_examples:
3516 _COUNTS = ('', '5', '10', 'all')
3517 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3518 if not cls.working():
3519 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3520
3521 # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3522 name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3523 return f'{name}:{desc}' if desc else name
3524
3525 def extract_subtitles(self, *args, **kwargs):
3526 if (self.get_param('writesubtitles', False)
3527 or self.get_param('listsubtitles')):
3528 return self._get_subtitles(*args, **kwargs)
3529 return {}
3530
3531 def _get_subtitles(self, *args, **kwargs):
3532 raise NotImplementedError('This method must be implemented by subclasses')
3533
3534 class CommentsDisabled(Exception):
3535 """Raise in _get_comments if comments are disabled for the video"""
3536
3537 def extract_comments(self, *args, **kwargs):
3538 if not self.get_param('getcomments'):
3539 return None
3540 generator = self._get_comments(*args, **kwargs)
3541
3542 def extractor():
3543 comments = []
3544 interrupted = True
3545 try:
3546 while True:
3547 comments.append(next(generator))
3548 except StopIteration:
3549 interrupted = False
3550 except KeyboardInterrupt:
3551 self.to_screen('Interrupted by user')
3552 except self.CommentsDisabled:
3553 return {'comments': None, 'comment_count': None}
3554 except Exception as e:
3555 if self.get_param('ignoreerrors') is not True:
3556 raise
3557 self._downloader.report_error(e)
3558 comment_count = len(comments)
3559 self.to_screen(f'Extracted {comment_count} comments')
3560 return {
3561 'comments': comments,
3562 'comment_count': None if interrupted else comment_count
3563 }
3564 return extractor
3565
3566 def _get_comments(self, *args, **kwargs):
3567 raise NotImplementedError('This method must be implemented by subclasses')
3568
3569 @staticmethod
3570 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3571 """ Merge subtitle items for one language. Items with duplicated URLs/data
3572 will be dropped. """
3573 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3574 ret = list(subtitle_list1)
3575 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3576 return ret
3577
3578 @classmethod
3579 def _merge_subtitles(cls, *dicts, target=None):
3580 """ Merge subtitle dictionaries, language by language. """
3581 if target is None:
3582 target = {}
3583 for d in dicts:
3584 for lang, subs in d.items():
3585 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3586 return target
3587
3588 def extract_automatic_captions(self, *args, **kwargs):
3589 if (self.get_param('writeautomaticsub', False)
3590 or self.get_param('listsubtitles')):
3591 return self._get_automatic_captions(*args, **kwargs)
3592 return {}
3593
3594 def _get_automatic_captions(self, *args, **kwargs):
3595 raise NotImplementedError('This method must be implemented by subclasses')
3596
3597 @functools.cached_property
3598 def _cookies_passed(self):
3599 """Whether cookies have been passed to YoutubeDL"""
3600 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3601
3602 def mark_watched(self, *args, **kwargs):
3603 if not self.get_param('mark_watched', False):
3604 return
3605 if (self.supports_login() and self._get_login_info()[0] is not None) or self._cookies_passed:
3606 self._mark_watched(*args, **kwargs)
3607
3608 def _mark_watched(self, *args, **kwargs):
3609 raise NotImplementedError('This method must be implemented by subclasses')
3610
3611 def geo_verification_headers(self):
3612 headers = {}
3613 geo_verification_proxy = self.get_param('geo_verification_proxy')
3614 if geo_verification_proxy:
3615 headers['Ytdl-request-proxy'] = geo_verification_proxy
3616 return headers
3617
3618 @staticmethod
3619 def _generic_id(url):
3620 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3621
3622 def _generic_title(self, url='', webpage='', *, default=None):
3623 return (self._og_search_title(webpage, default=None)
3624 or self._html_extract_title(webpage, default=None)
3625 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3626 or default)
3627
3628 @staticmethod
3629 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3630 all_known = all(map(
3631 lambda x: x is not None,
3632 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3633 return (
3634 'private' if is_private
3635 else 'premium_only' if needs_premium
3636 else 'subscriber_only' if needs_subscription
3637 else 'needs_auth' if needs_auth
3638 else 'unlisted' if is_unlisted
3639 else 'public' if all_known
3640 else None)
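# For instance, self._availability(False, False, False, False, False) returns
# 'public', while self._availability(needs_auth=True) returns 'needs_auth'
# even though the remaining flags are unknown (None).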
3641
3642 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3643 """
3644 @returns A list of values for the extractor argument given by "key"
3645 or "default" if no such key is present
3646 @param default The default value to return when the key is not present (default: [])
3647 @param casesense When false, the values are converted to lower case
3648 """
3649 ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3650 val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3651 if val is None:
3652 return [] if default is NO_DEFAULT else default
3653 return list(val) if casesense else [x.lower() for x in val]
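# E.g. with "--extractor-args youtube:player_client=android,web", a YouTube
# extractor calling self._configuration_arg('player_client') gets
# ['android', 'web'] (lower-cased, since casesense defaults to False).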
3654
3655 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3656 if not playlist_id or not video_id:
3657 return not video_id
3658
3659 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3660 if no_playlist is not None:
3661 return not no_playlist
3662
3663 video_id = '' if video_id is True else f' {video_id}'
3664 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3665 if self.get_param('noplaylist'):
3666 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3667 return False
3668 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3669 return True
3670
3671 def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3672 RetryManager.report_retry(
3673 err, _count or int(fatal), _retries,
3674 info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3675 sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3676
3677 def RetryManager(self, **kwargs):
3678 return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3679
3680 def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3681 display_id = traverse_obj(info_dict, 'display_id', 'id')
3682 self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3683 return self._downloader.get_info_extractor('Generic')._extract_embeds(
3684 smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3685
3686 @classmethod
3687 def extract_from_webpage(cls, ydl, url, webpage):
3688 ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3689 else ydl.get_info_extractor(cls.ie_key()))
3690 for info in ie._extract_from_webpage(url, webpage) or []:
3691 # url = None since we do not want to set (webpage/original)_url
3692 ydl.add_default_extra_info(info, ie, None)
3693 yield info
3694
3695 @classmethod
3696 def _extract_from_webpage(cls, url, webpage):
3697 for embed_url in orderedSet(
3698 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3699 yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3700
3701 @classmethod
3702 def _extract_embed_urls(cls, url, webpage):
3703 """@returns all the embed urls on the webpage"""
3704 if '_EMBED_URL_RE' not in cls.__dict__:
3705 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3706 for idx, regex in enumerate(cls._EMBED_REGEX):
3707 assert regex.count('(?P<url>') == 1, \
3708 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3709 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3710
3711 for regex in cls._EMBED_URL_RE:
3712 for mobj in regex.finditer(webpage):
3713 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3714 if cls._VALID_URL is False or cls.suitable(embed_url):
3715 yield embed_url
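# A subclass would typically declare something like (pattern illustrative):
#   _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/embed/[^"\']+)']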
3716
3717 class StopExtraction(Exception):
3718 pass
3719
3720 @classmethod
3721 def _extract_url(cls, webpage): # TODO: Remove
3722 """Only for compatibility with some older extractors"""
3723 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3724
3725 @classmethod
3726 def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3727 if plugin_name:
3728 mro = inspect.getmro(cls)
3729 super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3730 cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3731 cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3732 while getattr(super_class, '__wrapped__', None):
3733 super_class = super_class.__wrapped__
3734 setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3735 _PLUGIN_OVERRIDES[super_class].append(cls)
3736
3737 return super().__init_subclass__(**kwargs)
3738
3739
3740 class SearchInfoExtractor(InfoExtractor):
3741 """
3742 Base class for paged search queries extractors.
3743 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3744 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3745 """
3746
3747 _MAX_RESULTS = float('inf')
3748 _RETURN_TYPE = 'playlist'
3749
3750 @classproperty
3751 def _VALID_URL(cls):
3752 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3753
3754 def _real_extract(self, query):
3755 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3756 if prefix == '':
3757 return self._get_n_results(query, 1)
3758 elif prefix == 'all':
3759 return self._get_n_results(query, self._MAX_RESULTS)
3760 else:
3761 n = int(prefix)
3762 if n <= 0:
3763 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3764 elif n > self._MAX_RESULTS:
3765 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3766 n = self._MAX_RESULTS
3767 return self._get_n_results(query, n)
3768
3769 def _get_n_results(self, query, n):
3770 """Get a specified number of results for a query.
3771 Either this function or _search_results must be overridden by subclasses """
3772 return self.playlist_result(
3773 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3774 query, query)
3775
3776 def _search_results(self, query):
3777 """Returns an iterator of search results"""
3778 raise NotImplementedError('This method must be implemented by subclasses')
3779
3780 @classproperty
3781 def SEARCH_KEY(cls):
3782 return cls._SEARCH_KEY
3783
3784
3785 class UnsupportedURLIE(InfoExtractor):
3786 _VALID_URL = '.*'
3787 _ENABLED = False
3788 IE_DESC = False
3789
3790 def _real_extract(self, url):
3791 raise UnsupportedError(url)
3792
3793
3794 _PLUGIN_OVERRIDES = collections.defaultdict(list)