yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader.f4m import get_base_url, remove_encrypted_media
  27 from ..utils import (
  28     IDENTITY,
  29     JSON_LD_RE,
  30     NO_DEFAULT,
  31     ExtractorError,
  32     FormatSorter,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     LenientJSONDecoder,
  36     RegexNotFoundError,
  37     RetryManager,
  38     UnsupportedError,
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     classproperty,
  43     clean_html,
  44     deprecation_warning,
  45     determine_ext,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitize_url,
  68     sanitized_Request,
  69     smuggle_url,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     traverse_obj,
  74     truncate_string,
  75     try_call,
  76     try_get,
  77     unescapeHTML,
  78     unified_strdate,
  79     unified_timestamp,
  80     update_Request,
  81     update_url_query,
  82     url_basename,
  83     url_or_none,
  84     urljoin,
  85     variadic,
  86     xpath_element,
  87     xpath_text,
  88     xpath_with_ns,
  89 )
  90
  91
  92 class InfoExtractor:
  93     """Information Extractor class.
  94
  95     Information extractors are the classes that, given a URL, extract
  96     information about the video (or videos) the URL refers to. This
  97     information includes the real video URL, the video title, author and
  98     others. The information is stored in a dictionary which is then
  99     passed to the YoutubeDL. The YoutubeDL processes this
 100     information possibly downloading the video to the file system, among
 101     other possible outcomes.
 102
 103     The type field determines the type of the result.
 104     By far the most common value (and the default if _type is missing) is
 105     "video", which indicates a single video.
 106
 107     For a video, the dictionaries must include the following fields:
 108
 109     id:             Video identifier.
 110     title:          Video title, unescaped. Set to an empty string if video has
 111                     no title as opposed to "None" which signifies that the
 112                     extractor failed to obtain a title
 113
 114     Additionally, it must contain either a formats entry or a url one:
 115
 116     formats:        A list of dictionaries for each format available, ordered
 117                     from worst to best quality.
 118
 119                     Potential fields:
 120                     * url        The mandatory URL representing the media:
 121                                    for plain file media - HTTP URL of this file,
 122                                    for RTMP - RTMP URL,
 123                                    for HLS - URL of the M3U8 media playlist,
 124                                    for HDS - URL of the F4M manifest,
 125                                    for DASH
 126                                      - HTTP URL to plain file media (in case of
 127                                        unfragmented media)
 128                                      - URL of the MPD manifest or base URL
 129                                        representing the media if MPD manifest
 130                                        is parsed from a string (in case of
 131                                        fragmented media)
 132                                    for MSS - URL of the ISM manifest.
 133                     * manifest_url
 134                                  The URL of the manifest file in case of
 135                                  fragmented media:
 136                                    for HLS - URL of the M3U8 master playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH - URL of the MPD manifest,
 139                                    for MSS - URL of the ISM manifest.
 140                     * manifest_stream_number  (For internal use only)
 141                                  The index of the stream in the manifest file
 142                     * ext        Will be calculated from URL if missing
 143                     * format     A human-readable description of the format
 144                                  ("mp4 container with h264/opus").
  145                                  Calculated from the format_id, width, height,
 146                                  and format_note fields if missing.
 147                     * format_id  A short description of the format
 148                                  ("mp4_h264_opus" or "19").
 149                                 Technically optional, but strongly recommended.
 150                     * format_note Additional info about the format
 151                                  ("3D" or "DASH video")
 152                     * width      Width of the video, if known
 153                     * height     Height of the video, if known
 154                     * aspect_ratio  Aspect ratio of the video, if known
 155                                  Automatically calculated from width and height
 156                     * resolution Textual description of width and height
 157                                  Automatically calculated from width and height
 158                     * dynamic_range The dynamic range of the video. One of:
  159                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 160                     * tbr        Average bitrate of audio and video in KBit/s
 161                     * abr        Average audio bitrate in KBit/s
 162                     * acodec     Name of the audio codec in use
 163                     * asr        Audio sampling rate in Hertz
 164                     * audio_channels  Number of audio channels
 165                     * vbr        Average video bitrate in KBit/s
 166                     * fps        Frame rate
 167                     * vcodec     Name of the video codec in use
 168                     * container  Name of the container format
 169                     * filesize   The number of bytes, if known in advance
 170                     * filesize_approx  An estimate for the number of bytes
 171                     * player_url SWF Player URL (used for rtmpdump).
 172                     * protocol   The protocol that will be used for the actual
 173                                  download, lower-case. One of "http", "https" or
 174                                  one of the protocols defined in downloader.PROTOCOL_MAP
 175                     * fragment_base_url
 176                                  Base URL for fragments. Each fragment's path
 177                                  value (if present) will be relative to
 178                                  this URL.
 179                     * fragments  A list of fragments of a fragmented media.
 180                                  Each fragment entry must contain either an url
 181                                  or a path. If an url is present it should be
 182                                  considered by a client. Otherwise both path and
 183                                  fragment_base_url must be present. Here is
 184                                  the list of all potential fields:
 185                                  * "url" - fragment's URL
 186                                  * "path" - fragment's path relative to
 187                                             fragment_base_url
 188                                  * "duration" (optional, int or float)
 189                                  * "filesize" (optional, int)
 190                     * is_from_start  Is a live format that can be downloaded
 191                                 from the start. Boolean
 192                     * preference Order number of this format. If this field is
 193                                  present and not None, the formats get sorted
 194                                  by this field, regardless of all other values.
 195                                  -1 for default (order by other properties),
 196                                  -2 or smaller for less than default.
 197                                  < -1000 to hide the format (if there is
 198                                     another one which is strictly better)
 199                     * language   Language code, e.g. "de" or "en-US".
 200                     * language_preference  Is this in the language mentioned in
 201                                  the URL?
 202                                  10 if it's what the URL is about,
 203                                  -1 for default (don't know),
 204                                  -10 otherwise, other values reserved for now.
 205                     * quality    Order number of the video quality of this
 206                                  format, irrespective of the file format.
 207                                  -1 for default (order by other properties),
 208                                  -2 or smaller for less than default.
 209                     * source_preference  Order number for this video source
 210                                   (quality takes higher priority)
 211                                  -1 for default (order by other properties),
 212                                  -2 or smaller for less than default.
 213                     * http_headers  A dictionary of additional HTTP headers
 214                                  to add to the request.
 215                     * stretched_ratio  If given and not 1, indicates that the
 216                                  video's pixels are not square.
 217                                  width : height ratio as float.
 218                     * no_resume  The server does not support resuming the
 219                                  (HTTP or RTMP) download. Boolean.
 220                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 221                     * downloader_options  A dictionary of downloader options
 222                                  (For internal use only)
 223                                  * http_chunk_size Chunk size for HTTP downloads
 224                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 225                     RTMP formats can also have the additional fields: page_url,
 226                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 227                     rtmp_protocol, rtmp_real_time
 228
 229     url:            Final video URL.
 230     ext:            Video filename extension.
 231     format:         The video format, defaults to ext (used for --get-format)
 232     player_url:     SWF Player URL (used for rtmpdump).
 233
 234     The following fields are optional:
 235
 236     direct:         True if a direct video file was given (must only be set by GenericIE)
 237     alt_title:      A secondary title of the video.
 238     display_id      An alternative identifier for the video, not necessarily
 239                     unique, but available before title. Typically, id is
 240                     something like "4234987", title "Dancing naked mole rats",
 241                     and display_id "dancing-naked-mole-rats"
 242     thumbnails:     A list of dictionaries, with the following entries:
 243                         * "id" (optional, string) - Thumbnail format ID
 244                         * "url"
 245                         * "preference" (optional, int) - quality of the image
 246                         * "width" (optional, int)
 247                         * "height" (optional, int)
 248                         * "resolution" (optional, string "{width}x{height}",
 249                                         deprecated)
 250                         * "filesize" (optional, int)
 251                         * "http_headers" (dict) - HTTP headers for the request
 252     thumbnail:      Full URL to a video thumbnail image.
 253     description:    Full video description.
 254     uploader:       Full name of the video uploader.
 255     license:        License name the video is licensed under.
 256     creator:        The creator of the video.
 257     timestamp:      UNIX timestamp of the moment the video was uploaded
 258     upload_date:    Video upload date in UTC (YYYYMMDD).
 259                     If not explicitly set, calculated from timestamp
 260     release_timestamp: UNIX timestamp of the moment the video was released.
 261                     If it is not clear whether to use timestamp or this, use the former
 262     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 263                     If not explicitly set, calculated from release_timestamp
 264     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 265     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 266                     If not explicitly set, calculated from modified_timestamp
 267     uploader_id:    Nickname or id of the video uploader.
 268     uploader_url:   Full URL to a personal webpage of the video uploader.
 269     channel:        Full name of the channel the video is uploaded on.
 270                     Note that channel fields may or may not repeat uploader
 271                     fields. This depends on a particular extractor.
 272     channel_id:     Id of the channel.
 273     channel_url:    Full URL to a channel webpage.
 274     channel_follower_count: Number of followers of the channel.
 275     location:       Physical location where the video was filmed.
 276     subtitles:      The available subtitles as a dictionary in the format
 277                     {tag: subformats}. "tag" is usually a language code, and
 278                     "subformats" is a list sorted from lower to higher
 279                     preference, each element is a dictionary with the "ext"
 280                     entry and one of:
 281                         * "data": The subtitles file contents
 282                         * "url": A URL pointing to the subtitles file
 283                     It can optionally also have:
 284                         * "name": Name or description of the subtitles
 285                         * "http_headers": A dictionary of additional HTTP headers
 286                                   to add to the request.
 287                     "ext" will be calculated from URL if missing
 288     automatic_captions: Like 'subtitles'; contains automatically generated
 289                     captions instead of normal subtitles
 290     duration:       Length of the video in seconds, as an integer or float.
 291     view_count:     How many users have watched the video on the platform.
 292     concurrent_view_count: How many users are currently watching the video on the platform.
 293     like_count:     Number of positive ratings of the video
 294     dislike_count:  Number of negative ratings of the video
 295     repost_count:   Number of reposts of the video
  296     average_rating: Average rating given by users, the scale used depends on the webpage
 297     comment_count:  Number of comments on the video
 298     comments:       A list of comments, each with one or more of the following
 299                     properties (all but one of text or html optional):
 300                         * "author" - human-readable name of the comment author
 301                         * "author_id" - user ID of the comment author
 302                         * "author_thumbnail" - The thumbnail of the comment author
 303                         * "id" - Comment ID
 304                         * "html" - Comment as HTML
 305                         * "text" - Plain text of the comment
 306                         * "timestamp" - UNIX timestamp of comment
 307                         * "parent" - ID of the comment this one is replying to.
 308                                      Set to "root" to indicate that this is a
 309                                      comment to the original video.
 310                         * "like_count" - Number of positive ratings of the comment
 311                         * "dislike_count" - Number of negative ratings of the comment
 312                         * "is_favorited" - Whether the comment is marked as
 313                                            favorite by the video uploader
 314                         * "author_is_uploader" - Whether the comment is made by
 315                                                  the video uploader
 316     age_limit:      Age restriction for the video, as an integer (years)
 317     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 318                     should allow to get the same result again. (It will be set
 319                     by YoutubeDL if it's missing)
 320     categories:     A list of categories that the video falls in, for example
 321                     ["Sports", "Berlin"]
 322     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 323     cast:           A list of the video cast
 324     is_live:        True, False, or None (=unknown). Whether this video is a
 325                     live stream that goes on instead of a fixed-length video.
 326     was_live:       True, False, or None (=unknown). Whether this video was
 327                     originally a live stream.
 328     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 329                     or 'post_live' (was live, but VOD is not yet processed)
 330                     If absent, automatically set from is_live, was_live
 331     start_time:     Time in seconds where the reproduction should start, as
 332                     specified in the URL.
 333     end_time:       Time in seconds where the reproduction should end, as
 334                     specified in the URL.
 335     chapters:       A list of dictionaries, with the following entries:
 336                         * "start_time" - The start time of the chapter in seconds
 337                         * "end_time" - The end time of the chapter in seconds
 338                         * "title" (optional, string)
 339     playable_in_embed: Whether this video is allowed to play in embedded
 340                     players on other sites. Can be True (=always allowed),
 341                     False (=never allowed), None (=unknown), or a string
 342                     specifying the criteria for embedability; e.g. 'whitelist'
 343     availability:   Under what condition the video is available. One of
 344                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 345                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 346                     to set it
 347     _old_archive_ids: A list of old archive ids needed for backward compatibility
 348     _format_sort_fields: A list of fields to use for sorting formats
 349     __post_extractor: A function to be called just before the metadata is
 350                     written to either disk, logger or console. The function
 351                     must return a dict which will be added to the info_dict.
  352                     This is useful for additional information that is
 353                     time-consuming to extract. Note that the fields thus
 354                     extracted will not be available to output template and
 355                     match_filter. So, only "comments" and "comment_count" are
 356                     currently allowed to be extracted via this method.
 357
 358     The following fields should only be used when the video belongs to some logical
 359     chapter or section:
 360
 361     chapter:        Name or title of the chapter the video belongs to.
 362     chapter_number: Number of the chapter the video belongs to, as an integer.
 363     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 364
 365     The following fields should only be used when the video is an episode of some
 366     series, programme or podcast:
 367
 368     series:         Title of the series or programme the video episode belongs to.
 369     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 370     season:         Title of the season the video episode belongs to.
 371     season_number:  Number of the season the video episode belongs to, as an integer.
 372     season_id:      Id of the season the video episode belongs to, as a unicode string.
 373     episode:        Title of the video episode. Unlike mandatory video title field,
 374                     this field should denote the exact title of the video episode
 375                     without any kind of decoration.
 376     episode_number: Number of the video episode within a season, as an integer.
 377     episode_id:     Id of the video episode, as a unicode string.
 378
 379     The following fields should only be used when the media is a track or a part of
 380     a music album:
 381
 382     track:          Title of the track.
 383     track_number:   Number of the track within an album or a disc, as an integer.
 384     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 385                     as a unicode string.
 386     artist:         Artist(s) of the track.
 387     genre:          Genre(s) of the track.
 388     album:          Title of the album the track belongs to.
 389     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 390     album_artist:   List of all artists appeared on the album (e.g.
 391                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 392                     and compilations).
 393     disc_number:    Number of the disc or other physical medium the track belongs to,
 394                     as an integer.
 395     release_year:   Year (YYYY) when the album was released.
 396     composer:       Composer of the piece
 397
 398     The following fields should only be set for clips that should be cut from the original video:
 399
 400     section_start:  Start time of the section in seconds
 401     section_end:    End time of the section in seconds
 402
 403     The following fields should only be set for storyboards:
 404     rows:           Number of rows in each storyboard fragment, as an integer
 405     columns:        Number of columns in each storyboard fragment, as an integer
 406
 407     Unless mentioned otherwise, the fields should be Unicode strings.
 408
 409     Unless mentioned otherwise, None is equivalent to absence of information.
 410
 411
 412     _type "playlist" indicates multiple videos.
 413     There must be a key "entries", which is a list, an iterable, or a PagedList
 414     object, each element of which is a valid dictionary by this specification.
 415
 416     Additionally, playlists can have "id", "title", and any other relevant
 417     attributes with the same semantics as videos (see above).
 418
 419     It can also have the following optional fields:
 420
 421     playlist_count: The total number of videos in a playlist. If not given,
 422                     YoutubeDL tries to calculate it from "entries"
 423
 424
 425     _type "multi_video" indicates that there are multiple videos that
  426     form a single show, for example, multiple acts of an opera or TV episode.
 427     It must have an entries key like a playlist and contain all the keys
 428     required for a video at the same time.
 429
 430
 431     _type "url" indicates that the video must be extracted from another
 432     location, possibly by a different extractor. Its only required key is:
 433     "url" - the next URL to extract.
 434     The key "ie_key" can be set to the class name (minus the trailing "IE",
 435     e.g. "Youtube") if the extractor class is known in advance.
 436     Additionally, the dictionary may have any properties of the resolved entity
 437     known in advance, for example "title" if the title of the referred video is
 438     known ahead of time.
 439
 440
 441     _type "url_transparent" entities have the same specification as "url", but
 442     indicate that the given additional information is more precise than the one
 443     associated with the resolved URL.
 444     This is useful when a site employs a video service that hosts the video and
 445     its technical metadata, but that video service does not embed a useful
 446     title, description etc.
 447
 448
 449     Subclasses of this should also be added to the list of extractors and
 450     should define a _VALID_URL regexp and, re-define the _real_extract() and
 451     (optionally) _real_initialize() methods.
 452
 453     Subclasses may also override suitable() if necessary, but ensure the function
 454     signature is preserved and that this function imports everything it needs
 455     (except other extractors), so that lazy_extractors works correctly.
 456
 457     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 458     the HTML of Generic webpages. It may also override _extract_embed_urls
 459     or _extract_from_webpage as necessary. While these are normally classmethods,
 460     _extract_from_webpage is allowed to be an instance method.
 461
 462     _extract_from_webpage may raise self.StopExtraction() to stop further
 463     processing of the webpage and obtain exclusive rights to it. This is useful
 464     when the extractor cannot reliably be matched using just the URL,
 465     e.g. invidious/peertube instances
 466
 467     Embed-only extractors can be defined by setting _VALID_URL = False.
 468
 469     To support username + password (or netrc) login, the extractor must define a
 470     _NETRC_MACHINE and re-define _perform_login(username, password) and
 471     (optionally) _initialize_pre_login() methods. The _perform_login method will
 472     be called between _initialize_pre_login and _real_initialize if credentials
 473     are passed by the user. In cases where it is necessary to have the login
 474     process as part of the extraction rather than initialization, _perform_login
 475     can be left undefined.
 476
 477     _GEO_BYPASS attribute may be set to False in order to disable
 478     geo restriction bypass mechanisms for a particular extractor.
 479     Though it won't disable explicit geo restriction bypass based on
 480     country code provided with geo_bypass_country.
 481
 482     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 483     countries for this extractor. One of these countries will be used by
 484     geo restriction bypass mechanism right away in order to bypass
 485     geo restriction, of course, if the mechanism is not disabled.
 486
 487     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 488     IP blocks in CIDR notation for this extractor. One of these IP blocks
 489     will be used by geo restriction bypass mechanism similarly
 490     to _GEO_COUNTRIES.
 491
 492     The _ENABLED attribute should be set to False for IEs that
 493     are disabled by default and must be explicitly enabled.
 494
 495     The _WORKING attribute should be set to False for broken IEs
 496     in order to warn the users and skip the tests.
 497     """
 498
 499     _ready = False
 500     _downloader = None
 501     _x_forwarded_for_ip = None
 502     _GEO_BYPASS = True
 503     _GEO_COUNTRIES = None
 504     _GEO_IP_BLOCKS = None
 505     _WORKING = True
 506     _ENABLED = True
 507     _NETRC_MACHINE = None
 508     IE_DESC = None
 509     SEARCH_KEY = None
 510     _VALID_URL = None
 511     _EMBED_REGEX = []
 512
 513     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 514         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 515         return {
 516             None: '',
 517             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 518             'password': f'Use {password_hint}',
 519             'cookies': (
 520                 'Use --cookies-from-browser or --cookies for the authentication. '
 521                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 522         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 523
 524     def __init__(self, downloader=None):
 525         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 526         If a downloader is not passed during initialization,
 527         it must be set using "set_downloader()" before "extract()" is called"""
 528         self._ready = False
 529         self._x_forwarded_for_ip = None
 530         self._printed_messages = set()
 531         self.set_downloader(downloader)
 532
 533     @classmethod
 534     def _match_valid_url(cls, url):
 535         if cls._VALID_URL is False:
 536             return None
 537         # This does not use has/getattr intentionally - we want to know whether
 538         # we have cached the regexp for *this* class, whereas getattr would also
 539         # match the superclass
 540         if '_VALID_URL_RE' not in cls.__dict__:
 541             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 542         return cls._VALID_URL_RE.match(url)
 543
 544     @classmethod
 545     def suitable(cls, url):
 546         """Receives a URL and returns True if suitable for this IE."""
 547         # This function must import everything it needs (except other extractors),
 548         # so that lazy_extractors works correctly
 549         return cls._match_valid_url(url) is not None
 550
 551     @classmethod
 552     def _match_id(cls, url):
 553         return cls._match_valid_url(url).group('id')
 554
 555     @classmethod
 556     def get_temp_id(cls, url):
 557         try:
 558             return cls._match_id(url)
 559         except (IndexError, AttributeError):
 560             return None
 561
 562     @classmethod
 563     def working(cls):
 564         """Getter method for _WORKING."""
 565         return cls._WORKING
 566
 567     @classmethod
 568     def supports_login(cls):
 569         return bool(cls._NETRC_MACHINE)
 570
 571     def initialize(self):
 572         """Initializes an instance (authentication, etc)."""
 573         self._printed_messages = set()
 574         self._initialize_geo_bypass({
 575             'countries': self._GEO_COUNTRIES,
 576             'ip_blocks': self._GEO_IP_BLOCKS,
 577         })
 578         if not self._ready:
 579             self._initialize_pre_login()
 580             if self.supports_login():
 581                 username, password = self._get_login_info()
 582                 if username:
 583                     self._perform_login(username, password)
 584             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 585                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 586             self._real_initialize()
 587             self._ready = True
 588
 589     def _initialize_geo_bypass(self, geo_bypass_context):
 590         """
 591         Initialize geo restriction bypass mechanism.
 592
 593         This method is used to initialize geo bypass mechanism based on faking
 594         X-Forwarded-For HTTP header. A random country from provided country list
 595         is selected and a random IP belonging to this country is generated. This
 596         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 597         HTTP requests.
 598
 599         This method will be used for initial geo bypass mechanism initialization
 600         during the instance initialization with _GEO_COUNTRIES and
 601         _GEO_IP_BLOCKS.
 602
 603         You may also manually call it from extractor's code if geo bypass
 604         information is not available beforehand (e.g. obtained during
 605         extraction) or due to some other reason. In this case you should pass
 606         this information in geo bypass context passed as first argument. It may
 607         contain following fields:
 608
 609         countries:  List of geo unrestricted countries (similar
 610                     to _GEO_COUNTRIES)
 611         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 612                     (similar to _GEO_IP_BLOCKS)
 613
 614         """
 615         if not self._x_forwarded_for_ip:
 616
 617             # Geo bypass mechanism is explicitly disabled by user
 618             if not self.get_param('geo_bypass', True):
 619                 return
 620
 621             if not geo_bypass_context:
 622                 geo_bypass_context = {}
 623
 624             # Backward compatibility: previously _initialize_geo_bypass
 625             # expected a list of countries, some 3rd party code may still use
 626             # it this way
 627             if isinstance(geo_bypass_context, (list, tuple)):
 628                 geo_bypass_context = {
 629                     'countries': geo_bypass_context,
 630                 }
 631
 632             # The whole point of geo bypass mechanism is to fake IP
 633             # as X-Forwarded-For HTTP header based on some IP block or
 634             # country code.
 635
 636             # Path 1: bypassing based on IP block in CIDR notation
 637
 638             # Explicit IP block specified by user, use it right away
 639             # regardless of whether extractor is geo bypassable or not
 640             ip_block = self.get_param('geo_bypass_ip_block', None)
 641
 642             # Otherwise use random IP block from geo bypass context but only
 643             # if extractor is known as geo bypassable
 644             if not ip_block:
 645                 ip_blocks = geo_bypass_context.get('ip_blocks')
 646                 if self._GEO_BYPASS and ip_blocks:
 647                     ip_block = random.choice(ip_blocks)
 648
 649             if ip_block:
 650                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 651                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 652                 return
 653
 654             # Path 2: bypassing based on country code
 655
 656             # Explicit country code specified by user, use it right away
 657             # regardless of whether extractor is geo bypassable or not
 658             country = self.get_param('geo_bypass_country', None)
 659
 660             # Otherwise use random country code from geo bypass context but
 661             # only if extractor is known as geo bypassable
 662             if not country:
 663                 countries = geo_bypass_context.get('countries')
 664                 if self._GEO_BYPASS and countries:
 665                     country = random.choice(countries)
 666
 667             if country:
 668                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 669                 self._downloader.write_debug(
 670                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 671
 672     def extract(self, url):
 673         """Extracts URL information and returns it in list of dicts."""
 674         try:
 675             for _ in range(2):
 676                 try:
 677                     self.initialize()
 678                     self.to_screen('Extracting URL: %s' % (
 679                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 680                     ie_result = self._real_extract(url)
 681                     if ie_result is None:
 682                         return None
 683                     if self._x_forwarded_for_ip:
 684                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 685                     subtitles = ie_result.get('subtitles') or {}
 686                     if 'no-live-chat' in self.get_param('compat_opts'):
 687                         for lang in ('live_chat', 'comments', 'danmaku'):
 688                             subtitles.pop(lang, None)
 689                     return ie_result
 690                 except GeoRestrictedError as e:
 691                     if self.__maybe_fake_ip_and_retry(e.countries):
 692                         continue
 693                     raise
 694         except UnsupportedError:
 695             raise
 696         except ExtractorError as e:
 697             e.video_id = e.video_id or self.get_temp_id(url),
 698             e.ie = e.ie or self.IE_NAME,
 699             e.traceback = e.traceback or sys.exc_info()[2]
 700             raise
 701         except http.client.IncompleteRead as e:
 702             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 703         except (KeyError, StopIteration) as e:
 704             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 705
 706     def __maybe_fake_ip_and_retry(self, countries):
 707         if (not self.get_param('geo_bypass_country', None)
 708                 and self._GEO_BYPASS
 709                 and self.get_param('geo_bypass', True)
 710                 and not self._x_forwarded_for_ip
 711                 and countries):
 712             country_code = random.choice(countries)
 713             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 714             if self._x_forwarded_for_ip:
 715                 self.report_warning(
 716                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 717                     % (self._x_forwarded_for_ip, country_code.upper()))
 718                 return True
 719         return False
 720
 721     def set_downloader(self, downloader):
 722         """Sets a YoutubeDL instance as the downloader for this IE."""
 723         self._downloader = downloader
 724
 725     @property
 726     def cache(self):
 727         return self._downloader.cache
 728
 729     @property
 730     def cookiejar(self):
 731         return self._downloader.cookiejar
 732
 733     def _initialize_pre_login(self):
 734         """ Initialization before login. Redefine in subclasses."""
 735         pass
 736
 737     def _perform_login(self, username, password):
 738         """ Login with username and password. Redefine in subclasses."""
 739         pass
 740
 741     def _real_initialize(self):
 742         """Real initialization process. Redefine in subclasses."""
 743         pass
 744
 745     def _real_extract(self, url):
 746         """Real extraction process. Redefine in subclasses."""
 747         raise NotImplementedError('This method must be implemented by subclasses')
 748
 749     @classmethod
 750     def ie_key(cls):
 751         """A string for getting the InfoExtractor with get_info_extractor"""
 752         return cls.__name__[:-2]
 753
 754     @classproperty
 755     def IE_NAME(cls):
 756         return cls.__name__[:-2]
 757
 758     @staticmethod
 759     def __can_accept_status_code(err, expected_status):
 760         assert isinstance(err, urllib.error.HTTPError)
 761         if expected_status is None:
 762             return False
 763         elif callable(expected_status):
 764             return expected_status(err.code) is True
 765         else:
 766             return err.code in variadic(expected_status)
 767
 768     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 769         if isinstance(url_or_request, urllib.request.Request):
 770             return update_Request(url_or_request, data=data, headers=headers, query=query)
 771         if query:
 772             url_or_request = update_url_query(url_or_request, query)
 773         return sanitized_Request(url_or_request, data, headers or {})
 774
 775     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 776         """
 777         Return the response handle.
 778
 779         See _download_webpage docstring for arguments specification.
 780         """
 781         if not self._downloader._first_webpage_request:
 782             sleep_interval = self.get_param('sleep_interval_requests') or 0
 783             if sleep_interval > 0:
 784                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 785                 time.sleep(sleep_interval)
 786         else:
 787             self._downloader._first_webpage_request = False
 788
 789         if note is None:
 790             self.report_download_webpage(video_id)
 791         elif note is not False:
 792             if video_id is None:
 793                 self.to_screen(str(note))
 794             else:
 795                 self.to_screen(f'{video_id}: {note}')
 796
 797         # Some sites check X-Forwarded-For HTTP header in order to figure out
 798         # the origin of the client behind proxy. This allows bypassing geo
 799         # restriction by faking this header's value to IP that belongs to some
 800         # geo unrestricted country. We will do so once we encounter any
 801         # geo restriction error.
 802         if self._x_forwarded_for_ip:
 803             headers = (headers or {}).copy()
 804             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 805
 806         try:
 807             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 808         except network_exceptions as err:
 809             if isinstance(err, urllib.error.HTTPError):
 810                 if self.__can_accept_status_code(err, expected_status):
 811                     # Retain reference to error to prevent file object from
 812                     # being closed before it can be read. Works around the
 813                     # effects of <https://bugs.python.org/issue15002>
 814                     # introduced in Python 3.4.1.
 815                     err.fp._error = err
 816                     return err.fp
 817
 818             if errnote is False:
 819                 return False
 820             if errnote is None:
 821                 errnote = 'Unable to download webpage'
 822
 823             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 824             if fatal:
 825                 raise ExtractorError(errmsg, cause=err)
 826             else:
 827                 self.report_warning(errmsg)
 828                 return False
 829
 830     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 831                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 832         """
 833         Return a tuple (page content as string, URL handle).
 834
 835         Arguments:
 836         url_or_request -- plain text URL as a string or
 837             a urllib.request.Request object
 838         video_id -- Video/playlist/item identifier (string)
 839
 840         Keyword arguments:
 841         note -- note printed before downloading (string)
 842         errnote -- note printed in case of an error (string)
 843         fatal -- flag denoting whether error should be considered fatal,
 844             i.e. whether it should cause ExtractionError to be raised,
 845             otherwise a warning will be reported and extraction continued
 846         encoding -- encoding for a page content decoding, guessed automatically
 847             when not explicitly specified
 848         data -- POST data (bytes)
 849         headers -- HTTP headers (dict)
 850         query -- URL query (dict)
 851         expected_status -- allows to accept failed HTTP requests (non 2xx
 852             status code) by explicitly specifying a set of accepted status
 853             codes. Can be any of the following entities:
 854                 - an integer type specifying an exact failed status code to
 855                   accept
 856                 - a list or a tuple of integer types specifying a list of
 857                   failed status codes to accept
 858                 - a callable accepting an actual failed status code and
 859                   returning True if it should be accepted
 860             Note that this argument does not affect success status codes (2xx)
 861             which are always accepted.
 862         """
 863
 864         # Strip hashes from the URL (#1038)
 865         if isinstance(url_or_request, str):
 866             url_or_request = url_or_request.partition('#')[0]
 867
 868         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 869         if urlh is False:
 870             assert not fatal
 871             return False
 872         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 873         return (content, urlh)
 874
 875     @staticmethod
 876     def _guess_encoding_from_content(content_type, webpage_bytes):
 877         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 878         if m:
 879             encoding = m.group(1)
 880         else:
 881             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 882                           webpage_bytes[:1024])
 883             if m:
 884                 encoding = m.group(1).decode('ascii')
 885             elif webpage_bytes.startswith(b'\xff\xfe'):
 886                 encoding = 'utf-16'
 887             else:
 888                 encoding = 'utf-8'
 889
 890         return encoding
 891
 892     def __check_blocked(self, content):
 893         first_block = content[:512]
 894         if ('<title>Access to this site is blocked</title>' in content
 895                 and 'Websense' in first_block):
 896             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 897             blocked_iframe = self._html_search_regex(
 898                 r'<iframe src="([^"]+)"', content,
 899                 'Websense information URL', default=None)
 900             if blocked_iframe:
 901                 msg += ' Visit %s for more details' % blocked_iframe
 902             raise ExtractorError(msg, expected=True)
 903         if '<title>The URL you requested has been blocked</title>' in first_block:
 904             msg = (
 905                 'Access to this webpage has been blocked by Indian censorship. '
 906                 'Use a VPN or proxy server (with --proxy) to route around it.')
 907             block_msg = self._html_search_regex(
 908                 r'</h1><p>(.*?)</p>',
 909                 content, 'block message', default=None)
 910             if block_msg:
 911                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 912             raise ExtractorError(msg, expected=True)
 913         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 914                 and 'blocklist.rkn.gov.ru' in content):
 915             raise ExtractorError(
 916                 'Access to this webpage has been blocked by decision of the Russian government. '
 917                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 918                 expected=True)
 919
 920     def _request_dump_filename(self, url, video_id):
 921         basen = f'{video_id}_{url}'
 922         trim_length = self.get_param('trim_file_name') or 240
 923         if len(basen) > trim_length:
 924             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 925             basen = basen[:trim_length - len(h)] + h
 926         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 927         # Working around MAX_PATH limitation on Windows (see
 928         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 929         if compat_os_name == 'nt':
 930             absfilepath = os.path.abspath(filename)
 931             if len(absfilepath) > 259:
 932                 filename = fR'\\?\{absfilepath}'
 933         return filename
 934
 935     def __decode_webpage(self, webpage_bytes, encoding, headers):
 936         if not encoding:
 937             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 938         try:
 939             return webpage_bytes.decode(encoding, 'replace')
 940         except LookupError:
 941             return webpage_bytes.decode('utf-8', 'replace')
 942
 943     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 944         webpage_bytes = urlh.read()
 945         if prefix is not None:
 946             webpage_bytes = prefix + webpage_bytes
 947         if self.get_param('dump_intermediate_pages', False):
 948             self.to_screen('Dumping request to ' + urlh.geturl())
 949             dump = base64.b64encode(webpage_bytes).decode('ascii')
 950             self._downloader.to_screen(dump)
 951         if self.get_param('write_pages'):
 952             filename = self._request_dump_filename(urlh.geturl(), video_id)
 953             self.to_screen(f'Saving request to {filename}')
 954             with open(filename, 'wb') as outf:
 955                 outf.write(webpage_bytes)
 956
 957         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 958         self.__check_blocked(content)
 959
 960         return content
 961
 962     def __print_error(self, errnote, fatal, video_id, err):
 963         if fatal:
 964             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 965         elif errnote:
 966             self.report_warning(f'{video_id}: {errnote}: {err}')
 967
 968     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 969         if transform_source:
 970             xml_string = transform_source(xml_string)
 971         try:
 972             return compat_etree_fromstring(xml_string.encode('utf-8'))
 973         except xml.etree.ElementTree.ParseError as ve:
 974             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 975
 976     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 977         try:
 978             return json.loads(
 979                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 980         except ValueError as ve:
 981             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 982
 983     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 984         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 985
    def __create_download_methods(name, parser, note, errnote, return_value):
        """Build the pair of methods (_download_<name>_handle, _download_<name>).

        This is a plain function used while the class body is executed; it is
        not called on instances.

        @param name             Used to form the names of the generated methods
        @param parser           Name of the method that parses the downloaded
                                content, or None to return the content as is
        @param note             Default note of the generated methods
        @param errnote          Default errnote of the generated methods
        @param return_value     Description of the returned value, inserted
                                into the generated docstrings
        @returns                Tuple (download_handle, download_content)
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is only forwarded when it is False; for any other value
            # the parser is called without one
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Becomes _download_<name>_handle: download, parse and return the
        # parsed content together with the URL handle (or False on failure)
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Becomes _download_<name>: same as above, but only the parsed content
        # is returned
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, a previously dumped response is
            # used if it can be read; otherwise a warning is printed and the
            # page is downloaded as usual
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser the handle method is _download_webpage_handle,
            # which does not take transform_source
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Give func the name, qualified name and docstring of the method it
        # is going to be
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1049
    # Download methods generated for each kind of content. Every call yields
    # the pair (_download_<kind>_handle, _download_<kind>); for plain webpages
    # only the content-returning method is kept (privately), since
    # _download_webpage_handle and _download_webpage are written out by hand
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1057
1058     def _download_webpage(
1059             self, url_or_request, video_id, note=None, errnote=None,
1060             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1061         """
1062         Return the data of the page as a string.
1063
1064         Keyword arguments:
1065         tries -- number of tries
1066         timeout -- sleep interval between tries
1067
1068         See _download_webpage_handle docstring for other arguments specification.
1069         """
1070
1071         R''' # NB: These are unused; should they be deprecated?
1072         if tries != 1:
1073             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1074         if timeout is NO_DEFAULT:
1075             timeout = 5
1076         else:
1077             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1078         '''
1079
1080         try_count = 0
1081         while True:
1082             try:
1083                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1084             except http.client.IncompleteRead as e:
1085                 try_count += 1
1086                 if try_count >= tries:
1087                     raise e
1088                 self._sleep(timeout, video_id)
1089
1090     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1091         idstr = format_field(video_id, None, '%s: ')
1092         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1093         if only_once:
1094             if f'WARNING: {msg}' in self._printed_messages:
1095                 return
1096             self._printed_messages.add(f'WARNING: {msg}')
1097         self._downloader.report_warning(msg, *args, **kwargs)
1098
1099     def to_screen(self, msg, *args, **kwargs):
1100         """Print msg to screen, prefixing it with '[ie_name]'"""
1101         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1102
1103     def write_debug(self, msg, *args, **kwargs):
1104         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1105
1106     def get_param(self, name, default=None, *args, **kwargs):
1107         if self._downloader:
1108             return self._downloader.params.get(name, default, *args, **kwargs)
1109         return default
1110
1111     def report_drm(self, video_id, partial=NO_DEFAULT):
1112         if partial is not NO_DEFAULT:
1113             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1114         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1115
1116     def report_extraction(self, id_or_name):
1117         """Report information extraction."""
1118         self.to_screen('%s: Extracting information' % id_or_name)
1119
1120     def report_download_webpage(self, video_id):
1121         """Report webpage download."""
1122         self.to_screen('%s: Downloading webpage' % video_id)
1123
1124     def report_age_confirmation(self):
1125         """Report attempt to confirm age."""
1126         self.to_screen('Confirming age')
1127
1128     def report_login(self):
1129         """Report attempt to log in."""
1130         self.to_screen('Logging in')
1131
1132     def raise_login_required(
1133             self, msg='This video is only available for registered users',
1134             metadata_available=False, method=NO_DEFAULT):
1135         if metadata_available and (
1136                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1137             self.report_warning(msg)
1138             return
1139         msg += format_field(self._login_hint(method), None, '. %s')
1140         raise ExtractorError(msg, expected=True)
1141
1142     def raise_geo_restricted(
1143             self, msg='This video is not available from your location due to geo restriction',
1144             countries=None, metadata_available=False):
1145         if metadata_available and (
1146                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1147             self.report_warning(msg)
1148         else:
1149             raise GeoRestrictedError(msg, countries=countries)
1150
1151     def raise_no_formats(self, msg, expected=False, video_id=None):
1152         if expected and (
1153                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1154             self.report_warning(msg, video_id)
1155         elif isinstance(msg, ExtractorError):
1156             raise msg
1157         else:
1158             raise ExtractorError(msg, expected=expected, video_id=video_id)
1159
1160     # Methods for following #608
1161     @staticmethod
1162     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1163         """Returns a URL that points to a page that should be processed"""
1164         if ie is not None:
1165             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1166         if video_id is not None:
1167             kwargs['id'] = video_id
1168         if video_title is not None:
1169             kwargs['title'] = video_title
1170         return {
1171             **kwargs,
1172             '_type': 'url_transparent' if url_transparent else 'url',
1173             'url': url,
1174         }
1175
1176     @classmethod
1177     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1178                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1179         return cls.playlist_result(
1180             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1181             playlist_id, playlist_title, **kwargs)
1182
1183     @staticmethod
1184     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1185         """Returns a playlist"""
1186         if playlist_id:
1187             kwargs['id'] = playlist_id
1188         if playlist_title:
1189             kwargs['title'] = playlist_title
1190         if playlist_description is not None:
1191             kwargs['description'] = playlist_description
1192         return {
1193             **kwargs,
1194             '_type': 'multi_video' if multi_video else 'playlist',
1195             'entries': entries,
1196         }
1197
1198     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1199         """
1200         Perform a regex search on the given string, using a single or a list of
1201         patterns returning the first matching group.
1202         In case of failure return a default value or raise a WARNING or a
1203         RegexNotFoundError, depending on fatal, specifying the field name.
1204         """
1205         if string is None:
1206             mobj = None
1207         elif isinstance(pattern, (str, re.Pattern)):
1208             mobj = re.search(pattern, string, flags)
1209         else:
1210             for p in pattern:
1211                 mobj = re.search(p, string, flags)
1212                 if mobj:
1213                     break
1214
1215         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1216
1217         if mobj:
1218             if group is None:
1219                 # return the first matching group
1220                 return next(g for g in mobj.groups() if g is not None)
1221             elif isinstance(group, (list, tuple)):
1222                 return tuple(mobj.group(g) for g in group)
1223             else:
1224                 return mobj.group(group)
1225         elif default is not NO_DEFAULT:
1226             return default
1227         elif fatal:
1228             raise RegexNotFoundError('Unable to extract %s' % _name)
1229         else:
1230             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1231             return None
1232
1233     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1234                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1235         """Searches string for the JSON object specified by start_pattern"""
1236         # NB: end_pattern is only used to reduce the size of the initial match
1237         if default is NO_DEFAULT:
1238             default, has_default = {}, False
1239         else:
1240             fatal, has_default = False, True
1241
1242         json_string = self._search_regex(
1243             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1244             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1245         if not json_string:
1246             return default
1247
1248         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1249         try:
1250             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1251         except ExtractorError as e:
1252             if fatal:
1253                 raise ExtractorError(
1254                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1255             elif not has_default:
1256                 self.report_warning(
1257                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1258         return default
1259
1260     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1261         """
1262         Like _search_regex, but strips HTML tags and unescapes entities.
1263         """
1264         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1265         if res:
1266             return clean_html(res).strip()
1267         else:
1268             return res
1269
1270     def _get_netrc_login_info(self, netrc_machine=None):
1271         username = None
1272         password = None
1273         netrc_machine = netrc_machine or self._NETRC_MACHINE
1274
1275         if self.get_param('usenetrc', False):
1276             try:
1277                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1278                 if os.path.isdir(netrc_file):
1279                     netrc_file = os.path.join(netrc_file, '.netrc')
1280                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1281                 if info is not None:
1282                     username = info[0]
1283                     password = info[2]
1284                 else:
1285                     raise netrc.NetrcParseError(
1286                         'No authenticators for %s' % netrc_machine)
1287             except (OSError, netrc.NetrcParseError) as err:
1288                 self.report_warning(
1289                     'parsing .netrc: %s' % error_to_compat_str(err))
1290
1291         return username, password
1292
1293     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1294         """
1295         Get the login info as (username, password)
1296         First look for the manually specified credentials using username_option
1297         and password_option as keys in params dictionary. If no such credentials
1298         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1299         value.
1300         If there's no info available, return (None, None)
1301         """
1302
1303         # Attempt to use provided username and password or .netrc data
1304         username = self.get_param(username_option)
1305         if username is not None:
1306             password = self.get_param(password_option)
1307         else:
1308             username, password = self._get_netrc_login_info(netrc_machine)
1309
1310         return username, password
1311
1312     def _get_tfa_info(self, note='two-factor verification code'):
1313         """
1314         Get the two-factor authentication info
1315         TODO - asking the user will be required for sms/phone verify
1316         currently just uses the command line option
1317         If there's no info available, return None
1318         """
1319
1320         tfa = self.get_param('twofactor')
1321         if tfa is not None:
1322             return tfa
1323
1324         return getpass.getpass('Type %s and press [Return]: ' % note)
1325
1326     # Helper functions for extracting OpenGraph info
1327     @staticmethod
1328     def _og_regexes(prop):
1329         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1330         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1331                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1332         template = r'<meta[^>]+?%s[^>]+?%s'
1333         return [
1334             template % (property_re, content_re),
1335             template % (content_re, property_re),
1336         ]
1337
1338     @staticmethod
1339     def _meta_regex(prop):
1340         return r'''(?isx)<meta
1341                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1342                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1343
1344     def _og_search_property(self, prop, html, name=None, **kargs):
1345         prop = variadic(prop)
1346         if name is None:
1347             name = 'OpenGraph %s' % prop[0]
1348         og_regexes = []
1349         for p in prop:
1350             og_regexes.extend(self._og_regexes(p))
1351         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1352         if escaped is None:
1353             return None
1354         return unescapeHTML(escaped)
1355
1356     def _og_search_thumbnail(self, html, **kargs):
1357         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1358
1359     def _og_search_description(self, html, **kargs):
1360         return self._og_search_property('description', html, fatal=False, **kargs)
1361
1362     def _og_search_title(self, html, *, fatal=False, **kargs):
1363         return self._og_search_property('title', html, fatal=fatal, **kargs)
1364
1365     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1366         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1367         if secure:
1368             regexes = self._og_regexes('video:secure_url') + regexes
1369         return self._html_search_regex(regexes, html, name, **kargs)
1370
1371     def _og_search_url(self, html, **kargs):
1372         return self._og_search_property('url', html, **kargs)
1373
1374     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1375         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1376
1377     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1378         name = variadic(name)
1379         if display_name is None:
1380             display_name = name[0]
1381         return self._html_search_regex(
1382             [self._meta_regex(n) for n in name],
1383             html, display_name, fatal=fatal, group='content', **kwargs)
1384
1385     def _dc_search_uploader(self, html):
1386         return self._html_search_meta('dc.creator', html, 'uploader')
1387
1388     @staticmethod
1389     def _rta_search(html):
1390         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1391         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1392                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1393                      html):
1394             return 18
1395
1396         # And then there are the jokers who advertise that they use RTA, but actually don't.
1397         AGE_LIMIT_MARKERS = [
1398             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1399         ]
1400         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1401             return 18
1402         return 0
1403
1404     def _media_rating_search(self, html):
1405         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1406         rating = self._html_search_meta('rating', html)
1407
1408         if not rating:
1409             return None
1410
1411         RATING_TABLE = {
1412             'safe for kids': 0,
1413             'general': 8,
1414             '14 years': 14,
1415             'mature': 17,
1416             'restricted': 19,
1417         }
1418         return RATING_TABLE.get(rating.lower())
1419
1420     def _family_friendly_search(self, html):
1421         # See http://schema.org/VideoObject
1422         family_friendly = self._html_search_meta(
1423             'isFamilyFriendly', html, default=None)
1424
1425         if not family_friendly:
1426             return None
1427
1428         RATING_TABLE = {
1429             '1': 0,
1430             'true': 0,
1431             '0': 18,
1432             'false': 18,
1433         }
1434         return RATING_TABLE.get(family_friendly.lower())
1435
1436     def _twitter_search_player(self, html):
1437         return self._html_search_meta('twitter:player', html,
1438                                       'twitter card player')
1439
1440     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1441         """Yield all json ld objects in the html"""
1442         if default is not NO_DEFAULT:
1443             fatal = False
1444         for mobj in re.finditer(JSON_LD_RE, html):
1445             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1446             for json_ld in variadic(json_ld_item):
1447                 if isinstance(json_ld, dict):
1448                     yield json_ld
1449
1450     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1451         """Search for a video in any json ld in the html"""
1452         if default is not NO_DEFAULT:
1453             fatal = False
1454         info = self._json_ld(
1455             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1456             video_id, fatal=fatal, expected_type=expected_type)
1457         if info:
1458             return info
1459         if default is not NO_DEFAULT:
1460             return default
1461         elif fatal:
1462             raise RegexNotFoundError('Unable to extract JSON-LD')
1463         else:
1464             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1465             return {}
1466
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Collect video metadata from JSON-LD data into an info dict.

        json_ld may be a JSON-LD object, a list of them, or a string, which is
        first parsed with _parse_json (using video_id and fatal).
        If expected_type is given, entries whose "@type" does not contain it
        are skipped, and processing of the current list of entries stops after
        the first entry that was handled.
        Returns the collected fields passed through filter_dict, or an empty
        dict when json_ld is empty.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator that all helpers below write into
        info = {}

        # Last path component of an interactionType => prefix of the "<kind>_count" field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        # True if any of expected_types is contained in the "@type" value(s) of e
        def is_type(e, *expected_types):
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        # "interactionType" of e (its "@type" when it is a dict), passed through str_or_none
        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        # Fill the "<kind>_count" fields of info from the InteractionCounter items of e
        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            # A single counter object is handled like a one-element list
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last "/" is looked up, so URL-style types work too
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # A count that was already extracted is not overwritten
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        # Build info['chapters'] from the "hasPart" entries of e that are of type "Clip"
        def extract_chapter_information(e):
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walks every chapter except the last one, together with its neighbours:
            # a missing end is taken from the start of the next chapter and a missing
            # start from the end of the previous one (0 for the first chapter)
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter falls back to the duration set by extract_video_object
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        # Update info with the fields of a VideoObject/AudioObject entry
        def extract_video_object(e):
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            # Must run after the update above: the chapter extraction reads info['duration']
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        # Walk the entries of json_ld and dispatch on their "@type"
        def traverse_json_ld(json_ld, at_top_level=True):
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level entries must carry an "@context"
                if at_top_level and '@context' not in e:
                    continue
                # An entry consisting only of "@context" and "@graph" is a container:
                # descend into the graph, whose members need no "@context" of their own
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title when no title was found so far
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first item of "video" (or else of "subjectOf") is considered
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # Without an expected type, keep going through the remaining entries
                    if expected_type is None:
                        continue
                    else:
                        break
                # Entries of any other type may still embed a VideoObject under "video"
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1636
1637     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1638         return self._parse_json(
1639             self._search_regex(
1640                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1641                 webpage, 'next.js data', fatal=fatal, **kw),
1642             video_id, transform_source=transform_source, fatal=fatal)
1643
1644     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1645         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1646         rectx = re.escape(context_name)
1647         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1648         js, arg_keys, arg_vals = self._search_regex(
1649             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1650             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1651             default=NO_DEFAULT if fatal else (None, None, None))
1652         if js is None:
1653             return {}
1654
1655         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1656
1657         for key, val in args.items():
1658             if val in ('undefined', 'void 0'):
1659                 args[key] = 'null'
1660
1661         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1662         return traverse_obj(ret, traverse) or {}
1663
1664     @staticmethod
1665     def _hidden_inputs(html):
1666         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1667         hidden_inputs = {}
1668         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1669             attrs = extract_attributes(input)
1670             if not input:
1671                 continue
1672             if attrs.get('type') not in ('hidden', 'submit'):
1673                 continue
1674             name = attrs.get('name') or attrs.get('id')
1675             value = attrs.get('value')
1676             if name and value is not None:
1677                 hidden_inputs[name] = value
1678         return hidden_inputs
1679
1680     def _form_hidden_inputs(self, form_id, html):
1681         form = self._search_regex(
1682             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1683             html, '%s form' % form_id, group='form')
1684         return self._hidden_inputs(form)
1685
1686     @classproperty(cache=True)
1687     def FormatSort(cls):
1688         class FormatSort(FormatSorter):
1689             def __init__(ie, *args, **kwargs):
1690                 super().__init__(ie._downloader, *args, **kwargs)
1691
1692         deprecation_warning(
1693             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1694             'Use yt_dlp.utils.FormatSorter instead')
1695         return FormatSort
1696
1697     def _sort_formats(self, formats, field_preference=[]):
1698         if not field_preference:
1699             self._downloader.deprecation_warning(
1700                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1701             return
1702         self._downloader.deprecation_warning(
1703             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1704             'Return _format_sort_fields in the info_dict instead')
1705         if formats:
1706             formats[0]['__sort_fields'] = field_preference
1707
1708     def _check_formats(self, formats, video_id):
1709         if formats:
1710             formats[:] = filter(
1711                 lambda f: self._is_valid_url(
1712                     f['url'], video_id,
1713                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1714                 formats)
1715
1716     @staticmethod
1717     def _remove_duplicate_formats(formats):
1718         format_urls = set()
1719         unique_formats = []
1720         for f in formats:
1721             if f['url'] not in format_urls:
1722                 format_urls.add(f['url'])
1723                 unique_formats.append(f)
1724         formats[:] = unique_formats
1725
1726     def _is_valid_url(self, url, video_id, item='video', headers={}):
1727         url = self._proto_relative_url(url, scheme='http:')
1728         # For now assume non HTTP(S) URLs always valid
1729         if not (url.startswith('http://') or url.startswith('https://')):
1730             return True
1731         try:
1732             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1733             return True
1734         except ExtractorError as e:
1735             self.to_screen(
1736                 '%s: %s URL is invalid, skipping: %s'
1737                 % (video_id, item, error_to_compat_str(e.cause)))
1738             return False
1739
1740     def http_scheme(self):
1741         """ Either "http:" or "https:", depending on the user's preferences """
1742         return (
1743             'http:'
1744             if self.get_param('prefer_insecure', False)
1745             else 'https:')
1746
1747     def _proto_relative_url(self, url, scheme=None):
1748         scheme = scheme or self.http_scheme()
1749         assert scheme.endswith(':')
1750         return sanitize_url(url, scheme=scheme[:-1])
1751
1752     def _sleep(self, timeout, video_id, msg_template=None):
1753         if msg_template is None:
1754             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1755         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1756         self.to_screen(msg)
1757         time.sleep(timeout)
1758
1759     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1760                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1761                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1762         if self.get_param('ignore_no_formats_error'):
1763             fatal = False
1764
1765         res = self._download_xml_handle(
1766             manifest_url, video_id, 'Downloading f4m manifest',
1767             'Unable to download f4m manifest',
1768             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1769             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1770             transform_source=transform_source,
1771             fatal=fatal, data=data, headers=headers, query=query)
1772         if res is False:
1773             return []
1774
1775         manifest, urlh = res
1776         manifest_url = urlh.geturl()
1777
1778         return self._parse_f4m_formats(
1779             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1780             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1781
1782     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1783                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1784                            fatal=True, m3u8_id=None):
1785         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1786             return []
1787
1788         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1789         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1790         if akamai_pv is not None and ';' in akamai_pv.text:
1791             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1792             if playerVerificationChallenge.strip() != '':
1793                 return []
1794
1795         formats = []
1796         manifest_version = '1.0'
1797         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1798         if not media_nodes:
1799             manifest_version = '2.0'
1800             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1801         # Remove unsupported DRM protected media from final formats
1802         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1803         media_nodes = remove_encrypted_media(media_nodes)
1804         if not media_nodes:
1805             return formats
1806
1807         manifest_base_url = get_base_url(manifest)
1808
1809         bootstrap_info = xpath_element(
1810             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1811             'bootstrap info', default=None)
1812
1813         vcodec = None
1814         mime_type = xpath_text(
1815             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1816             'base URL', default=None)
1817         if mime_type and mime_type.startswith('audio/'):
1818             vcodec = 'none'
1819
1820         for i, media_el in enumerate(media_nodes):
1821             tbr = int_or_none(media_el.attrib.get('bitrate'))
1822             width = int_or_none(media_el.attrib.get('width'))
1823             height = int_or_none(media_el.attrib.get('height'))
1824             format_id = join_nonempty(f4m_id, tbr or i)
1825             # If <bootstrapInfo> is present, the specified f4m is a
1826             # stream-level manifest, and only set-level manifests may refer to
1827             # external resources.  See section 11.4 and section 4 of F4M spec
1828             if bootstrap_info is None:
1829                 media_url = None
1830                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1831                 if manifest_version == '2.0':
1832                     media_url = media_el.attrib.get('href')
1833                 if media_url is None:
1834                     media_url = media_el.attrib.get('url')
1835                 if not media_url:
1836                     continue
1837                 manifest_url = (
1838                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1839                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1840                 # If media_url is itself a f4m manifest do the recursive extraction
1841                 # since bitrates in parent manifest (this one) and media_url manifest
1842                 # may differ leading to inability to resolve the format by requested
1843                 # bitrate in f4m downloader
1844                 ext = determine_ext(manifest_url)
1845                 if ext == 'f4m':
1846                     f4m_formats = self._extract_f4m_formats(
1847                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1848                         transform_source=transform_source, fatal=fatal)
1849                     # Sometimes stream-level manifest contains single media entry that
1850                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1851                     # At the same time parent's media entry in set-level manifest may
1852                     # contain it. We will copy it from parent in such cases.
1853                     if len(f4m_formats) == 1:
1854                         f = f4m_formats[0]
1855                         f.update({
1856                             'tbr': f.get('tbr') or tbr,
1857                             'width': f.get('width') or width,
1858                             'height': f.get('height') or height,
1859                             'format_id': f.get('format_id') if not tbr else format_id,
1860                             'vcodec': vcodec,
1861                         })
1862                     formats.extend(f4m_formats)
1863                     continue
1864                 elif ext == 'm3u8':
1865                     formats.extend(self._extract_m3u8_formats(
1866                         manifest_url, video_id, 'mp4', preference=preference,
1867                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1868                     continue
1869             formats.append({
1870                 'format_id': format_id,
1871                 'url': manifest_url,
1872                 'manifest_url': manifest_url,
1873                 'ext': 'flv' if bootstrap_info is not None else None,
1874                 'protocol': 'f4m',
1875                 'tbr': tbr,
1876                 'width': width,
1877                 'height': height,
1878                 'vcodec': vcodec,
1879                 'preference': preference,
1880                 'quality': quality,
1881             })
1882         return formats
1883
1884     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1885         return {
1886             'format_id': join_nonempty(m3u8_id, 'meta'),
1887             'url': m3u8_url,
1888             'ext': ext,
1889             'protocol': 'm3u8',
1890             'preference': preference - 100 if preference else -100,
1891             'quality': quality,
1892             'resolution': 'multiple',
1893             'format_note': 'Quality selection URL',
1894         }
1895
1896     def _report_ignoring_subs(self, name):
1897         self.report_warning(bug_reports_message(
1898             f'Ignoring subtitle tracks found in the {name} manifest; '
1899             'if any subtitle tracks are missing,'
1900         ), only_once=True)
1901
1902     def _extract_m3u8_formats(self, *args, **kwargs):
1903         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1904         if subs:
1905             self._report_ignoring_subs('HLS')
1906         return fmts
1907
1908     def _extract_m3u8_formats_and_subtitles(
1909             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1910             preference=None, quality=None, m3u8_id=None, note=None,
1911             errnote=None, fatal=True, live=False, data=None, headers={},
1912             query={}):
1913
1914         if self.get_param('ignore_no_formats_error'):
1915             fatal = False
1916
1917         if not m3u8_url:
1918             if errnote is not False:
1919                 errnote = errnote or 'Failed to obtain m3u8 URL'
1920                 if fatal:
1921                     raise ExtractorError(errnote, video_id=video_id)
1922                 self.report_warning(f'{errnote}{bug_reports_message()}')
1923             return [], {}
1924
1925         res = self._download_webpage_handle(
1926             m3u8_url, video_id,
1927             note='Downloading m3u8 information' if note is None else note,
1928             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1929             fatal=fatal, data=data, headers=headers, query=query)
1930
1931         if res is False:
1932             return [], {}
1933
1934         m3u8_doc, urlh = res
1935         m3u8_url = urlh.geturl()
1936
1937         return self._parse_m3u8_formats_and_subtitles(
1938             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1939             preference=preference, quality=quality, m3u8_id=m3u8_id,
1940             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1941             headers=headers, query=query, video_id=video_id)
1942
1943     def _parse_m3u8_formats_and_subtitles(
1944             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1945             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1946             errnote=None, fatal=True, data=None, headers={}, query={},
1947             video_id=None):
1948         formats, subtitles = [], {}
1949
1950         has_drm = re.search('|'.join([
1951             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
1952             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
1953         ]), m3u8_doc)
1954
1955         def format_url(url):
1956             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1957
1958         if self.get_param('hls_split_discontinuity', False):
1959             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1960                 if not m3u8_doc:
1961                     if not manifest_url:
1962                         return []
1963                     m3u8_doc = self._download_webpage(
1964                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1965                         note=False, errnote='Failed to download m3u8 playlist information')
1966                     if m3u8_doc is False:
1967                         return []
1968                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
1969
1970         else:
1971             def _extract_m3u8_playlist_indices(*args, **kwargs):
1972                 return [None]
1973
1974         # References:
1975         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1976         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1977         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1978
1979         # We should try extracting formats only from master playlists [1, 4.3.4],
1980         # i.e. playlists that describe available qualities. On the other hand
1981         # media playlists [1, 4.3.3] should be returned as is since they contain
1982         # just the media without qualities renditions.
1983         # Fortunately, master playlist can be easily distinguished from media
1984         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1985         # master playlist tags MUST NOT appear in a media playlist and vice versa.
1986         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1987         # media playlist and MUST NOT appear in master playlist thus we can
1988         # clearly detect media playlist with this criterion.
1989
1990         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1991             formats = [{
1992                 'format_id': join_nonempty(m3u8_id, idx),
1993                 'format_index': idx,
1994                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
1995                 'ext': ext,
1996                 'protocol': entry_protocol,
1997                 'preference': preference,
1998                 'quality': quality,
1999                 'has_drm': has_drm,
2000             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2001
2002             return formats, subtitles
2003
2004         groups = {}
2005         last_stream_inf = {}
2006
2007         def extract_media(x_media_line):
2008             media = parse_m3u8_attributes(x_media_line)
2009             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2010             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2011             if not (media_type and group_id and name):
2012                 return
2013             groups.setdefault(group_id, []).append(media)
2014             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2015             if media_type == 'SUBTITLES':
2016                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2017                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2018                 # However, lack of URI has been spotted in the wild.
2019                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2020                 if not media.get('URI'):
2021                     return
2022                 url = format_url(media['URI'])
2023                 sub_info = {
2024                     'url': url,
2025                     'ext': determine_ext(url),
2026                 }
2027                 if sub_info['ext'] == 'm3u8':
2028                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2029                     # files may contain is WebVTT:
2030                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2031                     sub_info['ext'] = 'vtt'
2032                     sub_info['protocol'] = 'm3u8_native'
2033                 lang = media.get('LANGUAGE') or 'und'
2034                 subtitles.setdefault(lang, []).append(sub_info)
2035             if media_type not in ('VIDEO', 'AUDIO'):
2036                 return
2037             media_url = media.get('URI')
2038             if media_url:
2039                 manifest_url = format_url(media_url)
2040                 formats.extend({
2041                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2042                     'format_note': name,
2043                     'format_index': idx,
2044                     'url': manifest_url,
2045                     'manifest_url': m3u8_url,
2046                     'language': media.get('LANGUAGE'),
2047                     'ext': ext,
2048                     'protocol': entry_protocol,
2049                     'preference': preference,
2050                     'quality': quality,
2051                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2052                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2053
2054         def build_stream_name():
2055             # Despite specification does not mention NAME attribute for
2056             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2057             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2058             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2059             stream_name = last_stream_inf.get('NAME')
2060             if stream_name:
2061                 return stream_name
2062             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2063             # from corresponding rendition group
2064             stream_group_id = last_stream_inf.get('VIDEO')
2065             if not stream_group_id:
2066                 return
2067             stream_group = groups.get(stream_group_id)
2068             if not stream_group:
2069                 return stream_group_id
2070             rendition = stream_group[0]
2071             return rendition.get('NAME') or stream_group_id
2072
2073         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2074         # chance to detect video only formats when EXT-X-STREAM-INF tags
2075         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2076         for line in m3u8_doc.splitlines():
2077             if line.startswith('#EXT-X-MEDIA:'):
2078                 extract_media(line)
2079
2080         for line in m3u8_doc.splitlines():
2081             if line.startswith('#EXT-X-STREAM-INF:'):
2082                 last_stream_inf = parse_m3u8_attributes(line)
2083             elif line.startswith('#') or not line.strip():
2084                 continue
2085             else:
2086                 tbr = float_or_none(
2087                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2088                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2089                 manifest_url = format_url(line.strip())
2090
2091                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2092                     format_id = [m3u8_id, None, idx]
2093                     # Bandwidth of live streams may differ over time thus making
2094                     # format_id unpredictable. So it's better to keep provided
2095                     # format_id intact.
2096                     if not live:
2097                         stream_name = build_stream_name()
2098                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2099                     f = {
2100                         'format_id': join_nonempty(*format_id),
2101                         'format_index': idx,
2102                         'url': manifest_url,
2103                         'manifest_url': m3u8_url,
2104                         'tbr': tbr,
2105                         'ext': ext,
2106                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2107                         'protocol': entry_protocol,
2108                         'preference': preference,
2109                         'quality': quality,
2110                     }
2111                     resolution = last_stream_inf.get('RESOLUTION')
2112                     if resolution:
2113                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2114                         if mobj:
2115                             f['width'] = int(mobj.group('width'))
2116                             f['height'] = int(mobj.group('height'))
2117                     # Unified Streaming Platform
2118                     mobj = re.search(
2119                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2120                     if mobj:
2121                         abr, vbr = mobj.groups()
2122                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2123                         f.update({
2124                             'vbr': vbr,
2125                             'abr': abr,
2126                         })
2127                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2128                     f.update(codecs)
2129                     audio_group_id = last_stream_inf.get('AUDIO')
2130                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2131                     # references a rendition group MUST have a CODECS attribute.
2132                     # However, this is not always respected. E.g. [2]
2133                     # contains EXT-X-STREAM-INF tag which references AUDIO
2134                     # rendition group but does not have CODECS and despite
2135                     # referencing an audio group it represents a complete
2136                     # (with audio and video) format. So, for such cases we will
2137                     # ignore references to rendition groups and treat them
2138                     # as complete formats.
2139                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2140                         audio_group = groups.get(audio_group_id)
2141                         if audio_group and audio_group[0].get('URI'):
2142                             # TODO: update acodec for audio only formats with
2143                             # the same GROUP-ID
2144                             f['acodec'] = 'none'
2145                     if not f.get('ext'):
2146                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2147                     formats.append(f)
2148
2149                     # for DailyMotion
2150                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2151                     if progressive_uri:
2152                         http_f = f.copy()
2153                         del http_f['manifest_url']
2154                         http_f.update({
2155                             'format_id': f['format_id'].replace('hls-', 'http-'),
2156                             'protocol': 'http',
2157                             'url': progressive_uri,
2158                         })
2159                         formats.append(http_f)
2160
2161                 last_stream_inf = {}
2162         return formats, subtitles
2163
2164     def _extract_m3u8_vod_duration(
2165             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2166
2167         m3u8_vod = self._download_webpage(
2168             m3u8_vod_url, video_id,
2169             note='Downloading m3u8 VOD manifest' if note is None else note,
2170             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2171             fatal=False, data=data, headers=headers, query=query)
2172
2173         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2174
2175     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2176         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2177             return None
2178
2179         return int(sum(
2180             float(line[len('#EXTINF:'):].split(',')[0])
2181             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2182
2183     @staticmethod
2184     def _xpath_ns(path, namespace=None):
2185         if not namespace:
2186             return path
2187         out = []
2188         for c in path.split('/'):
2189             if not c or c == '.':
2190                 out.append(c)
2191             else:
2192                 out.append('{%s}%s' % (namespace, c))
2193         return '/'.join(out)
2194
2195     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2196         if self.get_param('ignore_no_formats_error'):
2197             fatal = False
2198
2199         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2200         if res is False:
2201             assert not fatal
2202             return [], {}
2203
2204         smil, urlh = res
2205         smil_url = urlh.geturl()
2206
2207         namespace = self._parse_smil_namespace(smil)
2208
2209         fmts = self._parse_smil_formats(
2210             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2211         subs = self._parse_smil_subtitles(
2212             smil, namespace=namespace)
2213
2214         return fmts, subs
2215
2216     def _extract_smil_formats(self, *args, **kwargs):
2217         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2218         if subs:
2219             self._report_ignoring_subs('SMIL')
2220         return fmts
2221
2222     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2223         res = self._download_smil(smil_url, video_id, fatal=fatal)
2224         if res is False:
2225             return {}
2226
2227         smil, urlh = res
2228         smil_url = urlh.geturl()
2229
2230         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2231
2232     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2233         return self._download_xml_handle(
2234             smil_url, video_id, 'Downloading SMIL file',
2235             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2236
2237     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2238         namespace = self._parse_smil_namespace(smil)
2239
2240         formats = self._parse_smil_formats(
2241             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2242         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2243
2244         video_id = os.path.splitext(url_basename(smil_url))[0]
2245         title = None
2246         description = None
2247         upload_date = None
2248         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2249             name = meta.attrib.get('name')
2250             content = meta.attrib.get('content')
2251             if not name or not content:
2252                 continue
2253             if not title and name == 'title':
2254                 title = content
2255             elif not description and name in ('description', 'abstract'):
2256                 description = content
2257             elif not upload_date and name == 'date':
2258                 upload_date = unified_strdate(content)
2259
2260         thumbnails = [{
2261             'id': image.get('type'),
2262             'url': image.get('src'),
2263             'width': int_or_none(image.get('width')),
2264             'height': int_or_none(image.get('height')),
2265         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2266
2267         return {
2268             'id': video_id,
2269             'title': title or video_id,
2270             'description': description,
2271             'upload_date': upload_date,
2272             'thumbnails': thumbnails,
2273             'formats': formats,
2274             'subtitles': subtitles,
2275         }
2276
2277     def _parse_smil_namespace(self, smil):
2278         return self._search_regex(
2279             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2280
2281     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2282         base = smil_url
2283         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2284             b = meta.get('base') or meta.get('httpBase')
2285             if b:
2286                 base = b
2287                 break
2288
2289         formats = []
2290         rtmp_count = 0
2291         http_count = 0
2292         m3u8_count = 0
2293         imgs_count = 0
2294
2295         srcs = set()
2296         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2297         for medium in media:
2298             src = medium.get('src')
2299             if not src or src in srcs:
2300                 continue
2301             srcs.add(src)
2302
2303             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2304             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2305             width = int_or_none(medium.get('width'))
2306             height = int_or_none(medium.get('height'))
2307             proto = medium.get('proto')
2308             ext = medium.get('ext')
2309             src_ext = determine_ext(src)
2310             streamer = medium.get('streamer') or base
2311
2312             if proto == 'rtmp' or streamer.startswith('rtmp'):
2313                 rtmp_count += 1
2314                 formats.append({
2315                     'url': streamer,
2316                     'play_path': src,
2317                     'ext': 'flv',
2318                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2319                     'tbr': bitrate,
2320                     'filesize': filesize,
2321                     'width': width,
2322                     'height': height,
2323                 })
2324                 if transform_rtmp_url:
2325                     streamer, src = transform_rtmp_url(streamer, src)
2326                     formats[-1].update({
2327                         'url': streamer,
2328                         'play_path': src,
2329                     })
2330                 continue
2331
2332             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2333             src_url = src_url.strip()
2334
2335             if proto == 'm3u8' or src_ext == 'm3u8':
2336                 m3u8_formats = self._extract_m3u8_formats(
2337                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2338                 if len(m3u8_formats) == 1:
2339                     m3u8_count += 1
2340                     m3u8_formats[0].update({
2341                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2342                         'tbr': bitrate,
2343                         'width': width,
2344                         'height': height,
2345                     })
2346                 formats.extend(m3u8_formats)
2347             elif src_ext == 'f4m':
2348                 f4m_url = src_url
2349                 if not f4m_params:
2350                     f4m_params = {
2351                         'hdcore': '3.2.0',
2352                         'plugin': 'flowplayer-3.2.0.1',
2353                     }
2354                 f4m_url += '&' if '?' in f4m_url else '?'
2355                 f4m_url += urllib.parse.urlencode(f4m_params)
2356                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2357             elif src_ext == 'mpd':
2358                 formats.extend(self._extract_mpd_formats(
2359                     src_url, video_id, mpd_id='dash', fatal=False))
2360             elif re.search(r'\.ism/[Mm]anifest', src_url):
2361                 formats.extend(self._extract_ism_formats(
2362                     src_url, video_id, ism_id='mss', fatal=False))
2363             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2364                 http_count += 1
2365                 formats.append({
2366                     'url': src_url,
2367                     'ext': ext or src_ext or 'flv',
2368                     'format_id': 'http-%d' % (bitrate or http_count),
2369                     'tbr': bitrate,
2370                     'filesize': filesize,
2371                     'width': width,
2372                     'height': height,
2373                 })
2374
2375         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2376             src = medium.get('src')
2377             if not src or src in srcs:
2378                 continue
2379             srcs.add(src)
2380
2381             imgs_count += 1
2382             formats.append({
2383                 'format_id': 'imagestream-%d' % (imgs_count),
2384                 'url': src,
2385                 'ext': mimetype2ext(medium.get('type')),
2386                 'acodec': 'none',
2387                 'vcodec': 'none',
2388                 'width': int_or_none(medium.get('width')),
2389                 'height': int_or_none(medium.get('height')),
2390                 'format_note': 'SMIL storyboards',
2391             })
2392
2393         return formats
2394
2395     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2396         urls = []
2397         subtitles = {}
2398         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2399             src = textstream.get('src')
2400             if not src or src in urls:
2401                 continue
2402             urls.append(src)
2403             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2404             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2405             subtitles.setdefault(lang, []).append({
2406                 'url': src,
2407                 'ext': ext,
2408             })
2409         return subtitles
2410
2411     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2412         res = self._download_xml_handle(
2413             xspf_url, playlist_id, 'Downloading xpsf playlist',
2414             'Unable to download xspf manifest', fatal=fatal)
2415         if res is False:
2416             return []
2417
2418         xspf, urlh = res
2419         xspf_url = urlh.geturl()
2420
2421         return self._parse_xspf(
2422             xspf, playlist_id, xspf_url=xspf_url,
2423             xspf_base_url=base_url(xspf_url))
2424
2425     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2426         NS_MAP = {
2427             'xspf': 'http://xspf.org/ns/0/',
2428             's1': 'http://static.streamone.nl/player/ns/0',
2429         }
2430
2431         entries = []
2432         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2433             title = xpath_text(
2434                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2435             description = xpath_text(
2436                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2437             thumbnail = xpath_text(
2438                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2439             duration = float_or_none(
2440                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2441
2442             formats = []
2443             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2444                 format_url = urljoin(xspf_base_url, location.text)
2445                 if not format_url:
2446                     continue
2447                 formats.append({
2448                     'url': format_url,
2449                     'manifest_url': xspf_url,
2450                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2451                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2452                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2453                 })
2454
2455             entries.append({
2456                 'id': playlist_id,
2457                 'title': title,
2458                 'description': description,
2459                 'thumbnail': thumbnail,
2460                 'duration': duration,
2461                 'formats': formats,
2462             })
2463         return entries
2464
2465     def _extract_mpd_formats(self, *args, **kwargs):
2466         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2467         if subs:
2468             self._report_ignoring_subs('DASH')
2469         return fmts
2470
2471     def _extract_mpd_formats_and_subtitles(
2472             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2473             fatal=True, data=None, headers={}, query={}):
2474
2475         if self.get_param('ignore_no_formats_error'):
2476             fatal = False
2477
2478         res = self._download_xml_handle(
2479             mpd_url, video_id,
2480             note='Downloading MPD manifest' if note is None else note,
2481             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2482             fatal=fatal, data=data, headers=headers, query=query)
2483         if res is False:
2484             return [], {}
2485         mpd_doc, urlh = res
2486         if mpd_doc is None:
2487             return [], {}
2488
2489         # We could have been redirected to a new url when we retrieved our mpd file.
2490         mpd_url = urlh.geturl()
2491         mpd_base_url = base_url(mpd_url)
2492
2493         return self._parse_mpd_formats_and_subtitles(
2494             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2495
2496     def _parse_mpd_formats(self, *args, **kwargs):
2497         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2498         if subs:
2499             self._report_ignoring_subs('DASH')
2500         return fmts
2501
2502     def _parse_mpd_formats_and_subtitles(
2503             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2504         """
2505         Parse formats from MPD manifest.
2506         References:
2507          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2508             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2509          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2510         """
2511         if not self.get_param('dynamic_mpd', True):
2512             if mpd_doc.get('type') == 'dynamic':
2513                 return [], {}
2514
2515         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2516
2517         def _add_ns(path):
2518             return self._xpath_ns(path, namespace)
2519
2520         def is_drm_protected(element):
2521             return element.find(_add_ns('ContentProtection')) is not None
2522
2523         def extract_multisegment_info(element, ms_parent_info):
2524             ms_info = ms_parent_info.copy()
2525
2526             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2527             # common attributes and elements.  We will only extract relevant
2528             # for us.
2529             def extract_common(source):
2530                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2531                 if segment_timeline is not None:
2532                     s_e = segment_timeline.findall(_add_ns('S'))
2533                     if s_e:
2534                         ms_info['total_number'] = 0
2535                         ms_info['s'] = []
2536                         for s in s_e:
2537                             r = int(s.get('r', 0))
2538                             ms_info['total_number'] += 1 + r
2539                             ms_info['s'].append({
2540                                 't': int(s.get('t', 0)),
2541                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2542                                 'd': int(s.attrib['d']),
2543                                 'r': r,
2544                             })
2545                 start_number = source.get('startNumber')
2546                 if start_number:
2547                     ms_info['start_number'] = int(start_number)
2548                 timescale = source.get('timescale')
2549                 if timescale:
2550                     ms_info['timescale'] = int(timescale)
2551                 segment_duration = source.get('duration')
2552                 if segment_duration:
2553                     ms_info['segment_duration'] = float(segment_duration)
2554
2555             def extract_Initialization(source):
2556                 initialization = source.find(_add_ns('Initialization'))
2557                 if initialization is not None:
2558                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2559
2560             segment_list = element.find(_add_ns('SegmentList'))
2561             if segment_list is not None:
2562                 extract_common(segment_list)
2563                 extract_Initialization(segment_list)
2564                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2565                 if segment_urls_e:
2566                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2567             else:
2568                 segment_template = element.find(_add_ns('SegmentTemplate'))
2569                 if segment_template is not None:
2570                     extract_common(segment_template)
2571                     media = segment_template.get('media')
2572                     if media:
2573                         ms_info['media'] = media
2574                     initialization = segment_template.get('initialization')
2575                     if initialization:
2576                         ms_info['initialization'] = initialization
2577                     else:
2578                         extract_Initialization(segment_template)
2579             return ms_info
2580
2581         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2582         formats, subtitles = [], {}
2583         stream_numbers = collections.defaultdict(int)
2584         for period in mpd_doc.findall(_add_ns('Period')):
2585             period_duration = parse_duration(period.get('duration')) or mpd_duration
2586             period_ms_info = extract_multisegment_info(period, {
2587                 'start_number': 1,
2588                 'timescale': 1,
2589             })
2590             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2591                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2592                 for representation in adaptation_set.findall(_add_ns('Representation')):
2593                     representation_attrib = adaptation_set.attrib.copy()
2594                     representation_attrib.update(representation.attrib)
2595                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2596                     mime_type = representation_attrib['mimeType']
2597                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2598
2599                     codec_str = representation_attrib.get('codecs', '')
2600                     # Some kind of binary subtitle found in some youtube livestreams
2601                     if mime_type == 'application/x-rawcc':
2602                         codecs = {'scodec': codec_str}
2603                     else:
2604                         codecs = parse_codecs(codec_str)
2605                     if content_type not in ('video', 'audio', 'text'):
2606                         if mime_type == 'image/jpeg':
2607                             content_type = mime_type
2608                         elif codecs.get('vcodec', 'none') != 'none':
2609                             content_type = 'video'
2610                         elif codecs.get('acodec', 'none') != 'none':
2611                             content_type = 'audio'
2612                         elif codecs.get('scodec', 'none') != 'none':
2613                             content_type = 'text'
2614                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2615                             content_type = 'text'
2616                         else:
2617                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2618                             continue
2619
2620                     base_url = ''
2621                     for element in (representation, adaptation_set, period, mpd_doc):
2622                         base_url_e = element.find(_add_ns('BaseURL'))
2623                         if try_call(lambda: base_url_e.text) is not None:
2624                             base_url = base_url_e.text + base_url
2625                             if re.match(r'^https?://', base_url):
2626                                 break
2627                     if mpd_base_url and base_url.startswith('/'):
2628                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2629                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2630                         if not mpd_base_url.endswith('/'):
2631                             mpd_base_url += '/'
2632                         base_url = mpd_base_url + base_url
2633                     representation_id = representation_attrib.get('id')
2634                     lang = representation_attrib.get('lang')
2635                     url_el = representation.find(_add_ns('BaseURL'))
2636                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2637                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2638                     if representation_id is not None:
2639                         format_id = representation_id
2640                     else:
2641                         format_id = content_type
2642                     if mpd_id:
2643                         format_id = mpd_id + '-' + format_id
2644                     if content_type in ('video', 'audio'):
2645                         f = {
2646                             'format_id': format_id,
2647                             'manifest_url': mpd_url,
2648                             'ext': mimetype2ext(mime_type),
2649                             'width': int_or_none(representation_attrib.get('width')),
2650                             'height': int_or_none(representation_attrib.get('height')),
2651                             'tbr': float_or_none(bandwidth, 1000),
2652                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2653                             'fps': int_or_none(representation_attrib.get('frameRate')),
2654                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2655                             'format_note': 'DASH %s' % content_type,
2656                             'filesize': filesize,
2657                             'container': mimetype2ext(mime_type) + '_dash',
2658                             **codecs
2659                         }
2660                     elif content_type == 'text':
2661                         f = {
2662                             'ext': mimetype2ext(mime_type),
2663                             'manifest_url': mpd_url,
2664                             'filesize': filesize,
2665                         }
2666                     elif content_type == 'image/jpeg':
2667                         # See test case in VikiIE
2668                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2669                         f = {
2670                             'format_id': format_id,
2671                             'ext': 'mhtml',
2672                             'manifest_url': mpd_url,
2673                             'format_note': 'DASH storyboards (jpeg)',
2674                             'acodec': 'none',
2675                             'vcodec': 'none',
2676                         }
2677                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2678                         f['has_drm'] = True
2679                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2680
2681                     def prepare_template(template_name, identifiers):
2682                         tmpl = representation_ms_info[template_name]
2683                         if representation_id is not None:
2684                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2685                         # First of, % characters outside $...$ templates
2686                         # must be escaped by doubling for proper processing
2687                         # by % operator string formatting used further (see
2688                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2689                         t = ''
2690                         in_template = False
2691                         for c in tmpl:
2692                             t += c
2693                             if c == '$':
2694                                 in_template = not in_template
2695                             elif c == '%' and not in_template:
2696                                 t += c
2697                         # Next, $...$ templates are translated to their
2698                         # %(...) counterparts to be used with % operator
2699                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2700                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2701                         t.replace('$$', '$')
2702                         return t
2703
2704                     # @initialization is a regular template like @media one
2705                     # so it should be handled just the same way (see
2706                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2707                     if 'initialization' in representation_ms_info:
2708                         initialization_template = prepare_template(
2709                             'initialization',
2710                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2711                             # $Time$ shall not be included for @initialization thus
2712                             # only $Bandwidth$ remains
2713                             ('Bandwidth', ))
2714                         representation_ms_info['initialization_url'] = initialization_template % {
2715                             'Bandwidth': bandwidth,
2716                         }
2717
2718                     def location_key(location):
2719                         return 'url' if re.match(r'^https?://', location) else 'path'
2720
2721                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2722
2723                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2724                         media_location_key = location_key(media_template)
2725
2726                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2727                         # can't be used at the same time
2728                         if '%(Number' in media_template and 's' not in representation_ms_info:
2729                             segment_duration = None
2730                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2731                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2732                                 representation_ms_info['total_number'] = int(math.ceil(
2733                                     float_or_none(period_duration, segment_duration, default=0)))
2734                             representation_ms_info['fragments'] = [{
2735                                 media_location_key: media_template % {
2736                                     'Number': segment_number,
2737                                     'Bandwidth': bandwidth,
2738                                 },
2739                                 'duration': segment_duration,
2740                             } for segment_number in range(
2741                                 representation_ms_info['start_number'],
2742                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2743                         else:
2744                             # $Number*$ or $Time$ in media template with S list available
2745                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2746                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2747                             representation_ms_info['fragments'] = []
2748                             segment_time = 0
2749                             segment_d = None
2750                             segment_number = representation_ms_info['start_number']
2751
2752                             def add_segment_url():
2753                                 segment_url = media_template % {
2754                                     'Time': segment_time,
2755                                     'Bandwidth': bandwidth,
2756                                     'Number': segment_number,
2757                                 }
2758                                 representation_ms_info['fragments'].append({
2759                                     media_location_key: segment_url,
2760                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2761                                 })
2762
2763                             for num, s in enumerate(representation_ms_info['s']):
2764                                 segment_time = s.get('t') or segment_time
2765                                 segment_d = s['d']
2766                                 add_segment_url()
2767                                 segment_number += 1
2768                                 for r in range(s.get('r', 0)):
2769                                     segment_time += segment_d
2770                                     add_segment_url()
2771                                     segment_number += 1
2772                                 segment_time += segment_d
2773                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2774                         # No media template,
2775                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2776                         # or any YouTube dashsegments video
2777                         fragments = []
2778                         segment_index = 0
2779                         timescale = representation_ms_info['timescale']
2780                         for s in representation_ms_info['s']:
2781                             duration = float_or_none(s['d'], timescale)
2782                             for r in range(s.get('r', 0) + 1):
2783                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2784                                 fragments.append({
2785                                     location_key(segment_uri): segment_uri,
2786                                     'duration': duration,
2787                                 })
2788                                 segment_index += 1
2789                         representation_ms_info['fragments'] = fragments
2790                     elif 'segment_urls' in representation_ms_info:
2791                         # Segment URLs with no SegmentTimeline
2792                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2793                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2794                         fragments = []
2795                         segment_duration = float_or_none(
2796                             representation_ms_info['segment_duration'],
2797                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2798                         for segment_url in representation_ms_info['segment_urls']:
2799                             fragment = {
2800                                 location_key(segment_url): segment_url,
2801                             }
2802                             if segment_duration:
2803                                 fragment['duration'] = segment_duration
2804                             fragments.append(fragment)
2805                         representation_ms_info['fragments'] = fragments
2806                     # If there is a fragments key available then we correctly recognized fragmented media.
2807                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2808                     # assumption is not necessarily correct since we may simply have no support for
2809                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2810                     if 'fragments' in representation_ms_info:
2811                         f.update({
2812                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2813                             'url': mpd_url or base_url,
2814                             'fragment_base_url': base_url,
2815                             'fragments': [],
2816                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2817                         })
2818                         if 'initialization_url' in representation_ms_info:
2819                             initialization_url = representation_ms_info['initialization_url']
2820                             if not f.get('url'):
2821                                 f['url'] = initialization_url
2822                             f['fragments'].append({location_key(initialization_url): initialization_url})
2823                         f['fragments'].extend(representation_ms_info['fragments'])
2824                         if not period_duration:
2825                             period_duration = try_get(
2826                                 representation_ms_info,
2827                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2828                     else:
2829                         # Assuming direct URL to unfragmented media.
2830                         f['url'] = base_url
2831                     if content_type in ('video', 'audio', 'image/jpeg'):
2832                         f['manifest_stream_number'] = stream_numbers[f['url']]
2833                         stream_numbers[f['url']] += 1
2834                         formats.append(f)
2835                     elif content_type == 'text':
2836                         subtitles.setdefault(lang or 'und', []).append(f)
2837
2838         return formats, subtitles
2839
2840     def _extract_ism_formats(self, *args, **kwargs):
2841         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2842         if subs:
2843             self._report_ignoring_subs('ISM')
2844         return fmts
2845
2846     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2847         if self.get_param('ignore_no_formats_error'):
2848             fatal = False
2849
2850         res = self._download_xml_handle(
2851             ism_url, video_id,
2852             note='Downloading ISM manifest' if note is None else note,
2853             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2854             fatal=fatal, data=data, headers=headers, query=query)
2855         if res is False:
2856             return [], {}
2857         ism_doc, urlh = res
2858         if ism_doc is None:
2859             return [], {}
2860
2861         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2862
2863     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2864         """
2865         Parse formats from ISM manifest.
2866         References:
2867          1. [MS-SSTR]: Smooth Streaming Protocol,
2868             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2869         """
2870         if ism_doc.get('IsLive') == 'TRUE':
2871             return [], {}
2872
2873         duration = int(ism_doc.attrib['Duration'])
2874         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2875
2876         formats = []
2877         subtitles = {}
2878         for stream in ism_doc.findall('StreamIndex'):
2879             stream_type = stream.get('Type')
2880             if stream_type not in ('video', 'audio', 'text'):
2881                 continue
2882             url_pattern = stream.attrib['Url']
2883             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2884             stream_name = stream.get('Name')
2885             stream_language = stream.get('Language', 'und')
2886             for track in stream.findall('QualityLevel'):
2887                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2888                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2889                 # TODO: add support for WVC1 and WMAP
2890                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2891                     self.report_warning('%s is not a supported codec' % fourcc)
2892                     continue
2893                 tbr = int(track.attrib['Bitrate']) // 1000
2894                 # [1] does not mention Width and Height attributes. However,
2895                 # they're often present while MaxWidth and MaxHeight are
2896                 # missing, so should be used as fallbacks
2897                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2898                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2899                 sampling_rate = int_or_none(track.get('SamplingRate'))
2900
2901                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2902                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2903
2904                 fragments = []
2905                 fragment_ctx = {
2906                     'time': 0,
2907                 }
2908                 stream_fragments = stream.findall('c')
2909                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2910                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2911                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2912                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2913                     if not fragment_ctx['duration']:
2914                         try:
2915                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2916                         except IndexError:
2917                             next_fragment_time = duration
2918                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2919                     for _ in range(fragment_repeat):
2920                         fragments.append({
2921                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2922                             'duration': fragment_ctx['duration'] / stream_timescale,
2923                         })
2924                         fragment_ctx['time'] += fragment_ctx['duration']
2925
2926                 if stream_type == 'text':
2927                     subtitles.setdefault(stream_language, []).append({
2928                         'ext': 'ismt',
2929                         'protocol': 'ism',
2930                         'url': ism_url,
2931                         'manifest_url': ism_url,
2932                         'fragments': fragments,
2933                         '_download_params': {
2934                             'stream_type': stream_type,
2935                             'duration': duration,
2936                             'timescale': stream_timescale,
2937                             'fourcc': fourcc,
2938                             'language': stream_language,
2939                             'codec_private_data': track.get('CodecPrivateData'),
2940                         }
2941                     })
2942                 elif stream_type in ('video', 'audio'):
2943                     formats.append({
2944                         'format_id': join_nonempty(ism_id, stream_name, tbr),
2945                         'url': ism_url,
2946                         'manifest_url': ism_url,
2947                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2948                         'width': width,
2949                         'height': height,
2950                         'tbr': tbr,
2951                         'asr': sampling_rate,
2952                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2953                         'acodec': 'none' if stream_type == 'video' else fourcc,
2954                         'protocol': 'ism',
2955                         'fragments': fragments,
2956                         'has_drm': ism_doc.find('Protection') is not None,
2957                         '_download_params': {
2958                             'stream_type': stream_type,
2959                             'duration': duration,
2960                             'timescale': stream_timescale,
2961                             'width': width or 0,
2962                             'height': height or 0,
2963                             'fourcc': fourcc,
2964                             'language': stream_language,
2965                             'codec_private_data': track.get('CodecPrivateData'),
2966                             'sampling_rate': sampling_rate,
2967                             'channels': int_or_none(track.get('Channels', 2)),
2968                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2969                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2970                         },
2971                     })
2972         return formats, subtitles
2973
2974     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
2975         def absolute_url(item_url):
2976             return urljoin(base_url, item_url)
2977
2978         def parse_content_type(content_type):
2979             if not content_type:
2980                 return {}
2981             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2982             if ctr:
2983                 mimetype, codecs = ctr.groups()
2984                 f = parse_codecs(codecs)
2985                 f['ext'] = mimetype2ext(mimetype)
2986                 return f
2987             return {}
2988
2989         def _media_formats(src, cur_media_type, type_info=None):
2990             type_info = type_info or {}
2991             full_url = absolute_url(src)
2992             ext = type_info.get('ext') or determine_ext(full_url)
2993             if ext == 'm3u8':
2994                 is_plain_url = False
2995                 formats = self._extract_m3u8_formats(
2996                     full_url, video_id, ext='mp4',
2997                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2998                     preference=preference, quality=quality, fatal=False)
2999             elif ext == 'mpd':
3000                 is_plain_url = False
3001                 formats = self._extract_mpd_formats(
3002                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3003             else:
3004                 is_plain_url = True
3005                 formats = [{
3006                     'url': full_url,
3007                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3008                     'ext': ext,
3009                 }]
3010             return is_plain_url, formats
3011
3012         entries = []
3013         # amp-video and amp-audio are very similar to their HTML5 counterparts
3014         # so we will include them right here (see
3015         # https://www.ampproject.org/docs/reference/components/amp-video)
3016         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3017         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3018         media_tags = [(media_tag, media_tag_name, media_type, '')
3019                       for media_tag, media_tag_name, media_type
3020                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3021         media_tags.extend(re.findall(
3022             # We only allow video|audio followed by a whitespace or '>'.
3023             # Allowing more characters may end up in significant slow down (see
3024             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3025             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3026             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3027         for media_tag, _, media_type, media_content in media_tags:
3028             media_info = {
3029                 'formats': [],
3030                 'subtitles': {},
3031             }
3032             media_attributes = extract_attributes(media_tag)
3033             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3034             if src:
3035                 f = parse_content_type(media_attributes.get('type'))
3036                 _, formats = _media_formats(src, media_type, f)
3037                 media_info['formats'].extend(formats)
3038             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3039             if media_content:
3040                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3041                     s_attr = extract_attributes(source_tag)
3042                     # data-video-src and data-src are non standard but seen
3043                     # several times in the wild
3044                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3045                     if not src:
3046                         continue
3047                     f = parse_content_type(s_attr.get('type'))
3048                     is_plain_url, formats = _media_formats(src, media_type, f)
3049                     if is_plain_url:
3050                         # width, height, res, label and title attributes are
3051                         # all not standard but seen several times in the wild
3052                         labels = [
3053                             s_attr.get(lbl)
3054                             for lbl in ('label', 'title')
3055                             if str_or_none(s_attr.get(lbl))
3056                         ]
3057                         width = int_or_none(s_attr.get('width'))
3058                         height = (int_or_none(s_attr.get('height'))
3059                                   or int_or_none(s_attr.get('res')))
3060                         if not width or not height:
3061                             for lbl in labels:
3062                                 resolution = parse_resolution(lbl)
3063                                 if not resolution:
3064                                     continue
3065                                 width = width or resolution.get('width')
3066                                 height = height or resolution.get('height')
3067                         for lbl in labels:
3068                             tbr = parse_bitrate(lbl)
3069                             if tbr:
3070                                 break
3071                         else:
3072                             tbr = None
3073                         f.update({
3074                             'width': width,
3075                             'height': height,
3076                             'tbr': tbr,
3077                             'format_id': s_attr.get('label') or s_attr.get('title'),
3078                         })
3079                         f.update(formats[0])
3080                         media_info['formats'].append(f)
3081                     else:
3082                         media_info['formats'].extend(formats)
3083                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3084                     track_attributes = extract_attributes(track_tag)
3085                     kind = track_attributes.get('kind')
3086                     if not kind or kind in ('subtitles', 'captions'):
3087                         src = strip_or_none(track_attributes.get('src'))
3088                         if not src:
3089                             continue
3090                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3091                         media_info['subtitles'].setdefault(lang, []).append({
3092                             'url': absolute_url(src),
3093                         })
3094             for f in media_info['formats']:
3095                 f.setdefault('http_headers', {})['Referer'] = base_url
3096             if media_info['formats'] or media_info['subtitles']:
3097                 entries.append(media_info)
3098         return entries
3099
3100     def _extract_akamai_formats(self, *args, **kwargs):
3101         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3102         if subs:
3103             self._report_ignoring_subs('akamai')
3104         return fmts
3105
3106     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3107         signed = 'hdnea=' in manifest_url
3108         if not signed:
3109             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3110             manifest_url = re.sub(
3111                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3112                 '', manifest_url).strip('?')
3113
3114         formats = []
3115         subtitles = {}
3116
3117         hdcore_sign = 'hdcore=3.7.0'
3118         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3119         hds_host = hosts.get('hds')
3120         if hds_host:
3121             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3122         if 'hdcore=' not in f4m_url:
3123             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3124         f4m_formats = self._extract_f4m_formats(
3125             f4m_url, video_id, f4m_id='hds', fatal=False)
3126         for entry in f4m_formats:
3127             entry.update({'extra_param_to_segment_url': hdcore_sign})
3128         formats.extend(f4m_formats)
3129
3130         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3131         hls_host = hosts.get('hls')
3132         if hls_host:
3133             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3134         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3135             m3u8_url, video_id, 'mp4', 'm3u8_native',
3136             m3u8_id='hls', fatal=False)
3137         formats.extend(m3u8_formats)
3138         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3139
3140         http_host = hosts.get('http')
3141         if http_host and m3u8_formats and not signed:
3142             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3143             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3144             qualities_length = len(qualities)
3145             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3146                 i = 0
3147                 for f in m3u8_formats:
3148                     if f['vcodec'] != 'none':
3149                         for protocol in ('http', 'https'):
3150                             http_f = f.copy()
3151                             del http_f['manifest_url']
3152                             http_url = re.sub(
3153                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3154                             http_f.update({
3155                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3156                                 'url': http_url,
3157                                 'protocol': protocol,
3158                             })
3159                             formats.append(http_f)
3160                         i += 1
3161
3162         return formats, subtitles
3163
3164     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3165         query = urllib.parse.urlparse(url).query
3166         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3167         mobj = re.search(
3168             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3169         url_base = mobj.group('url')
3170         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3171         formats = []
3172
3173         def manifest_url(manifest):
3174             m_url = f'{http_base_url}/{manifest}'
3175             if query:
3176                 m_url += '?%s' % query
3177             return m_url
3178
3179         if 'm3u8' not in skip_protocols:
3180             formats.extend(self._extract_m3u8_formats(
3181                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3182                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3183         if 'f4m' not in skip_protocols:
3184             formats.extend(self._extract_f4m_formats(
3185                 manifest_url('manifest.f4m'),
3186                 video_id, f4m_id='hds', fatal=False))
3187         if 'dash' not in skip_protocols:
3188             formats.extend(self._extract_mpd_formats(
3189                 manifest_url('manifest.mpd'),
3190                 video_id, mpd_id='dash', fatal=False))
3191         if re.search(r'(?:/smil:|\.smil)', url_base):
3192             if 'smil' not in skip_protocols:
3193                 rtmp_formats = self._extract_smil_formats(
3194                     manifest_url('jwplayer.smil'),
3195                     video_id, fatal=False)
3196                 for rtmp_format in rtmp_formats:
3197                     rtsp_format = rtmp_format.copy()
3198                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3199                     del rtsp_format['play_path']
3200                     del rtsp_format['ext']
3201                     rtsp_format.update({
3202                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3203                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3204                         'protocol': 'rtsp',
3205                     })
3206                     formats.extend([rtmp_format, rtsp_format])
3207         else:
3208             for protocol in ('rtmp', 'rtsp'):
3209                 if protocol not in skip_protocols:
3210                     formats.append({
3211                         'url': f'{protocol}:{url_base}',
3212                         'format_id': protocol,
3213                         'protocol': protocol,
3214                     })
3215         return formats
3216
3217     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3218         mobj = re.search(
3219             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3220             webpage)
3221         if mobj:
3222             try:
3223                 jwplayer_data = self._parse_json(mobj.group('options'),
3224                                                  video_id=video_id,
3225                                                  transform_source=transform_source)
3226             except ExtractorError:
3227                 pass
3228             else:
3229                 if isinstance(jwplayer_data, dict):
3230                     return jwplayer_data
3231
3232     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3233         jwplayer_data = self._find_jwplayer_data(
3234             webpage, video_id, transform_source=js_to_json)
3235         return self._parse_jwplayer_data(
3236             jwplayer_data, video_id, *args, **kwargs)
3237
3238     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3239                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3240         # JWPlayer backward compatibility: flattened playlists
3241         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3242         if 'playlist' not in jwplayer_data:
3243             jwplayer_data = {'playlist': [jwplayer_data]}
3244
3245         entries = []
3246
3247         # JWPlayer backward compatibility: single playlist item
3248         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3249         if not isinstance(jwplayer_data['playlist'], list):
3250             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3251
3252         for video_data in jwplayer_data['playlist']:
3253             # JWPlayer backward compatibility: flattened sources
3254             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3255             if 'sources' not in video_data:
3256                 video_data['sources'] = [video_data]
3257
3258             this_video_id = video_id or video_data['mediaid']
3259
3260             formats = self._parse_jwplayer_formats(
3261                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3262                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3263
3264             subtitles = {}
3265             tracks = video_data.get('tracks')
3266             if tracks and isinstance(tracks, list):
3267                 for track in tracks:
3268                     if not isinstance(track, dict):
3269                         continue
3270                     track_kind = track.get('kind')
3271                     if not track_kind or not isinstance(track_kind, str):
3272                         continue
3273                     if track_kind.lower() not in ('captions', 'subtitles'):
3274                         continue
3275                     track_url = urljoin(base_url, track.get('file'))
3276                     if not track_url:
3277                         continue
3278                     subtitles.setdefault(track.get('label') or 'en', []).append({
3279                         'url': self._proto_relative_url(track_url)
3280                     })
3281
3282             entry = {
3283                 'id': this_video_id,
3284                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3285                 'description': clean_html(video_data.get('description')),
3286                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3287                 'timestamp': int_or_none(video_data.get('pubdate')),
3288                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3289                 'subtitles': subtitles,
3290             }
3291             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3292             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3293                 entry.update({
3294                     '_type': 'url_transparent',
3295                     'url': formats[0]['url'],
3296                 })
3297             else:
3298                 entry['formats'] = formats
3299             entries.append(entry)
3300         if len(entries) == 1:
3301             return entries[0]
3302         else:
3303             return self.playlist_result(entries)
3304
3305     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3306                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3307         urls = []
3308         formats = []
3309         for source in jwplayer_sources_data:
3310             if not isinstance(source, dict):
3311                 continue
3312             source_url = urljoin(
3313                 base_url, self._proto_relative_url(source.get('file')))
3314             if not source_url or source_url in urls:
3315                 continue
3316             urls.append(source_url)
3317             source_type = source.get('type') or ''
3318             ext = mimetype2ext(source_type) or determine_ext(source_url)
3319             if source_type == 'hls' or ext == 'm3u8':
3320                 formats.extend(self._extract_m3u8_formats(
3321                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3322                     m3u8_id=m3u8_id, fatal=False))
3323             elif source_type == 'dash' or ext == 'mpd':
3324                 formats.extend(self._extract_mpd_formats(
3325                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3326             elif ext == 'smil':
3327                 formats.extend(self._extract_smil_formats(
3328                     source_url, video_id, fatal=False))
3329             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3330             elif source_type.startswith('audio') or ext in (
3331                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3332                 formats.append({
3333                     'url': source_url,
3334                     'vcodec': 'none',
3335                     'ext': ext,
3336                 })
3337             else:
3338                 height = int_or_none(source.get('height'))
3339                 if height is None:
3340                     # Often no height is provided but there is a label in
3341                     # format like "1080p", "720p SD", or 1080.
3342                     height = int_or_none(self._search_regex(
3343                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3344                         'height', default=None))
3345                 a_format = {
3346                     'url': source_url,
3347                     'width': int_or_none(source.get('width')),
3348                     'height': height,
3349                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3350                     'filesize': int_or_none(source.get('filesize')),
3351                     'ext': ext,
3352                 }
3353                 if source_url.startswith('rtmp'):
3354                     a_format['ext'] = 'flv'
3355                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3356                     # of jwplayer.flash.swf
3357                     rtmp_url_parts = re.split(
3358                         r'((?:mp4|mp3|flv):)', source_url, 1)
3359                     if len(rtmp_url_parts) == 3:
3360                         rtmp_url, prefix, play_path = rtmp_url_parts
3361                         a_format.update({
3362                             'url': rtmp_url,
3363                             'play_path': prefix + play_path,
3364                         })
3365                     if rtmp_params:
3366                         a_format.update(rtmp_params)
3367                 formats.append(a_format)
3368         return formats
3369
3370     def _live_title(self, name):
3371         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3372         return name
3373
3374     def _int(self, v, name, fatal=False, **kwargs):
3375         res = int_or_none(v, **kwargs)
3376         if res is None:
3377             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3378             if fatal:
3379                 raise ExtractorError(msg)
3380             else:
3381                 self.report_warning(msg)
3382         return res
3383
3384     def _float(self, v, name, fatal=False, **kwargs):
3385         res = float_or_none(v, **kwargs)
3386         if res is None:
3387             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3388             if fatal:
3389                 raise ExtractorError(msg)
3390             else:
3391                 self.report_warning(msg)
3392         return res
3393
3394     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3395                     path='/', secure=False, discard=False, rest={}, **kwargs):
3396         cookie = http.cookiejar.Cookie(
3397             0, name, value, port, port is not None, domain, True,
3398             domain.startswith('.'), path, True, secure, expire_time,
3399             discard, None, None, rest)
3400         self.cookiejar.set_cookie(cookie)
3401
3402     def _get_cookies(self, url):
3403         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3404         return LenientSimpleCookie(self._downloader._calc_cookies(url))
3405
3406     def _apply_first_set_cookie_header(self, url_handle, cookie):
3407         """
3408         Apply first Set-Cookie header instead of the last. Experimental.
3409
3410         Some sites (e.g. [1-3]) may serve two cookies under the same name
3411         in Set-Cookie header and expect the first (old) one to be set rather
3412         than second (new). However, as of RFC6265 the newer one cookie
3413         should be set into cookie store what actually happens.
3414         We will workaround this issue by resetting the cookie to
3415         the first one manually.
3416         1. https://new.vk.com/
3417         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3418         3. https://learning.oreilly.com/
3419         """
3420         for header, cookies in url_handle.headers.items():
3421             if header.lower() != 'set-cookie':
3422                 continue
3423             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3424             cookie_value = re.search(
3425                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3426             if cookie_value:
3427                 value, domain = cookie_value.groups()
3428                 self._set_cookie(domain, cookie, value)
3429                 break
3430
3431     @classmethod
3432     def get_testcases(cls, include_onlymatching=False):
3433         # Do not look in super classes
3434         t = vars(cls).get('_TEST')
3435         if t:
3436             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3437             tests = [t]
3438         else:
3439             tests = vars(cls).get('_TESTS', [])
3440         for t in tests:
3441             if not include_onlymatching and t.get('only_matching', False):
3442                 continue
3443             t['name'] = cls.ie_key()
3444             yield t
3445         if getattr(cls, '__wrapped__', None):
3446             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3447
3448     @classmethod
3449     def get_webpage_testcases(cls):
3450         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3451         for t in tests:
3452             t['name'] = cls.ie_key()
3453             yield t
3454         if getattr(cls, '__wrapped__', None):
3455             yield from cls.__wrapped__.get_webpage_testcases()
3456
3457     @classproperty(cache=True)
3458     def age_limit(cls):
3459         """Get age limit from the testcases"""
3460         return max(traverse_obj(
3461             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3462             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3463
3464     @classproperty(cache=True)
3465     def _RETURN_TYPE(cls):
3466         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3467         tests = tuple(cls.get_testcases(include_onlymatching=False))
3468         if not tests:
3469             return None
3470         elif not any(k.startswith('playlist') for test in tests for k in test):
3471             return 'video'
3472         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3473             return 'playlist'
3474         return 'any'
3475
3476     @classmethod
3477     def is_single_video(cls, url):
3478         """Returns whether the URL is of a single video, None if unknown"""
3479         assert cls.suitable(url), 'The URL must be suitable for the extractor'
3480         return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3481
3482     @classmethod
3483     def is_suitable(cls, age_limit):
3484         """Test whether the extractor is generally suitable for the given age limit"""
3485         return not age_restricted(cls.age_limit, age_limit)
3486
3487     @classmethod
3488     def description(cls, *, markdown=True, search_examples=None):
3489         """Description of the extractor"""
3490         desc = ''
3491         if cls._NETRC_MACHINE:
3492             if markdown:
3493                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3494             else:
3495                 desc += f' [{cls._NETRC_MACHINE}]'
3496         if cls.IE_DESC is False:
3497             desc += ' [HIDDEN]'
3498         elif cls.IE_DESC:
3499             desc += f' {cls.IE_DESC}'
3500         if cls.SEARCH_KEY:
3501             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3502             if search_examples:
3503                 _COUNTS = ('', '5', '10', 'all')
3504                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3505         if not cls.working():
3506             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3507
3508         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3509         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3510         return f'{name}:{desc}' if desc else name
3511
3512     def extract_subtitles(self, *args, **kwargs):
3513         if (self.get_param('writesubtitles', False)
3514                 or self.get_param('listsubtitles')):
3515             return self._get_subtitles(*args, **kwargs)
3516         return {}
3517
3518     def _get_subtitles(self, *args, **kwargs):
3519         raise NotImplementedError('This method must be implemented by subclasses')
3520
    class CommentsDisabled(Exception):
        """Raise in _get_comments if comments are disabled for the video

        extract_comments handles this exception by reporting both "comments"
        and "comment_count" as None.
        """
3523
3524     def extract_comments(self, *args, **kwargs):
3525         if not self.get_param('getcomments'):
3526             return None
3527         generator = self._get_comments(*args, **kwargs)
3528
3529         def extractor():
3530             comments = []
3531             interrupted = True
3532             try:
3533                 while True:
3534                     comments.append(next(generator))
3535             except StopIteration:
3536                 interrupted = False
3537             except KeyboardInterrupt:
3538                 self.to_screen('Interrupted by user')
3539             except self.CommentsDisabled:
3540                 return {'comments': None, 'comment_count': None}
3541             except Exception as e:
3542                 if self.get_param('ignoreerrors') is not True:
3543                     raise
3544                 self._downloader.report_error(e)
3545             comment_count = len(comments)
3546             self.to_screen(f'Extracted {comment_count} comments')
3547             return {
3548                 'comments': comments,
3549                 'comment_count': None if interrupted else comment_count
3550             }
3551         return extractor
3552
3553     def _get_comments(self, *args, **kwargs):
3554         raise NotImplementedError('This method must be implemented by subclasses')
3555
3556     @staticmethod
3557     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3558         """ Merge subtitle items for one language. Items with duplicated URLs/data
3559         will be dropped. """
3560         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3561         ret = list(subtitle_list1)
3562         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3563         return ret
3564
3565     @classmethod
3566     def _merge_subtitles(cls, *dicts, target=None):
3567         """ Merge subtitle dictionaries, language by language. """
3568         if target is None:
3569             target = {}
3570         for d in dicts:
3571             for lang, subs in d.items():
3572                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3573         return target
3574
3575     def extract_automatic_captions(self, *args, **kwargs):
3576         if (self.get_param('writeautomaticsub', False)
3577                 or self.get_param('listsubtitles')):
3578             return self._get_automatic_captions(*args, **kwargs)
3579         return {}
3580
3581     def _get_automatic_captions(self, *args, **kwargs):
3582         raise NotImplementedError('This method must be implemented by subclasses')
3583
3584     @functools.cached_property
3585     def _cookies_passed(self):
3586         """Whether cookies have been passed to YoutubeDL"""
3587         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3588
3589     def mark_watched(self, *args, **kwargs):
3590         if not self.get_param('mark_watched', False):
3591             return
3592         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3593             self._mark_watched(*args, **kwargs)
3594
3595     def _mark_watched(self, *args, **kwargs):
3596         raise NotImplementedError('This method must be implemented by subclasses')
3597
3598     def geo_verification_headers(self):
3599         headers = {}
3600         geo_verification_proxy = self.get_param('geo_verification_proxy')
3601         if geo_verification_proxy:
3602             headers['Ytdl-request-proxy'] = geo_verification_proxy
3603         return headers
3604
3605     @staticmethod
3606     def _generic_id(url):
3607         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3608
3609     def _generic_title(self, url='', webpage='', *, default=None):
3610         return (self._og_search_title(webpage, default=None)
3611                 or self._html_extract_title(webpage, default=None)
3612                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3613                 or default)
3614
3615     @staticmethod
3616     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3617         all_known = all(map(
3618             lambda x: x is not None,
3619             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3620         return (
3621             'private' if is_private
3622             else 'premium_only' if needs_premium
3623             else 'subscriber_only' if needs_subscription
3624             else 'needs_auth' if needs_auth
3625             else 'unlisted' if is_unlisted
3626             else 'public' if all_known
3627             else None)
3628
3629     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3630         '''
3631         @returns            A list of values for the extractor argument given by "key"
3632                             or "default" if no such key is present
3633         @param default      The default value to return when the key is not present (default: [])
3634         @param casesense    When false, the values are converted to lower case
3635         '''
3636         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3637         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3638         if val is None:
3639             return [] if default is NO_DEFAULT else default
3640         return list(val) if casesense else [x.lower() for x in val]
3641
3642     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3643         if not playlist_id or not video_id:
3644             return not video_id
3645
3646         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3647         if no_playlist is not None:
3648             return not no_playlist
3649
3650         video_id = '' if video_id is True else f' {video_id}'
3651         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3652         if self.get_param('noplaylist'):
3653             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3654             return False
3655         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3656         return True
3657
3658     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3659         RetryManager.report_retry(
3660             err, _count or int(fatal), _retries,
3661             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3662             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3663
3664     def RetryManager(self, **kwargs):
3665         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3666
3667     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3668         display_id = traverse_obj(info_dict, 'display_id', 'id')
3669         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3670         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3671             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3672
3673     @classmethod
3674     def extract_from_webpage(cls, ydl, url, webpage):
3675         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3676               else ydl.get_info_extractor(cls.ie_key()))
3677         for info in ie._extract_from_webpage(url, webpage) or []:
3678             # url = None since we do not want to set (webpage/original)_url
3679             ydl.add_default_extra_info(info, ie, None)
3680             yield info
3681
3682     @classmethod
3683     def _extract_from_webpage(cls, url, webpage):
3684         for embed_url in orderedSet(
3685                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3686             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3687
3688     @classmethod
3689     def _extract_embed_urls(cls, url, webpage):
3690         """@returns all the embed urls on the webpage"""
3691         if '_EMBED_URL_RE' not in cls.__dict__:
3692             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3693             for idx, regex in enumerate(cls._EMBED_REGEX):
3694                 assert regex.count('(?P<url>') == 1, \
3695                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3696             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3697
3698         for regex in cls._EMBED_URL_RE:
3699             for mobj in regex.finditer(webpage):
3700                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3701                 if cls._VALID_URL is False or cls.suitable(embed_url):
3702                     yield embed_url
3703
    class StopExtraction(Exception):
        """
        Exception type made available as an attribute of the extractor class

        NOTE(review): presumably raised to abort an extraction early - confirm against callers
        """
        pass
3706
3707     @classmethod
3708     def _extract_url(cls, webpage):  # TODO: Remove
3709         """Only for compatibility with some older extractors"""
3710         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3711
3712     @classmethod
3713     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3714         if plugin_name:
3715             mro = inspect.getmro(cls)
3716             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3717             cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3718             cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3719             while getattr(super_class, '__wrapped__', None):
3720                 super_class = super_class.__wrapped__
3721             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3722             _PLUGIN_OVERRIDES[super_class].append(cls)
3723
3724         return super().__init_subclass__(**kwargs)
3725
3726
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    where an empty prefix requests a single result, "all" requests up to
    _MAX_RESULTS and a number requests that many results.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        """Pattern for search queries, built from the _SEARCH_KEY of the class"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Work out the number of results requested by the prefix and fetch them"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')

        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                # Cannot happen with the _VALID_URL defined here (numbers start at 1-9)
                raise ExtractorError(f'invalid download number {wanted} for query "{query}"')
            if wanted > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS

        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses.
        This implementation returns a playlist of the first n results of
        _search_results, with the query as both its ID and title """
        results = self._search_results(query)
        # islice needs None rather than infinity to mean "no limit"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for _SEARCH_KEY"""
        return cls._SEARCH_KEY
3770
3771
class UnsupportedURLIE(InfoExtractor):
    """Extractor that matches every URL and rejects it as unsupported"""

    # '.*' matches any URL
    _VALID_URL = '.*'
    # NOTE(review): presumably keeps this extractor out of the default set - confirm
    _ENABLED = False
    # NOTE(review): presumably hides this extractor from the extractor listings - confirm
    IE_DESC = False

    def _real_extract(self, url):
        """Always raises UnsupportedError for the given URL"""
        raise UnsupportedError(url)
3779
3780
# Maps an overridden extractor class (the innermost class of a "__wrapped__" chain)
# to the list of plugin classes overriding it.
# Filled by __init_subclass__ whenever a subclass is declared with plugin_name
_PLUGIN_OVERRIDES = collections.defaultdict(list)