[extractor] Add dev option `--load-pages`

[yt-dlp.git] / yt_dlp / extractor / common.py
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 0a14f7c0d32b289a2797b54b9babfdb667dc5630..669b6bd00644e2c248c6e33759f6be9ffa380a3f 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1,23 +1,20 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import base64
-import datetime
+import collections
  import hashlib
  import itertools
  import json
+import math
  import netrc
  import os
  import random
-import re
  import sys
  import time
-import math
+import xml.etree.ElementTree
  
+from ..compat import functools, re  # isort: split
  from ..compat import (
      compat_cookiejar_Cookie,
      compat_cookies_SimpleCookie,
-    compat_etree_Element,
      compat_etree_fromstring,
      compat_expanduser,
      compat_getpass,
@@ -29,36 +26,37 @@
      compat_urllib_parse_urlencode,
      compat_urllib_request,
      compat_urlparse,
-    compat_xml_parse_error,
  )
  from ..downloader import FileDownloader
-from ..downloader.f4m import (
-    get_base_url,
-    remove_encrypted_media,
-)
+from ..downloader.f4m import get_base_url, remove_encrypted_media
  from ..utils import (
+    JSON_LD_RE,
+    NO_DEFAULT,
+    ExtractorError,
+    GeoRestrictedError,
+    GeoUtils,
+    RegexNotFoundError,
+    UnsupportedError,
      age_restricted,
      base_url,
      bug_reports_message,
+    classproperty,
      clean_html,
-    compiled_regex_type,
      determine_ext,
      determine_protocol,
      dict_get,
+    encode_data_uri,
      error_to_compat_str,
      extract_attributes,
-    ExtractorError,
+    filter_dict,
      fix_xml_ampersands,
      float_or_none,
      format_field,
-    GeoRestrictedError,
-    GeoUtils,
      int_or_none,
+    join_nonempty,
      js_to_json,
-    JSON_LD_RE,
      mimetype2ext,
      network_exceptions,
-    NO_DEFAULT,
      orderedSet,
      parse_bitrate,
      parse_codecs,
@@ -66,18 +64,17 @@
      parse_iso8601,
      parse_m3u8_attributes,
      parse_resolution,
-    RegexNotFoundError,
      sanitize_filename,
      sanitized_Request,
      str_or_none,
      str_to_int,
      strip_or_none,
      traverse_obj,
+    try_get,
      unescapeHTML,
      unified_strdate,
      unified_timestamp,
      update_Request,
-    update_url_query,
      url_basename,
      url_or_none,
      urljoin,
@@ -88,7 +85,7 @@
  )
  
  
-class InfoExtractor(object):
+class InfoExtractor:
      """Information Extractor class.
  
      Information extractors are the classes that, given a URL, extract
@@ -106,7 +103,9 @@ class InfoExtractor(object):
      For a video, the dictionaries must include the following fields:
  
      id:             Video identifier.
-    title:          Video title, unescaped.
+    title:          Video title, unescaped. Set to an empty string if video has
+                    no title as opposed to "None" which signifies that the
+                    extractor failed to obtain a title
  
      Additionally, it must contain either a formats entry or a url one:
  
@@ -134,6 +133,8 @@ class InfoExtractor(object):
                                     for HDS - URL of the F4M manifest,
                                     for DASH - URL of the MPD manifest,
                                     for MSS - URL of the ISM manifest.
+                    * manifest_stream_number  (For internal use only)
+                                 The index of the stream in the manifest file
                      * ext        Will be calculated from URL if missing
                      * format     A human-readable description of the format
                                   ("mp4 container with h264/opus").
@@ -147,6 +148,8 @@ class InfoExtractor(object):
                      * width      Width of the video, if known
                      * height     Height of the video, if known
                      * resolution Textual description of width and height
+                    * dynamic_range The dynamic range of the video. One of:
+                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                      * tbr        Average bitrate of audio and video in KBit/s
                      * abr        Average audio bitrate in KBit/s
                      * acodec     Name of the audio codec in use
@@ -159,9 +162,8 @@ class InfoExtractor(object):
                      * filesize_approx  An estimate for the number of bytes
                      * player_url SWF Player URL (used for rtmpdump).
                      * protocol   The protocol that will be used for the actual
-                                 download, lower-case.
-                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
-                                 "m3u8", "m3u8_native" or "http_dash_segments".
+                                 download, lower-case. One of "http", "https" or
+                                 one of the protocols defined in downloader.PROTOCOL_MAP
                      * fragment_base_url
                                   Base URL for fragments. Each fragment's path
                                   value (if present) will be relative to
@@ -177,6 +179,8 @@ class InfoExtractor(object):
                                              fragment_base_url
                                   * "duration" (optional, int or float)
                                   * "filesize" (optional, int)
+                    * is_from_start  Is a live format that can be downloaded
+                                from the start. Boolean
                      * preference Order number of this format. If this field is
                                   present and not None, the formats get sorted
                                   by this field, regardless of all other values.
@@ -206,8 +210,10 @@ class InfoExtractor(object):
                      * no_resume  The server does not support resuming the
                                   (HTTP or RTMP) download. Boolean.
                      * has_drm    The format has DRM and cannot be downloaded. Boolean
-                    * downloader_options  A dictionary of downloader options as
-                                 described in FileDownloader
+                    * downloader_options  A dictionary of downloader options
+                                 (For internal use only)
+                                 * http_chunk_size Chunk size for HTTP downloads
+                                 * ffmpeg_args     Extra arguments for ffmpeg downloader
                      RTMP formats can also have the additional fields: page_url,
                      app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                      rtmp_protocol, rtmp_real_time
@@ -219,6 +225,7 @@ class InfoExtractor(object):
  
      The following fields are optional:
  
+    direct:         True if a direct video file was given (must only be set by GenericIE)
      alt_title:      A secondary title of the video.
      display_id      An alternative identifier for the video, not necessarily
                      unique, but available before title. Typically, id is
@@ -233,16 +240,22 @@ class InfoExtractor(object):
                          * "resolution" (optional, string "{width}x{height}",
                                          deprecated)
                          * "filesize" (optional, int)
+                        * "http_headers" (dict) - HTTP headers for the request
      thumbnail:      Full URL to a video thumbnail image.
      description:    Full video description.
      uploader:       Full name of the video uploader.
      license:        License name the video is licensed under.
      creator:        The creator of the video.
-    release_timestamp: UNIX timestamp of the moment the video was released.
-    release_date:   The date (YYYYMMDD) when the video was released.
      timestamp:      UNIX timestamp of the moment the video was uploaded
-    upload_date:    Video upload date (YYYYMMDD).
-                    If not explicitly set, calculated from timestamp.
+    upload_date:    Video upload date in UTC (YYYYMMDD).
+                    If not explicitly set, calculated from timestamp
+    release_timestamp: UNIX timestamp of the moment the video was released.
+                    If it is not clear whether to use timestamp or this, use the former
+    release_date:   The date (YYYYMMDD) when the video was released in UTC.
+                    If not explicitly set, calculated from release_timestamp
+    modified_timestamp: UNIX timestamp of the moment the video was last modified.
+    modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
+                    If not explicitly set, calculated from modified_timestamp
      uploader_id:    Nickname or id of the video uploader.
      uploader_url:   Full URL to a personal webpage of the video uploader.
      channel:        Full name of the channel the video is uploaded on.
@@ -250,6 +263,7 @@ class InfoExtractor(object):
                      fields. This depends on a particular extractor.
      channel_id:     Id of the channel.
      channel_url:    Full URL to a channel webpage.
+    channel_follower_count: Number of followers of the channel.
      location:       Physical location where the video was filmed.
      subtitles:      The available subtitles as a dictionary in the format
                      {tag: subformats}. "tag" is usually a language code, and
@@ -260,6 +274,8 @@ class InfoExtractor(object):
                          * "url": A URL pointing to the subtitles file
                      It can optionally also have:
                          * "name": Name or description of the subtitles
+                        * "http_headers": A dictionary of additional HTTP headers
+                                  to add to the request.
                      "ext" will be calculated from URL if missing
      automatic_captions: Like 'subtitles'; contains automatically generated
                      captions instead of normal subtitles
@@ -338,6 +354,7 @@ class InfoExtractor(object):
      series, programme or podcast:
  
      series:         Title of the series or programme the video episode belongs to.
+    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
      season:         Title of the season the video episode belongs to.
      season_number:  Number of the season the video episode belongs to, as an integer.
      season_id:      Id of the season the video episode belongs to, as a unicode string.
@@ -364,6 +381,7 @@ class InfoExtractor(object):
      disc_number:    Number of the disc or other physical medium the track belongs to,
                      as an integer.
      release_year:   Year (YYYY) when the album was released.
+    composer:       Composer of the piece
  
      Unless mentioned otherwise, the fields should be Unicode strings.
  
@@ -377,6 +395,11 @@ class InfoExtractor(object):
      Additionally, playlists can have "id", "title", and any other relevent
      attributes with the same semantics as videos (see above).
  
+    It can also have the following optional fields:
+
+    playlist_count: The total number of videos in a playlist. If not given,
+                    YoutubeDL tries to calculate it from "entries"
+
  
      _type "multi_video" indicates that there are multiple videos that
      form a single show, for examples multiple acts of an opera or TV episode.
@@ -402,13 +425,21 @@ class InfoExtractor(object):
      title, description etc.
  
  
-    Subclasses of this one should re-define the _real_initialize() and
-    _real_extract() methods and define a _VALID_URL regexp.
+    Subclasses of this should define a _VALID_URL regexp and re-define the
+    _real_extract() and (optionally) _real_initialize() methods.
      Probably, they should also be added to the list of extractors.
  
      Subclasses may also override suitable() if necessary, but ensure the function
      signature is preserved and that this function imports everything it needs
-    (except other extractors), so that lazy_extractors works correctly
+    (except other extractors), so that lazy_extractors works correctly.
+
+    To support username + password (or netrc) login, the extractor must define a
+    _NETRC_MACHINE and re-define _perform_login(username, password) and
+    (optionally) _initialize_pre_login() methods. The _perform_login method will
+    be called between _initialize_pre_login and _real_initialize if credentials
+    are passed by the user. In cases where it is necessary to have the login
+    process as part of the extraction rather than initialization, _perform_login
+    can be left undefined.
  
      _GEO_BYPASS attribute may be set to False in order to disable
      geo restriction bypass mechanisms for a particular extractor.
@@ -436,17 +467,25 @@ class InfoExtractor(object):
      _GEO_COUNTRIES = None
      _GEO_IP_BLOCKS = None
      _WORKING = True
+    _NETRC_MACHINE = None
+    IE_DESC = None
+    SEARCH_KEY = None
  
-    _LOGIN_HINTS = {
-        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
-        'cookies': (
-            'Use --cookies-from-browser or --cookies for the authentication. '
-            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
-        'password': 'Use --username and --password or --netrc to provide account credentials',
-    }
+    def _login_hint(self, method=NO_DEFAULT, netrc=None):
+        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
+        return {
+            None: '',
+            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
+            'password': f'Use {password_hint}',
+            'cookies': (
+                'Use --cookies-from-browser or --cookies for the authentication. '
+                'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
+        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
  
      def __init__(self, downloader=None):
-        """Constructor. Receives an optional downloader."""
+        """Constructor. Receives an optional downloader (a YoutubeDL instance).
+        If a downloader is not passed during initialization,
+        it must be set using "set_downloader()" before "extract()" is called"""
          self._ready = False
          self._x_forwarded_for_ip = None
          self._printed_messages = set()
@@ -458,6 +497,8 @@ def _match_valid_url(cls, url):
          # we have cached the regexp for *this* class, whereas getattr would also
          # match the superclass
          if '_VALID_URL_RE' not in cls.__dict__:
+            if '_VALID_URL' not in cls.__dict__:
+                cls._VALID_URL = cls._make_valid_url()
              cls._VALID_URL_RE = re.compile(cls._VALID_URL)
          return cls._VALID_URL_RE.match(url)
  
@@ -484,6 +525,10 @@ def working(cls):
          """Getter method for _WORKING."""
          return cls._WORKING
  
+    @classmethod
+    def supports_login(cls):
+        return bool(cls._NETRC_MACHINE)
+
      def initialize(self):
          """Initializes an instance (authentication, etc)."""
          self._printed_messages = set()
@@ -492,6 +537,13 @@ def initialize(self):
              'ip_blocks': self._GEO_IP_BLOCKS,
          })
          if not self._ready:
+            self._initialize_pre_login()
+            if self.supports_login():
+                username, password = self._get_login_info()
+                if username:
+                    self._perform_login(username, password)
+            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
+                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
              self._real_initialize()
              self._ready = True
  
@@ -557,8 +609,7 @@ def _initialize_geo_bypass(self, geo_bypass_context):
  
              if ip_block:
                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
-                self._downloader.write_debug(
-                    '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
+                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
                  return
  
              # Path 2: bypassing based on country code
@@ -577,7 +628,7 @@ def _initialize_geo_bypass(self, geo_bypass_context):
              if country:
                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                  self._downloader.write_debug(
-                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
+                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
  
      def extract(self, url):
          """Extracts URL information and returns it in list of dicts."""
@@ -600,10 +651,19 @@ def extract(self, url):
                      if self.__maybe_fake_ip_and_retry(e.countries):
                          continue
                      raise
+        except UnsupportedError:
+            raise
          except ExtractorError as e:
-            video_id = e.video_id or self.get_temp_id(url)
-            raise ExtractorError(
-                e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
+            kwargs = {
+                'video_id': e.video_id or self.get_temp_id(url),
+                'ie': self.IE_NAME,
+                'tb': e.traceback or sys.exc_info()[2],
+                'expected': e.expected,
+                'cause': e.cause
+            }
+            if hasattr(e, 'countries'):
+                kwargs['countries'] = e.countries
+            raise type(e)(e.orig_msg, **kwargs)
          except compat_http_client.IncompleteRead as e:
              raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
          except (KeyError, StopIteration) as e:
@@ -625,25 +685,33 @@ def __maybe_fake_ip_and_retry(self, countries):
          return False
  
      def set_downloader(self, downloader):
-        """Sets the downloader for this IE."""
+        """Sets a YoutubeDL instance as the downloader for this IE."""
          self._downloader = downloader
  
+    def _initialize_pre_login(self):
+        """ Initialization before login. Redefine in subclasses."""
+        pass
+
+    def _perform_login(self, username, password):
+        """ Login with username and password. Redefine in subclasses."""
+        pass
+
      def _real_initialize(self):
          """Real initialization process. Redefine in subclasses."""
          pass
  
      def _real_extract(self, url):
          """Real extraction process. Redefine in subclasses."""
-        pass
+        raise NotImplementedError('This method must be implemented by subclasses')
  
      @classmethod
      def ie_key(cls):
          """A string for getting the InfoExtractor with get_info_extractor"""
          return cls.__name__[:-2]
  
-    @property
-    def IE_NAME(self):
-        return compat_str(type(self).__name__[:-2])
+    @classproperty
+    def IE_NAME(cls):
+        return cls.__name__[:-2]
  
      @staticmethod
      def __can_accept_status_code(err, expected_status):
@@ -655,6 +723,11 @@ def __can_accept_status_code(err, expected_status):
          else:
              return err.code in variadic(expected_status)
  
+    def _create_request(self, url_or_request, data=None, headers={}, query={}):
+        if not isinstance(url_or_request, compat_urllib_request.Request):
+            url_or_request = sanitized_Request(url_or_request)
+        return update_Request(url_or_request, data=data, headers=headers, query=query)
+
      def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
          """
          Return the response handle.
@@ -662,7 +735,7 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
          See _download_webpage docstring for arguments specification.
          """
          if not self._downloader._first_webpage_request:
-            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
+            sleep_interval = self.get_param('sleep_interval_requests') or 0
              if sleep_interval > 0:
                  self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                  time.sleep(sleep_interval)
@@ -673,9 +746,9 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
              self.report_download_webpage(video_id)
          elif note is not False:
              if video_id is None:
-                self.to_screen('%s' % (note,))
+                self.to_screen(str(note))
              else:
-                self.to_screen('%s: %s' % (video_id, note))
+                self.to_screen(f'{video_id}: {note}')
  
          # Some sites check X-Forwarded-For HTTP header in order to figure out
          # the origin of the client behind proxy. This allows bypassing geo
@@ -686,16 +759,8 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
              if 'X-Forwarded-For' not in headers:
                  headers['X-Forwarded-For'] = self._x_forwarded_for_ip
  
-        if isinstance(url_or_request, compat_urllib_request.Request):
-            url_or_request = update_Request(
-                url_or_request, data=data, headers=headers, query=query)
-        else:
-            if query:
-                url_or_request = update_url_query(url_or_request, query)
-            if data is not None or headers:
-                url_or_request = sanitized_Request(url_or_request, data, headers)
          try:
-            return self._downloader.urlopen(url_or_request)
+            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
          except network_exceptions as err:
              if isinstance(err, compat_urllib_error.HTTPError):
                  if self.__can_accept_status_code(err, expected_status):
@@ -711,9 +776,9 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
              if errnote is None:
                  errnote = 'Unable to download webpage'
  
-            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
+            errmsg = f'{errnote}: {error_to_compat_str(err)}'
              if fatal:
-                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
+                raise ExtractorError(errmsg, cause=err)
              else:
                  self.report_warning(errmsg)
                  return False
@@ -722,8 +787,35 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
          """
          Return a tuple (page content as string, URL handle).
  
-        See _download_webpage docstring for arguments specification.
+        Arguments:
+        url_or_request -- plain text URL as a string or
+            a compat_urllib_request.Request object
+        video_id -- Video/playlist/item identifier (string)
+
+        Keyword arguments:
+        note -- note printed before downloading (string)
+        errnote -- note printed in case of an error (string)
+        fatal -- flag denoting whether error should be considered fatal,
+            i.e. whether it should cause ExtractionError to be raised,
+            otherwise a warning will be reported and extraction continued
+        encoding -- encoding for a page content decoding, guessed automatically
+            when not explicitly specified
+        data -- POST data (bytes)
+        headers -- HTTP headers (dict)
+        query -- URL query (dict)
+        expected_status -- allows to accept failed HTTP requests (non 2xx
+            status code) by explicitly specifying a set of accepted status
+            codes. Can be any of the following entities:
+                - an integer type specifying an exact failed status code to
+                  accept
+                - a list or a tuple of integer types specifying a list of
+                  failed status codes to accept
+                - a callable accepting an actual failed status code and
+                  returning True if it should be accepted
+            Note that this argument does not affect success status codes (2xx)
+            which are always accepted.
          """
+
          # Strip hashes from the URL (#1038)
          if isinstance(url_or_request, (compat_str, str)):
              url_or_request = url_or_request.partition('#')[0]
@@ -780,195 +872,73 @@ def __check_blocked(self, content):
                  'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                  expected=True)
  
+    def _request_dump_filename(self, url, video_id):
+        basen = f'{video_id}_{url}'
+        trim_length = self.get_param('trim_file_name') or 240
+        if len(basen) > trim_length:
+            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+            basen = basen[:trim_length - len(h)] + h
+        filename = sanitize_filename(f'{basen}.dump', restricted=True)
+        # Working around MAX_PATH limitation on Windows (see
+        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
+        if compat_os_name == 'nt':
+            absfilepath = os.path.abspath(filename)
+            if len(absfilepath) > 259:
+                filename = fR'\\?\{absfilepath}'
+        return filename
+
+    def __decode_webpage(self, webpage_bytes, encoding, headers):
+        if not encoding:
+            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
+        try:
+            return webpage_bytes.decode(encoding, 'replace')
+        except LookupError:
+            return webpage_bytes.decode('utf-8', 'replace')
+
      def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
-        content_type = urlh.headers.get('Content-Type', '')
          webpage_bytes = urlh.read()
          if prefix is not None:
              webpage_bytes = prefix + webpage_bytes
-        if not encoding:
-            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
          if self.get_param('dump_intermediate_pages', False):
              self.to_screen('Dumping request to ' + urlh.geturl())
              dump = base64.b64encode(webpage_bytes).decode('ascii')
              self._downloader.to_screen(dump)
-        if self.get_param('write_pages', False):
-            basen = '%s_%s' % (video_id, urlh.geturl())
-            trim_length = self.get_param('trim_file_name') or 240
-            if len(basen) > trim_length:
-                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
-                basen = basen[:trim_length - len(h)] + h
-            raw_filename = basen + '.dump'
-            filename = sanitize_filename(raw_filename, restricted=True)
-            self.to_screen('Saving request to ' + filename)
-            # Working around MAX_PATH limitation on Windows (see
-            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
-            if compat_os_name == 'nt':
-                absfilepath = os.path.abspath(filename)
-                if len(absfilepath) > 259:
-                    filename = '\\\\?\\' + absfilepath
+        if self.get_param('write_pages'):
+            filename = self._request_dump_filename(video_id, urlh.geturl())
+            self.to_screen(f'Saving request to {filename}')
              with open(filename, 'wb') as outf:
                  outf.write(webpage_bytes)
  
-        try:
-            content = webpage_bytes.decode(encoding, 'replace')
-        except LookupError:
-            content = webpage_bytes.decode('utf-8', 'replace')
-
+        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
          self.__check_blocked(content)
  
          return content
  
-    def _download_webpage(
-            self, url_or_request, video_id, note=None, errnote=None,
-            fatal=True, tries=1, timeout=5, encoding=None, data=None,
-            headers={}, query={}, expected_status=None):
-        """
-        Return the data of the page as a string.
-
-        Arguments:
-        url_or_request -- plain text URL as a string or
-            a compat_urllib_request.Requestobject
-        video_id -- Video/playlist/item identifier (string)
-
-        Keyword arguments:
-        note -- note printed before downloading (string)
-        errnote -- note printed in case of an error (string)
-        fatal -- flag denoting whether error should be considered fatal,
-            i.e. whether it should cause ExtractionError to be raised,
-            otherwise a warning will be reported and extraction continued
-        tries -- number of tries
-        timeout -- sleep interval between tries
-        encoding -- encoding for a page content decoding, guessed automatically
-            when not explicitly specified
-        data -- POST data (bytes)
-        headers -- HTTP headers (dict)
-        query -- URL query (dict)
-        expected_status -- allows to accept failed HTTP requests (non 2xx
-            status code) by explicitly specifying a set of accepted status
-            codes. Can be any of the following entities:
-                - an integer type specifying an exact failed status code to
-                  accept
-                - a list or a tuple of integer types specifying a list of
-                  failed status codes to accept
-                - a callable accepting an actual failed status code and
-                  returning True if it should be accepted
-            Note that this argument does not affect success status codes (2xx)
-            which are always accepted.
-        """
-
-        success = False
-        try_count = 0
-        while success is False:
-            try:
-                res = self._download_webpage_handle(
-                    url_or_request, video_id, note, errnote, fatal,
-                    encoding=encoding, data=data, headers=headers, query=query,
-                    expected_status=expected_status)
-                success = True
-            except compat_http_client.IncompleteRead as e:
-                try_count += 1
-                if try_count >= tries:
-                    raise e
-                self._sleep(timeout, video_id)
-        if res is False:
-            return res
-        else:
-            content, _ = res
-            return content
-
-    def _download_xml_handle(
-            self, url_or_request, video_id, note='Downloading XML',
-            errnote='Unable to download XML', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={},
-            expected_status=None):
-        """
-        Return a tuple (xml as an compat_etree_Element, URL handle).
-
-        See _download_webpage docstring for arguments specification.
-        """
-        res = self._download_webpage_handle(
-            url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding, data=data, headers=headers, query=query,
-            expected_status=expected_status)
-        if res is False:
-            return res
-        xml_string, urlh = res
-        return self._parse_xml(
-            xml_string, video_id, transform_source=transform_source,
-            fatal=fatal), urlh
-
-    def _download_xml(
-            self, url_or_request, video_id,
-            note='Downloading XML', errnote='Unable to download XML',
-            transform_source=None, fatal=True, encoding=None,
-            data=None, headers={}, query={}, expected_status=None):
-        """
-        Return the xml as an compat_etree_Element.
-
-        See _download_webpage docstring for arguments specification.
-        """
-        res = self._download_xml_handle(
-            url_or_request, video_id, note=note, errnote=errnote,
-            transform_source=transform_source, fatal=fatal, encoding=encoding,
-            data=data, headers=headers, query=query,
-            expected_status=expected_status)
-        return res if res is False else res[0]
-
      def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
          if transform_source:
              xml_string = transform_source(xml_string)
          try:
              return compat_etree_fromstring(xml_string.encode('utf-8'))
-        except compat_xml_parse_error as ve:
+        except xml.etree.ElementTree.ParseError as ve:
              errmsg = '%s: Failed to parse XML ' % video_id
              if fatal:
                  raise ExtractorError(errmsg, cause=ve)
              else:
                  self.report_warning(errmsg + str(ve))
  
-    def _download_json_handle(
-            self, url_or_request, video_id, note='Downloading JSON metadata',
-            errnote='Unable to download JSON metadata', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={},
-            expected_status=None):
-        """
-        Return a tuple (JSON object, URL handle).
-
-        See _download_webpage docstring for arguments specification.
-        """
-        res = self._download_webpage_handle(
-            url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding, data=data, headers=headers, query=query,
-            expected_status=expected_status)
-        if res is False:
-            return res
-        json_string, urlh = res
-        return self._parse_json(
-            json_string, video_id, transform_source=transform_source,
-            fatal=fatal), urlh
-
-    def _download_json(
-            self, url_or_request, video_id, note='Downloading JSON metadata',
-            errnote='Unable to download JSON metadata', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={},
-            expected_status=None):
-        """
-        Return the JSON object as a dict.
-
-        See _download_webpage docstring for arguments specification.
-        """
-        res = self._download_json_handle(
-            url_or_request, video_id, note=note, errnote=errnote,
-            transform_source=transform_source, fatal=fatal, encoding=encoding,
-            data=data, headers=headers, query=query,
-            expected_status=expected_status)
-        return res if res is False else res[0]
-
-    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
+    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
          if transform_source:
              json_string = transform_source(json_string)
          try:
-            return json.loads(json_string)
+            try:
+                return json.loads(json_string, strict=False)
+            except json.JSONDecodeError as e:
+                if not lenient:
+                    raise
+                try:
+                    return json.loads(json_string[:e.pos], strict=False)
+                except ValueError:
+                    raise e
          except ValueError as ve:
              errmsg = '%s: Failed to parse JSON ' % video_id
              if fatal:
@@ -981,43 +951,95 @@ def _parse_socket_response_as_json(self, data, video_id, transform_source=None,
              data[data.find('{'):data.rfind('}') + 1],
              video_id, transform_source, fatal)
  
-    def _download_socket_json_handle(
-            self, url_or_request, video_id, note='Polling socket',
-            errnote='Unable to poll socket', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={},
-            expected_status=None):
-        """
-        Return a tuple (JSON object, URL handle).
+    def __create_download_methods(name, parser, note, errnote, return_value):
+        """
+        Build the pair of download methods for one kind of content.
+
+        Invoked from the class body, so it takes no self.
+
+        @param name            Middle part of the generated method names:
+                               _download_{name}_handle and _download_{name}
+        @param parser          Name of the method used to parse the downloaded content,
+                               or None to return the content unparsed
+        @param note            Default "note" of the generated methods
+        @param errnote         Default "errnote" of the generated methods
+        @param return_value    Description of the result, used in the generated docstrings
+        @returns               (download_handle, download_content)
+        """
+
+        def parse(ie, content, *args, **kwargs):
+            if parser is None:
+                return content
+            # parser is fetched by name so subclasses can override it
+            return getattr(ie, parser)(content, *args, **kwargs)
+
+        # Returns False when _download_webpage_handle returns False,
+        # otherwise a tuple (parsed content, URL handle)
+        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote,
+                            transform_source=None, fatal=True, *args, **kwargs):
+            res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, *args, **kwargs)
+            if res is False:
+                return res
+            content, urlh = res
+            return parse(self, content, video_id, transform_source, fatal), urlh
+
+        # Same as download_handle, but returns only the parsed content (or False)
+        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
+                             fatal=True, encoding=None, data=None, headers={}, query={}, *args, **kwargs):
+            # With the load_pages param set, first try to read the response
+            # from a dump file on disk instead of downloading it
+            if self.get_param('load_pages'):
+                url_or_request = self._create_request(url_or_request, data, headers, query)
+                filename = self._request_dump_filename(url_or_request.full_url, video_id)
+                self.to_screen(f'Loading request from {filename}')
+                try:
+                    with open(filename, 'rb') as dumpf:
+                        webpage_bytes = dumpf.read()
+                except OSError as e:
+                    # Only warn; execution falls through to the actual download below
+                    self.report_warning(f'Unable to load request from disk: {e}')
+                else:
+                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
+                    return parse(self, content, video_id, transform_source, fatal)
+            # Positional order matches download_handle's parameters; everything after
+            # "fatal" lands in its *args and is forwarded to _download_webpage_handle
+            args = [url_or_request, video_id, note, errnote, transform_source, fatal, encoding, data, headers, query, *args]
+            # NOTE(review): with parser=None the lookup below resolves to the class's own
+            # _download_webpage_handle, which presumably takes no transform_source - confirm
+            if parser is None:
+                args.pop(4)  # transform_source
+            # The method is fetched by name so subclasses can override _download_..._handle
+            res = getattr(self, download_handle.__name__)(*args, **kwargs)
+            return res if res is False else res[0]
+
+        # Rename func and attach a generated docstring so that it presents itself
+        # as the InfoExtractor method called `name`
+        def impersonate(func, name, return_value):
+            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
+            func.__doc__ = f'''
+                @param transform_source     Apply this transformation before parsing
+                @returns                    {return_value}
+
+                See _download_webpage_handle docstring for other arguments specification
+            '''
+
+        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
+        impersonate(download_content, f'_download_{name}', f'{return_value}')
+        return download_handle, download_content
+
+    # Generate the (_download_*_handle, _download_*) method pairs, one pair per parser
+    _download_xml_handle, _download_xml = __create_download_methods(
+        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
+    _download_json_handle, _download_json = __create_download_methods(
+        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
+    _download_socket_json_handle, _download_socket_json = __create_download_methods(
+        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
+    # For plain webpages no parser is applied and only the content-returning
+    # method (index 1) is kept; the generated handle variant is discarded
+    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
  
-        See _download_webpage docstring for arguments specification.
-        """
-        res = self._download_webpage_handle(
-            url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding, data=data, headers=headers, query=query,
-            expected_status=expected_status)
-        if res is False:
-            return res
-        webpage, urlh = res
-        return self._parse_socket_response_as_json(
-            webpage, video_id, transform_source=transform_source,
-            fatal=fatal), urlh
-
-    def _download_socket_json(
-            self, url_or_request, video_id, note='Polling socket',
-            errnote='Unable to poll socket', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={},
-            expected_status=None):
+    def _download_webpage(
+            self, url_or_request, video_id, note=None, errnote=None,
+            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
          """
-        Return the JSON object as a dict.
+        Return the data of the page as a string.
  
-        See _download_webpage docstring for arguments specification.
+        Keyword arguments:
+        tries -- number of tries
+        timeout -- sleep interval between tries
+
+        See _download_webpage_handle docstring for other arguments specification.
          """
-        res = self._download_socket_json_handle(
-            url_or_request, video_id, note=note, errnote=errnote,
-            transform_source=transform_source, fatal=fatal, encoding=encoding,
-            data=data, headers=headers, query=query,
-            expected_status=expected_status)
-        return res if res is False else res[0]
+
+        R''' # NB: These are unused; should they be deprecated?
+        if tries != 1:
+            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
+        if timeout is NO_DEFAULT:
+            timeout = 5
+        else:
+            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
+        '''
+
+        try_count = 0
+        while True:
+            try:
+                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
+            except compat_http_client.IncompleteRead as e:
+                try_count += 1
+                if try_count >= tries:
+                    raise e
+                self._sleep(timeout, video_id)
  
      def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
          idstr = format_field(video_id, template='%s: ')
@@ -1030,10 +1052,10 @@ def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
  
      def to_screen(self, msg, *args, **kwargs):
          """Print msg to screen, prefixing it with '[ie_name]'"""
-        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
+        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
  
      def write_debug(self, msg, *args, **kwargs):
-        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
+        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
  
      def get_param(self, name, default=None, *args, **kwargs):
          if self._downloader:
@@ -1061,23 +1083,26 @@ def report_login(self):
  
      def raise_login_required(
              self, msg='This video is only available for registered users',
-            metadata_available=False, method='any'):
-        if metadata_available and self.get_param('ignore_no_formats_error'):
+            metadata_available=False, method=NO_DEFAULT):
+        if metadata_available and (
+                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
              self.report_warning(msg)
-        if method is not None:
-            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
+            return
+        msg += format_field(self._login_hint(method), template='. %s')
          raise ExtractorError(msg, expected=True)
  
      def raise_geo_restricted(
              self, msg='This video is not available from your location due to geo restriction',
              countries=None, metadata_available=False):
-        if metadata_available and self.get_param('ignore_no_formats_error'):
+        if metadata_available and (
+                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
              self.report_warning(msg)
          else:
              raise GeoRestrictedError(msg, countries=countries)
  
      def raise_no_formats(self, msg, expected=False, video_id=None):
-        if expected and self.get_param('ignore_no_formats_error'):
+        if expected and (
+                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
              self.report_warning(msg, video_id)
          elif isinstance(msg, ExtractorError):
              raise msg
@@ -1086,39 +1111,39 @@ def raise_no_formats(self, msg, expected=False, video_id=None):
  
      # Methods for following #608
      @staticmethod
-    def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
+    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
          """Returns a URL that points to a page that should be processed"""
-        # TODO: ie should be the class used for getting the info
-        video_info = {'_type': 'url',
-                      'url': url,
-                      'ie_key': ie}
-        video_info.update(kwargs)
+        if ie is not None:
+            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
          if video_id is not None:
-            video_info['id'] = video_id
+            kwargs['id'] = video_id
          if video_title is not None:
-            video_info['title'] = video_title
-        return video_info
+            kwargs['title'] = video_title
+        return {
+            **kwargs,
+            '_type': 'url_transparent' if url_transparent else 'url',
+            'url': url,
+        }
  
-    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
-        urls = orderedSet(
-            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
-            for m in matches)
-        return self.playlist_result(
-            urls, playlist_id=playlist_id, playlist_title=playlist_title)
+    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
+        urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
+                for m in orderedSet(map(getter, matches) if getter else matches))
+        return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
  
      @staticmethod
-    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
+    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
          """Returns a playlist"""
-        video_info = {'_type': 'playlist',
-                      'entries': entries}
-        video_info.update(kwargs)
          if playlist_id:
-            video_info['id'] = playlist_id
+            kwargs['id'] = playlist_id
          if playlist_title:
-            video_info['title'] = playlist_title
+            kwargs['title'] = playlist_title
          if playlist_description is not None:
-            video_info['description'] = playlist_description
-        return video_info
+            kwargs['description'] = playlist_description
+        return {
+            **kwargs,
+            '_type': 'multi_video' if multi_video else 'playlist',
+            'entries': entries,
+        }
  
      def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
          """
@@ -1127,7 +1152,9 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f
          In case of failure return a default value or raise a WARNING or a
          RegexNotFoundError, depending on fatal, specifying the field name.
          """
-        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+        if string is None:
+            mobj = None
+        elif isinstance(pattern, (str, re.Pattern)):
              mobj = re.search(pattern, string, flags)
          else:
              for p in pattern:
@@ -1135,7 +1162,7 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f
                  if mobj:
                      break
  
-        _name = self._downloader._color_text(name, 'blue')
+        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
  
          if mobj:
              if group is None:
@@ -1180,7 +1207,7 @@ def _get_netrc_login_info(self, netrc_machine=None):
                  else:
                      raise netrc.NetrcParseError(
                          'No authenticators for %s' % netrc_machine)
-            except (IOError, netrc.NetrcParseError) as err:
+            except (OSError, netrc.NetrcParseError) as err:
                  self.report_warning(
                      'parsing .netrc: %s' % error_to_compat_str(err))
  
@@ -1223,8 +1250,8 @@ def _get_tfa_info(self, note='two-factor verification code'):
      @staticmethod
      def _og_regexes(prop):
          content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
-        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
-                       % {'prop': re.escape(prop)})
+        property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
+                       % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
              template % (property_re, content_re),
@@ -1255,8 +1282,8 @@ def _og_search_thumbnail(self, html, **kargs):
      def _og_search_description(self, html, **kargs):
          return self._og_search_property('description', html, fatal=False, **kargs)
  
-    def _og_search_title(self, html, **kargs):
-        return self._og_search_property('title', html, **kargs)
+    def _og_search_title(self, html, *, fatal=False, **kargs):
+        return self._og_search_property('title', html, fatal=fatal, **kargs)
  
      def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
          regexes = self._og_regexes('video') + self._og_regexes('video:url')
@@ -1267,6 +1294,9 @@ def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
      def _og_search_url(self, html, **kargs):
          return self._og_search_property('url', html, **kargs)
  
+    def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
+        """Search html for a <title> element via _html_search_regex; non-fatal unless fatal=True is passed"""
+        return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
+
      def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
          name = variadic(name)
          if display_name is None:
@@ -1407,6 +1437,23 @@ def extract_interaction_statistic(e):
                      continue
                  info[count_key] = interaction_count
  
+        # Build info['chapters'] from the 'hasPart' entries of e whose '@type' is 'Clip'
+        def extract_chapter_information(e):
+            chapters = [{
+                'title': part.get('name'),
+                'start_time': part.get('startOffset'),
+                'end_time': part.get('endOffset'),
+            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
+            # Walk (previous, current, next) triples; the {'end_time': 0} sentinel acts as
+            # the "previous" of the first chapter. zip() stops at the shortest input,
+            # so the last chapter is never "current" here and is handled after the loop
+            for idx, (last_c, current_c, next_c) in enumerate(zip(
+                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
+                # A missing end is taken from the next chapter's start,
+                # a missing start from the previous chapter's end
+                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
+                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
+                # Any value still None (the title included) aborts chapter extraction
+                if None in current_c.values():
+                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
+                    return
+            if chapters:
+                # The end of the last chapter falls back to info['duration']
+                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
+                info['chapters'] = chapters
+
          def extract_video_object(e):
              assert e['@type'] == 'VideoObject'
              author = e.get('author')
@@ -1414,7 +1461,9 @@ def extract_video_object(e):
                  'url': url_or_none(e.get('contentUrl')),
                  'title': unescapeHTML(e.get('name')),
                  'description': unescapeHTML(e.get('description')),
-                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
+                'thumbnails': [{'url': url}
+                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
+                               if url_or_none(url)],
                  'duration': parse_duration(e.get('duration')),
                  'timestamp': unified_timestamp(e.get('uploadDate')),
                  # author can be an instance of 'Organization' or 'Person' types.
@@ -1429,12 +1478,21 @@ def extract_video_object(e):
                  'view_count': int_or_none(e.get('interactionCount')),
              })
              extract_interaction_statistic(e)
+            extract_chapter_information(e)
  
-        for e in json_ld:
-            if '@context' in e:
+        def traverse_json_ld(json_ld, at_top_level=True):
+            for e in json_ld:
+                if at_top_level and '@context' not in e:
+                    continue
+                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
+                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
+                    break
                  item_type = e.get('@type')
                  if expected_type is not None and expected_type != item_type:
                      continue
+                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
+                if rating is not None:
+                    info['average_rating'] = rating
                  if item_type in ('TVEpisode', 'Episode'):
                      episode_name = unescapeHTML(e.get('name'))
                      info.update({
@@ -1464,8 +1522,10 @@ def extract_video_object(e):
                      info.update({
                          'timestamp': parse_iso8601(e.get('datePublished')),
                          'title': unescapeHTML(e.get('headline')),
-                        'description': unescapeHTML(e.get('articleBody')),
+                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                      })
+                    if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
+                        extract_video_object(e['video'][0])
                  elif item_type == 'VideoObject':
                      extract_video_object(e)
                      if expected_type is None:
@@ -1479,7 +1539,34 @@ def extract_video_object(e):
                      continue
                  else:
                      break
-        return dict((k, v) for k, v in info.items() if v is not None)
+        traverse_json_ld(json_ld)
+
+        return filter_dict(info)
+
+    def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
+        """
+        Find the <script> element with id "__NEXT_DATA__" in webpage and parse its contents as JSON.
+
+        fatal is passed to both the regex search and the JSON parsing;
+        transform_source is passed to _parse_json and any other keyword arguments to _search_regex.
+        """
+        return self._parse_json(
+            self._search_regex(
+                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
+                webpage, 'next.js data', fatal=fatal, **kw),
+            video_id, transform_source=transform_source, fatal=fatal)
+
+    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
+        ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
+        # not all websites do this, but it can be changed
+        # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
+        rectx = re.escape(context_name)
+        # Capture the parameter names, the returned object literal and the call
+        # arguments of the immediately-invoked function
+        js, arg_keys, arg_vals = self._search_regex(
+            (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
+             r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
+            webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
+
+        # Pair each parameter name with its argument text. This is a plain comma split,
+        # so an argument that itself contains a comma would be mis-split
+        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
+
+        # undefined has no JSON equivalent, so it is passed on as null
+        for key, val in args.items():
+            if val in ('undefined', 'void 0'):
+                args[key] = 'null'
+
+        # args is handed to js_to_json (presumably as variable substitutions - confirm);
+        # only the first element of 'data' is returned
+        return self._parse_json(js_to_json(js, args), video_id)['data'][0]
  
      @staticmethod
      def _hidden_inputs(html):
@@ -1507,19 +1594,21 @@ class FormatSort:
          regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
  
          default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
-                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
-                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
+                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
+                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
          ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                          'height', 'width', 'proto', 'vext', 'abr', 'aext',
-                        'fps', 'fs_approx', 'source', 'format_id')
+                        'fps', 'fs_approx', 'source', 'id')
  
          settings = {
              'vcodec': {'type': 'ordered', 'regex': True,
                         'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
              'acodec': {'type': 'ordered', 'regex': True,
-                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
+                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
+            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
+                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
              'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
-                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
+                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
              'vext': {'type': 'ordered', 'field': 'video_ext',
                       'order': ('mp4', 'webm', 'flv', '', 'none'),
                       'order_free': ('webm', 'mp4', 'flv', '', 'none')},
@@ -1533,8 +1622,8 @@ class FormatSort:
              'ie_pref': {'priority': True, 'type': 'extractor'},
              'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
              'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
-            'lang': {'convert': 'ignore', 'field': 'language_preference'},
-            'quality': {'convert': 'float_none', 'default': -1},
+            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
+            'quality': {'convert': 'float', 'default': -1},
              'filesize': {'convert': 'bytes'},
              'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
              'id': {'convert': 'string', 'field': 'format_id'},
@@ -1545,7 +1634,7 @@ class FormatSort:
              'vbr': {'convert': 'float_none'},
              'abr': {'convert': 'float_none'},
              'asr': {'convert': 'float_none'},
-            'source': {'convert': 'ignore', 'field': 'source_preference'},
+            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
  
              'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
              'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
@@ -1554,39 +1643,51 @@ class FormatSort:
              'res': {'type': 'multiple', 'field': ('height', 'width'),
                      'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
  
-            # Most of these exist only for compatibility reasons
-            'dimension': {'type': 'alias', 'field': 'res'},
-            'resolution': {'type': 'alias', 'field': 'res'},
-            'extension': {'type': 'alias', 'field': 'ext'},
-            'bitrate': {'type': 'alias', 'field': 'br'},
-            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
-            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
-            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
-            'framerate': {'type': 'alias', 'field': 'fps'},
-            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
-            'protocol': {'type': 'alias', 'field': 'proto'},
+            # For compatibility with youtube-dl
+            'format_id': {'type': 'alias', 'field': 'id'},
+            'preference': {'type': 'alias', 'field': 'ie_pref'},
+            'language_preference': {'type': 'alias', 'field': 'lang'},
              'source_preference': {'type': 'alias', 'field': 'source'},
+            'protocol': {'type': 'alias', 'field': 'proto'},
              'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
-            'filesize_estimate': {'type': 'alias', 'field': 'size'},
-            'samplerate': {'type': 'alias', 'field': 'asr'},
-            'video_ext': {'type': 'alias', 'field': 'vext'},
-            'audio_ext': {'type': 'alias', 'field': 'aext'},
-            'video_codec': {'type': 'alias', 'field': 'vcodec'},
-            'audio_codec': {'type': 'alias', 'field': 'acodec'},
-            'video': {'type': 'alias', 'field': 'hasvid'},
-            'has_video': {'type': 'alias', 'field': 'hasvid'},
-            'audio': {'type': 'alias', 'field': 'hasaud'},
-            'has_audio': {'type': 'alias', 'field': 'hasaud'},
-            'extractor': {'type': 'alias', 'field': 'ie_pref'},
-            'preference': {'type': 'alias', 'field': 'ie_pref'},
-            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
-            'format_id': {'type': 'alias', 'field': 'id'},
+
+            # Deprecated
+            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
+            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
+            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
+            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
+            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
+            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
+            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
+            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
+            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
+            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
+            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
+            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
+            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
+            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
+            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
+            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
+            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
+            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
+            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
+            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
          }
  
-        _order = []
+        def __init__(self, ie, field_preference):
+            """
+            @param ie                  Extractor instance; its _downloader is kept as self.ydl
+            @param field_preference    Passed to evaluate_params together with the downloader's params
+            """
+            self._order = []
+            self.ydl = ie._downloader
+            self.evaluate_params(self.ydl.params, field_preference)
+            # In verbose mode, report the sort order through the downloader's write_debug
+            if ie.get_param('verbose'):
+                self.print_verbose_info(self.ydl.write_debug)
  
          def _get_field_setting(self, field, key):
              if field not in self.settings:
+                if key in ('forced', 'priority'):
+                    return False
+                self.ydl.deprecation_warning(
+                    f'Using arbitrary fields ({field}) for format sorting is deprecated '
+                    'and may be removed in a future version')
                  self.settings[field] = {}
              propObj = self.settings[field]
              if key not in propObj:
@@ -1669,7 +1770,11 @@ def add_item(field, reverse, closest, limit_text):
                  if field is None:
                      continue
                  if self._get_field_setting(field, 'type') == 'alias':
-                    field = self._get_field_setting(field, 'field')
+                    alias, field = field, self._get_field_setting(field, 'field')
+                    if self._get_field_setting(alias, 'deprecated'):
+                        self.ydl.deprecation_warning(
+                            f'Format sorting alias {alias} is deprecated '
+                            f'and may be removed in a future version. Please use {field} instead')
                  reverse = match.group('reverse') is not None
                  closest = match.group('separator') == '~'
                  limit_text = match.group('limit')
@@ -1773,11 +1878,7 @@ def calculate_preference(self, format):
      def _sort_formats(self, formats, field_preference=[]):
          if not formats:
              return
-        format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
-        format_sort.evaluate_params(self._downloader.params, field_preference)
-        if self.get_param('verbose', False):
-            format_sort.print_verbose_info(self._downloader.write_debug)
-        formats.sort(key=lambda f: format_sort.calculate_preference(f))
+        formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
  
      def _check_formats(self, formats, video_id):
          if formats:
@@ -1838,17 +1939,19 @@ def _sleep(self, timeout, video_id, msg_template=None):
      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                               transform_source=lambda s: fix_xml_ampersands(s).strip(),
                               fatal=True, m3u8_id=None, data=None, headers={}, query={}):
-        manifest = self._download_xml(
+        res = self._download_xml_handle(
              manifest_url, video_id, 'Downloading f4m manifest',
              'Unable to download f4m manifest',
              # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
              # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
              transform_source=transform_source,
              fatal=fatal, data=data, headers=headers, query=query)
-
-        if manifest is False:
+        if res is False:
              return []
  
+        manifest, urlh = res
+        manifest_url = urlh.geturl()
+
          return self._parse_f4m_formats(
              manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
              transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
@@ -1856,7 +1959,7 @@ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=
      def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None):
-        if not isinstance(manifest, compat_etree_Element) and not fatal:
+        if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
              return []
  
          # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
@@ -1895,7 +1998,7 @@ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None,
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              width = int_or_none(media_el.attrib.get('width'))
              height = int_or_none(media_el.attrib.get('height'))
-            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+            format_id = join_nonempty(f4m_id, tbr or i)
              # If <bootstrapInfo> is present, the specified f4m is a
              # stream-level manifest, and only set-level manifests may refer to
              # external resources.  See section 11.4 and section 4 of F4M spec
@@ -1957,7 +2060,7 @@ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None,
  
      def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
          return {
-            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
+            'format_id': join_nonempty(m3u8_id, 'meta'),
              'url': m3u8_url,
              'ext': ext,
              'protocol': 'm3u8',
@@ -2004,16 +2107,16 @@ def _extract_m3u8_formats_and_subtitles(
              headers=headers, query=query, video_id=video_id)
  
      def _parse_m3u8_formats_and_subtitles(
-            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
+            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
              preference=None, quality=None, m3u8_id=None, live=False, note=None,
              errnote=None, fatal=True, data=None, headers={}, query={},
              video_id=None):
          formats, subtitles = [], {}
  
-        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
-            return formats, subtitles
-
-        has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
+        has_drm = re.search('|'.join([
+            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
+            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
+        ]), m3u8_doc)
  
          def format_url(url):
              return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
@@ -2052,9 +2155,9 @@ def _extract_m3u8_playlist_indices(*args, **kwargs):
  
          if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
              formats = [{
-                'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
+                'format_id': join_nonempty(m3u8_id, idx),
                  'format_index': idx,
-                'url': m3u8_url,
+                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                  'ext': ext,
                  'protocol': entry_protocol,
                  'preference': preference,
@@ -2101,7 +2204,7 @@ def extract_media(x_media_line):
              if media_url:
                  manifest_url = format_url(media_url)
                  formats.extend({
-                    'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
+                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                      'format_note': name,
                      'format_index': idx,
                      'url': manifest_url,
@@ -2158,9 +2261,9 @@ def build_stream_name():
                      # format_id intact.
                      if not live:
                          stream_name = build_stream_name()
-                        format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
+                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                      f = {
-                        'format_id': '-'.join(map(str, filter(None, format_id))),
+                        'format_id': join_nonempty(*format_id),
                          'format_index': idx,
                          'url': manifest_url,
                          'manifest_url': m3u8_url,
@@ -2256,11 +2359,13 @@ def _xpath_ns(path, namespace=None):
          return '/'.join(out)
  
      def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
-        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
-
-        if smil is False:
+        res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
+        if res is False:
              assert not fatal
-            return []
+            return [], {}
+
+        smil, urlh = res
+        smil_url = urlh.geturl()
  
          namespace = self._parse_smil_namespace(smil)
  
@@ -2278,13 +2383,17 @@ def _extract_smil_formats(self, *args, **kwargs):
          return fmts
  
      def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
-        smil = self._download_smil(smil_url, video_id, fatal=fatal)
-        if smil is False:
+        res = self._download_smil(smil_url, video_id, fatal=fatal)
+        if res is False:
              return {}
+
+        smil, urlh = res
+        smil_url = urlh.geturl()
+
          return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
  
      def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
-        return self._download_xml(
+        return self._download_xml_handle(
              smil_url, video_id, 'Downloading SMIL file',
              'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
  
@@ -2463,11 +2572,15 @@ def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
          return subtitles
  
      def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
-        xspf = self._download_xml(
+        res = self._download_xml_handle(
              xspf_url, playlist_id, 'Downloading xpsf playlist',
              'Unable to download xspf manifest', fatal=fatal)
-        if xspf is False:
+        if res is False:
              return []
+
+        xspf, urlh = res
+        xspf_url = urlh.geturl()
+
          return self._parse_xspf(
              xspf, playlist_id, xspf_url=xspf_url,
              xspf_base_url=base_url(xspf_url))
@@ -2532,7 +2645,10 @@ def _extract_mpd_formats_and_subtitles(
          mpd_doc, urlh = res
          if mpd_doc is None:
              return [], {}
-        mpd_base_url = base_url(urlh.geturl())
+
+        # The MPD request may have been redirected, so use the final URL from here on.
+        mpd_url = urlh.geturl()
+        mpd_base_url = base_url(mpd_url)
  
          return self._parse_mpd_formats_and_subtitles(
              mpd_doc, mpd_id, mpd_base_url, mpd_url)
@@ -2624,7 +2740,7 @@ def extract_Initialization(source):
  
          mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
          formats, subtitles = [], {}
-        stream_numbers = {'audio': 0, 'video': 0}
+        stream_numbers = collections.defaultdict(int)
          for period in mpd_doc.findall(_add_ns('Period')):
              period_duration = parse_duration(period.get('duration')) or mpd_duration
              period_ms_info = extract_multisegment_info(period, {
@@ -2640,11 +2756,20 @@ def extract_Initialization(source):
                      mime_type = representation_attrib['mimeType']
                      content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
  
-                    codecs = representation_attrib.get('codecs', '')
+                    codec_str = representation_attrib.get('codecs', '')
+                    # A binary subtitle stream (application/x-rawcc) found in some YouTube livestreams
+                    if mime_type == 'application/x-rawcc':
+                        codecs = {'scodec': codec_str}
+                    else:
+                        codecs = parse_codecs(codec_str)
                      if content_type not in ('video', 'audio', 'text'):
                          if mime_type == 'image/jpeg':
                              content_type = mime_type
-                        elif codecs.split('.')[0] == 'stpp':
+                        elif codecs.get('vcodec', 'none') != 'none':
+                            content_type = 'video'
+                        elif codecs.get('acodec', 'none') != 'none':
+                            content_type = 'audio'
+                        elif codecs.get('scodec', 'none') != 'none':
                              content_type = 'text'
                          elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                              content_type = 'text'
@@ -2690,10 +2815,8 @@ def extract_Initialization(source):
                              'format_note': 'DASH %s' % content_type,
                              'filesize': filesize,
                              'container': mimetype2ext(mime_type) + '_dash',
-                            'manifest_stream_number': stream_numbers[content_type]
+                            **codecs
                          }
-                        f.update(parse_codecs(codecs))
-                        stream_numbers[content_type] += 1
                      elif content_type == 'text':
                          f = {
                              'ext': mimetype2ext(mime_type),
@@ -2766,7 +2889,8 @@ def location_key(location):
                              segment_duration = None
                              if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                  segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
-                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+                                representation_ms_info['total_number'] = int(math.ceil(
+                                    float_or_none(period_duration, segment_duration, default=0)))
                              representation_ms_info['fragments'] = [{
                                  media_location_key: media_template % {
                                      'Number': segment_number,
@@ -2857,10 +2981,16 @@ def add_segment_url():
                                  f['url'] = initialization_url
                              f['fragments'].append({location_key(initialization_url): initialization_url})
                          f['fragments'].extend(representation_ms_info['fragments'])
+                        if not period_duration:
+                            period_duration = try_get(
+                                representation_ms_info,
+                                lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                      else:
                          # Assuming direct URL to unfragmented media.
                          f['url'] = base_url
-                    if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
+                    if content_type in ('video', 'audio', 'image/jpeg'):
+                        f['manifest_stream_number'] = stream_numbers[f['url']]
+                        stream_numbers[f['url']] += 1
                          formats.append(f)
                      elif content_type == 'text':
                          subtitles.setdefault(lang or 'und', []).append(f)
@@ -2949,13 +3079,6 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
                          })
                          fragment_ctx['time'] += fragment_ctx['duration']
  
-                format_id = []
-                if ism_id:
-                    format_id.append(ism_id)
-                if stream_name:
-                    format_id.append(stream_name)
-                format_id.append(compat_str(tbr))
-
                  if stream_type == 'text':
                      subtitles.setdefault(stream_language, []).append({
                          'ext': 'ismt',
@@ -2974,7 +3097,7 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
                      })
                  elif stream_type in ('video', 'audio'):
                      formats.append({
-                        'format_id': '-'.join(format_id),
+                        'format_id': join_nonempty(ism_id, stream_name, tbr),
                          'url': ism_url,
                          'manifest_url': ism_url,
                          'ext': 'ismv' if stream_type == 'video' else 'isma',
@@ -3004,7 +3127,7 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
                      })
          return formats, subtitles
  
-    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
          def absolute_url(item_url):
              return urljoin(base_url, item_url)
  
@@ -3019,7 +3142,8 @@ def parse_content_type(content_type):
                  return f
              return {}
  
-        def _media_formats(src, cur_media_type, type_info={}):
+        def _media_formats(src, cur_media_type, type_info=None):
+            type_info = type_info or {}
              full_url = absolute_url(src)
              ext = type_info.get('ext') or determine_ext(full_url)
              if ext == 'm3u8':
@@ -3037,6 +3161,7 @@ def _media_formats(src, cur_media_type, type_info={}):
                  formats = [{
                      'url': full_url,
                      'vcodec': 'none' if cur_media_type == 'audio' else None,
+                    'ext': ext,
                  }]
              return is_plain_url, formats
  
@@ -3063,7 +3188,8 @@ def _media_formats(src, cur_media_type, type_info={}):
              media_attributes = extract_attributes(media_tag)
              src = strip_or_none(media_attributes.get('src'))
              if src:
-                _, formats = _media_formats(src, media_type)
+                f = parse_content_type(media_attributes.get('type'))
+                _, formats = _media_formats(src, media_type, f)
                  media_info['formats'].extend(formats)
              media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
              if media_content:
@@ -3180,7 +3306,7 @@ def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}
                              http_f = f.copy()
                              del http_f['manifest_url']
                              http_url = re.sub(
-                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
+                                REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
                              http_f.update({
                                  'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                  'url': http_url,
@@ -3201,7 +3327,7 @@ def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native
          formats = []
  
          def manifest_url(manifest):
-            m_url = '%s/%s' % (http_base_url, manifest)
+            m_url = f'{http_base_url}/{manifest}'
              if query:
                  m_url += '?%s' % query
              return m_url
@@ -3238,7 +3364,7 @@ def manifest_url(manifest):
              for protocol in ('rtmp', 'rtsp'):
                  if protocol not in skip_protocols:
                      formats.append({
-                        'url': '%s:%s' % (protocol, url_base),
+                        'url': f'{protocol}:{url_base}',
                          'format_id': protocol,
                          'protocol': protocol,
                      })
@@ -3398,17 +3524,13 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
          return formats
  
      def _live_title(self, name):
-        """ Generate the title for a live video """
-        now = datetime.datetime.now()
-        now_str = now.strftime('%Y-%m-%d %H:%M')
-        return name + ' ' + now_str
+        self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
+        return name
  
      def _int(self, v, name, fatal=False, **kwargs):
          res = int_or_none(v, **kwargs)
-        if 'get_attr' in kwargs:
-            print(getattr(v, kwargs['get_attr']))
          if res is None:
-            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            msg = f'Failed to extract {name}: Could not parse value {v!r}'
              if fatal:
                  raise ExtractorError(msg)
              else:
@@ -3418,7 +3540,7 @@ def _int(self, v, name, fatal=False, **kwargs):
      def _float(self, v, name, fatal=False, **kwargs):
          res = float_or_none(v, **kwargs)
          if res is None:
-            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            msg = f'Failed to extract {name}: Could not parse value {v!r}'
              if fatal:
                  raise ExtractorError(msg)
              else:
@@ -3435,9 +3557,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
  
      def _get_cookies(self, url):
          """ Return a compat_cookies_SimpleCookie with the cookies for the url """
-        req = sanitized_Request(url)
-        self._downloader.cookiejar.add_cookie_header(req)
-        return compat_cookies_SimpleCookie(req.get_header('Cookie'))
+        return compat_cookies_SimpleCookie(self._downloader._calc_cookies(url))
  
      def _apply_first_set_cookie_header(self, url_handle, cookie):
          """
@@ -3456,9 +3576,7 @@ def _apply_first_set_cookie_header(self, url_handle, cookie):
          for header, cookies in url_handle.headers.items():
              if header.lower() != 'set-cookie':
                  continue
-            if sys.version_info[0] >= 3:
-                cookies = cookies.encode('iso-8859-1')
-            cookies = cookies.decode('utf-8')
+            cookies = cookies.encode('iso-8859-1').decode('utf-8')
              cookie_value = re.search(
                  r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
              if cookie_value:
@@ -3466,34 +3584,55 @@ def _apply_first_set_cookie_header(self, url_handle, cookie):
                  self._set_cookie(domain, cookie, value)
                  break
  
-    def get_testcases(self, include_onlymatching=False):
-        t = getattr(self, '_TEST', None)
+    @classmethod
+    def get_testcases(cls, include_onlymatching=False):
+        t = getattr(cls, '_TEST', None)
          if t:
-            assert not hasattr(self, '_TESTS'), \
-                '%s has _TEST and _TESTS' % type(self).__name__
+            assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
              tests = [t]
          else:
-            tests = getattr(self, '_TESTS', [])
+            tests = getattr(cls, '_TESTS', [])
          for t in tests:
              if not include_onlymatching and t.get('only_matching', False):
                  continue
-            t['name'] = type(self).__name__[:-len('IE')]
+            t['name'] = cls.ie_key()
              yield t
  
-    def is_suitable(self, age_limit):
-        """ Test whether the extractor is generally suitable for the given
-        age limit (i.e. pornographic sites are not, all others usually are) """
-
-        any_restricted = False
-        for tc in self.get_testcases(include_onlymatching=False):
-            if tc.get('playlist', []):
-                tc = tc['playlist'][0]
-            is_restricted = age_restricted(
-                tc.get('info_dict', {}).get('age_limit'), age_limit)
-            if not is_restricted:
-                return True
-            any_restricted = any_restricted or is_restricted
-        return not any_restricted
+    @classproperty
+    def age_limit(cls):
+        """Highest age limit declared by the extractor's testcases (0 if none declare one)"""
+        return max(traverse_obj(
+            tuple(cls.get_testcases(include_onlymatching=False)),
+            (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
+
+    @classmethod
+    def is_suitable(cls, age_limit):
+        """Test whether the extractor is generally suitable for the given age limit"""
+        return not age_restricted(cls.age_limit, age_limit)
+
+    @classmethod
+    def description(cls, *, markdown=True, search_examples=None):
+        """Description of the extractor"""
+        desc = ''
+        if cls._NETRC_MACHINE:
+            if markdown:
+                desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
+            else:
+                desc += f' [{cls._NETRC_MACHINE}]'
+        if cls.IE_DESC is False:
+            desc += ' [HIDDEN]'
+        elif cls.IE_DESC:
+            desc += f' {cls.IE_DESC}'
+        if cls.SEARCH_KEY:
+            desc += f'; "{cls.SEARCH_KEY}:" prefix'
+            if search_examples:
+                _COUNTS = ('', '5', '10', 'all')
+                desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
+        if not cls.working():
+            desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
+
+        name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
+        return f'{name}:{desc}' if desc else name
  
      def extract_subtitles(self, *args, **kwargs):
          if (self.get_param('writesubtitles', False)
@@ -3511,14 +3650,18 @@ def extract_comments(self, *args, **kwargs):
  
          def extractor():
              comments = []
+            interrupted = True
              try:
                  while True:
                      comments.append(next(generator))
-            except KeyboardInterrupt:
-                interrupted = True
-                self.to_screen('Interrupted by user')
              except StopIteration:
                  interrupted = False
+            except KeyboardInterrupt:
+                self.to_screen('Interrupted by user')
+            except Exception as e:
+                if self.get_param('ignoreerrors') is not True:
+                    raise
+                self._downloader.report_error(e)
              comment_count = len(comments)
              self.to_screen(f'Extracted {comment_count} comments')
              return {
@@ -3532,11 +3675,11 @@ def _get_comments(self, *args, **kwargs):
  
      @staticmethod
      def _merge_subtitle_items(subtitle_list1, subtitle_list2):
-        """ Merge subtitle items for one language. Items with duplicated URLs
+        """ Merge subtitle items for one language. Items with duplicated URLs/data
          will be dropped. """
-        list1_urls = set([item['url'] for item in subtitle_list1])
+        list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
          ret = list(subtitle_list1)
-        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+        ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
          return ret
  
      @classmethod
@@ -3558,12 +3701,15 @@ def extract_automatic_captions(self, *args, **kwargs):
      def _get_automatic_captions(self, *args, **kwargs):
          raise NotImplementedError('This method must be implemented by subclasses')
  
+    @functools.cached_property
+    def _cookies_passed(self):
+        """Whether cookies have been passed to YoutubeDL"""
+        return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
+
      def mark_watched(self, *args, **kwargs):
          if not self.get_param('mark_watched', False):
              return
-        if (self._get_login_info()[0] is not None
-                or self.get_param('cookiefile')
-                or self.get_param('cookiesfrombrowser')):
+        if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
              self._mark_watched(*args, **kwargs)
  
      def _mark_watched(self, *args, **kwargs):
@@ -3596,7 +3742,7 @@ def _availability(is_private=None, needs_premium=None, needs_subscription=None,
              else 'public' if all_known
              else None)
  
-    def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
+    def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
          '''
          @returns            A list of values for the extractor argument given by "key"
                              or "default" if no such key is present
@@ -3604,34 +3750,43 @@ def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
          @param casesense    When false, the values are converted to lower case
          '''
          val = traverse_obj(
-            self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
+            self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
          if val is None:
              return [] if default is NO_DEFAULT else default
          return list(val) if casesense else [x.lower() for x in val]
  
+    def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
+        if not playlist_id or not video_id:
+            return not video_id
+
+        no_playlist = (smuggled_data or {}).get('force_noplaylist')
+        if no_playlist is not None:
+            return not no_playlist
+
+        video_id = '' if video_id is True else f' {video_id}'
+        playlist_id = '' if playlist_id is True else f' {playlist_id}'
+        if self.get_param('noplaylist'):
+            self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
+            return False
+        self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
+        return True
+
  
  class SearchInfoExtractor(InfoExtractor):
      """
      Base class for paged search queries extractors.
      They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
-    Instances should define _SEARCH_KEY and _MAX_RESULTS.
+    Subclasses should define _SEARCH_KEY and, optionally, _MAX_RESULTS.
      """
  
+    _MAX_RESULTS = float('inf')
+
      @classmethod
      def _make_valid_url(cls):
          return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
  
-    @classmethod
-    def suitable(cls, url):
-        return re.match(cls._make_valid_url(), url) is not None
-
      def _real_extract(self, query):
-        mobj = re.match(self._make_valid_url(), query)
-        if mobj is None:
-            raise ExtractorError('Invalid search query "%s"' % query)
-
-        prefix = mobj.group('prefix')
-        query = mobj.group('query')
+        prefix, query = self._match_valid_url(query).group('prefix', 'query')
          if prefix == '':
              return self._get_n_results(query, 1)
          elif prefix == 'all':
@@ -3639,7 +3794,7 @@ def _real_extract(self, query):
          else:
              n = int(prefix)
              if n <= 0:
-                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
+                raise ExtractorError(f'invalid download number {n} for query "{query}"')
              elif n > self._MAX_RESULTS:
                  self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                  n = self._MAX_RESULTS
@@ -3656,6 +3811,6 @@ def _search_results(self, query):
          """Returns an iterator of search results"""
          raise NotImplementedError('This method must be implemented by subclasses')
  
-    @property
-    def SEARCH_KEY(self):
-        return self._SEARCH_KEY
+    @classproperty
+    def SEARCH_KEY(cls):
+        return cls._SEARCH_KEY