[docs,cleanup] Some minor refactoring and docs improvements

[yt-dlp.git] / yt_dlp / extractor / youtube.py
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 15e0f8adbc0ac1fa3de56719b24d4867f5443715..159b0a3b9d36c2f359cbf42878636d79b72deab3 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -48,6 +48,7 @@
      parse_iso8601,
      parse_qs,
      qualities,
+    remove_end,
      remove_start,
      smuggle_url,
      str_or_none,
@@ -59,7 +60,6 @@
      unsmuggle_url,
      update_url_query,
      url_or_none,
-    urlencode_postdata,
      urljoin,
      variadic,
  )
@@ -117,6 +117,7 @@
              }
          },
          'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
+        'REQUIRE_JS_PLAYER': False
      },
      'android_embedded': {
          'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
@@ -126,7 +127,8 @@
                  'clientVersion': '16.20',
              },
          },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 55
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 55,
+        'REQUIRE_JS_PLAYER': False
      },
      'android_music': {
          'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
@@ -138,6 +140,7 @@
              }
          },
          'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
+        'REQUIRE_JS_PLAYER': False
      },
      'android_creator': {
          'INNERTUBE_CONTEXT': {
@@ -146,7 +149,8 @@
                  'clientVersion': '21.24.100',
              },
          },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 14
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
+        'REQUIRE_JS_PLAYER': False
      },
      # ios has HLS live streams
      # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
@@ -158,7 +162,8 @@
                  'clientVersion': '16.20',
              }
          },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 5
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
+        'REQUIRE_JS_PLAYER': False
      },
      'ios_embedded': {
          'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
@@ -168,7 +173,8 @@
                  'clientVersion': '16.20',
              },
          },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 66
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 66,
+        'REQUIRE_JS_PLAYER': False
      },
      'ios_music': {
          'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
@@ -179,7 +185,8 @@
                  'clientVersion': '4.32',
              },
          },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 26
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 26,
+        'REQUIRE_JS_PLAYER': False
      },
      'ios_creator': {
          'INNERTUBE_CONTEXT': {
@@ -188,7 +195,8 @@
                  'clientVersion': '21.24.100',
              },
          },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 15
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 15,
+        'REQUIRE_JS_PLAYER': False
      },
      # mweb has 'ultralow' formats
      # See: https://github.com/yt-dlp/yt-dlp/pull/557
@@ -215,6 +223,7 @@ def build_innertube_clients():
      for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
          ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
          ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
+        ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
          ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
          ytcfg['priority'] = 10 * priority(client.split('_', 1)[0])
  
@@ -237,7 +246,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
      """Provide base functions for Youtube extractors"""
  
      _RESERVED_NAMES = (
-        r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|'
+        r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|'
          r'shorts|movies|results|shared|hashtag|trending|feed|feeds|'
          r'browse|oembed|get_video_info|iframe_api|s/player|'
          r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
@@ -499,13 +508,6 @@ def _extract_client_name(self, ytcfg, default_client='web'):
              ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'],
                      lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client)
  
-    @staticmethod
-    def _extract_session_index(*data):
-        for ytcfg in data:
-            session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
-            if session_index is not None:
-                return session_index
-
      def _extract_client_version(self, ytcfg, default_client='web'):
          return self._ytcfg_get_safe(
              ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'],
@@ -584,17 +586,27 @@ def extract_yt_initial_data(self, video_id, webpage):
                   self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
              video_id)
  
-    def _extract_identity_token(self, webpage, item_id):
-        if not webpage:
-            return None
-        ytcfg = self.extract_ytcfg(item_id, webpage)
+    @staticmethod
+    def _extract_session_index(*data):
+        """
+        Index of current account in account list.
+        See: https://github.com/yt-dlp/yt-dlp/pull/519
+        """
+        for ytcfg in data:
+            session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
+            if session_index is not None:
+                return session_index
+
+    # Deprecated?
+    def _extract_identity_token(self, ytcfg=None, webpage=None):
          if ytcfg:
              token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
              if token:
                  return token
-        return self._search_regex(
-            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
-            'identity token', default=None)
+        if webpage:
+            return self._search_regex(
+                r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
+                'identity token', default=None, fatal=False)
  
      @staticmethod
      def _extract_account_syncid(*args):
@@ -609,12 +621,16 @@ def _extract_account_syncid(*args):
                  return delegated_sid
              sync_ids = (try_get(
                  data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
-                       lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
+                       lambda x: x['DATASYNC_ID']), compat_str) or '').split('||')
              if len(sync_ids) >= 2 and sync_ids[1]:
                  # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
                  # and just "user_syncid||" for primary channel. We only want the channel_syncid
                  return sync_ids[0]
  
+    @property
+    def is_authenticated(self):
+        return bool(self._generate_sapisidhash_header())
+
      def extract_ytcfg(self, video_id, webpage):
          if not webpage:
              return {}
@@ -624,33 +640,30 @@ def extract_ytcfg(self, video_id, webpage):
                  default='{}'), video_id, fatal=False) or {}
  
      def generate_api_headers(
-            self, ytcfg=None, identity_token=None, account_syncid=None,
-            visitor_data=None, api_hostname=None, default_client='web', session_index=None):
+            self, *, ytcfg=None, account_syncid=None, session_index=None,
+            visitor_data=None, identity_token=None, api_hostname=None, default_client='web'):
+
          origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
          headers = {
              'X-YouTube-Client-Name': compat_str(
                  self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)),
              'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client),
-            'Origin': origin
-        }
-        if not visitor_data and ytcfg:
-            visitor_data = try_get(
+            'Origin': origin,
+            'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg),
+            'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg),
+            'X-Goog-Visitor-Id': visitor_data or try_get(
                  self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str)
-        if identity_token:
-            headers['X-Youtube-Identity-Token'] = identity_token
-        if account_syncid:
-            headers['X-Goog-PageId'] = account_syncid
-        if session_index is None and ytcfg:
+        }
+        if session_index is None:
              session_index = self._extract_session_index(ytcfg)
          if account_syncid or session_index is not None:
              headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
-        if visitor_data:
-            headers['X-Goog-Visitor-Id'] = visitor_data
+
          auth = self._generate_sapisidhash_header(origin)
          if auth is not None:
              headers['Authorization'] = auth
              headers['X-Origin'] = origin
-        return headers
+        return {h: v for h, v in headers.items() if v is not None}
  
      @staticmethod
      def _build_api_continuation_query(continuation, ctp=None):
@@ -721,7 +734,7 @@ def _extract_alerts(cls, data):
                  if message:
                      yield alert_type, message
  
-    def _report_alerts(self, alerts, expected=True, fatal=True):
+    def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False):
          errors = []
          warnings = []
          for alert_type, alert_message in alerts:
@@ -731,7 +744,7 @@ def _report_alerts(self, alerts, expected=True, fatal=True):
                  warnings.append([alert_type, alert_message])
  
          for alert_type, alert_message in (warnings + errors[:-1]):
-            self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
+            self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message), only_once=only_once)
          if errors:
              raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
  
@@ -780,7 +793,7 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers
          while count < retries:
              count += 1
              if last_error:
-                self.report_warning('%s. Retrying ...' % last_error)
+                self.report_warning('%s. Retrying ...' % remove_end(last_error, '.'))
              try:
                  response = self._call_api(
                      ep=ep, fatal=True, headers=headers,
@@ -803,7 +816,7 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers
                      # We also want to catch all other network exceptions since errors in later pages can be troublesome
                      # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
                      if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
-                        last_error = error_to_compat_str(e.cause or e)
+                        last_error = error_to_compat_str(e.cause or e.msg)
                          if count < retries:
                              continue
                  if fatal:
@@ -815,8 +828,13 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers
              else:
                  # Youtube may send alerts if there was an issue with the continuation page
                  try:
-                    self._extract_and_report_alerts(response, expected=False)
+                    self._extract_and_report_alerts(response, expected=False, only_once=True)
                  except ExtractorError as e:
+                    # YouTube servers may return errors we want to retry on in a 200 OK response
+                    # See: https://github.com/yt-dlp/yt-dlp/issues/839
+                    if 'unknown error' in e.msg.lower():
+                        last_error = e.msg
+                        continue
                      if fatal:
                          raise
                      self.report_warning(error_to_compat_str(e))
@@ -855,7 +873,7 @@ def _extract_video(self, renderer):
              '_type': 'url',
              'ie_key': YoutubeIE.ie_key(),
              'id': video_id,
-            'url': video_id,
+            'url': f'https://www.youtube.com/watch?v={video_id}',
              'title': title,
              'description': description,
              'duration': duration,
@@ -1069,10 +1087,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          '_rtmp': {'protocol': 'rtmp'},
  
          # av01 video only formats sometimes served with "unknown" codecs
-        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
-        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
-        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
-        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '394': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
+        '395': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
+        '396': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'av01.0.01M.08'},
+        '397': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'av01.0.04M.08'},
+        '398': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'av01.0.05M.08'},
+        '399': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'av01.0.08M.08'},
+        '400': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
+        '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
      }
      _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
  
@@ -1837,8 +1859,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
      @classmethod
      def suitable(cls, url):
-        # Hack for lazy extractors until more generic solution is implemented
-        # (see #28780)
          from ..utils import parse_qs
  
          qs = parse_qs(url)
@@ -1851,14 +1871,12 @@ def __init__(self, *args, **kwargs):
          self._code_cache = {}
          self._player_cache = {}
  
-    def _extract_player_url(self, ytcfg=None, webpage=None):
-        player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
-        if not player_url and webpage:
-            player_url = self._search_regex(
-                r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
-                webpage, 'player URL', fatal=False)
+    def _extract_player_url(self, *ytcfgs, webpage=None):
+        player_url = traverse_obj(
+            ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'),
+            get_all=False, expected_type=compat_str)
          if not player_url:
-            return None
+            return
          if player_url.startswith('//'):
              player_url = 'https:' + player_url
          elif not re.match(r'https?://', player_url):
@@ -1866,6 +1884,16 @@ def _extract_player_url(self, ytcfg=None, webpage=None):
                  'https://www.youtube.com', player_url)
          return player_url
  
+    def _download_player_url(self, video_id, fatal=False):
+        res = self._download_webpage(
+            'https://www.youtube.com/iframe_api',
+            note='Downloading iframe API JS', video_id=video_id, fatal=fatal)
+        if res:
+            player_version = self._search_regex(
+                r'player\\?/([0-9a-fA-F]{8})\\?/', res, 'player version', fatal=fatal)
+            if player_version:
+                return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js'
+
      def _signature_cache_id(self, example_sig):
          """ Return a string representation of a signature """
          return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
@@ -2200,8 +2228,7 @@ def _extract_comment(self, comment_renderer, parent=None):
              'parent': parent or 'root'
          }
  
-    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
-                         ytcfg, video_id, parent=None, comment_counts=None):
+    def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None):
  
          def extract_header(contents):
              _total_comments = 0
@@ -2259,8 +2286,8 @@ def extract_thread(contents):
                  if comment_replies_renderer:
                      comment_counts[2] += 1
                      comment_entries_iter = self._comment_entries(
-                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
-                        video_id, parent=comment.get('id'), comment_counts=comment_counts)
+                        comment_replies_renderer, ytcfg, video_id,
+                        parent=comment.get('id'), comment_counts=comment_counts)
  
                      for reply_comment in comment_entries_iter:
                          yield reply_comment
@@ -2285,7 +2312,7 @@ def extract_thread(contents):
          for page_num in itertools.count(0):
              if not continuation:
                  break
-            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
+            headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data)
              comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
              if page_num == 0:
                  if is_first_continuation:
@@ -2385,18 +2412,10 @@ def _generate_comment_continuation(video_id):
      def _extract_comments(self, ytcfg, video_id, contents, webpage):
          """Entry for comment extraction"""
          def _real_comment_extract(contents):
-            if isinstance(contents, list):
-                for entry in contents:
-                    for key, renderer in entry.items():
-                        if key not in known_entry_comment_renderers:
-                            continue
-                        yield from self._comment_entries(
-                            renderer, video_id=video_id, ytcfg=ytcfg,
-                            identity_token=self._extract_identity_token(webpage, item_id=video_id),
-                            account_syncid=self._extract_account_syncid(ytcfg))
-                        break
+            yield from self._comment_entries(
+                traverse_obj(contents, (..., 'itemSectionRenderer'), get_all=False), ytcfg, video_id)
+
          comments = []
-        known_entry_comment_renderers = ('itemSectionRenderer',)
          estimated_total = 0
          max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
          # Force English regardless of account setting to prevent parsing issues
@@ -2421,7 +2440,11 @@ def _real_comment_extract(contents):
          }
  
      @staticmethod
-    def _generate_player_context(sts=None):
+    def _get_checkok_params():
+        return {'contentCheckOk': True, 'racyCheckOk': True}
+
+    @classmethod
+    def _generate_player_context(cls, sts=None):
          context = {
              'html5Preference': 'HTML5_PREF_WANTS',
          }
@@ -2431,8 +2454,7 @@ def _generate_player_context(sts=None):
              'playbackContext': {
                  'contentPlaybackContext': context
              },
-            'contentCheckOk': True,
-            'racyCheckOk': True
+            **cls._get_checkok_params()
          }
  
      @staticmethod
@@ -2451,14 +2473,13 @@ def _is_agegated(player_response):
      def _is_unplayable(player_response):
          return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
  
-    def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr):
+    def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr):
  
          session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
          syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
-        sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False)
+        sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
          headers = self.generate_api_headers(
-            player_ytcfg, identity_token, syncid,
-            default_client=client, session_index=session_index)
+            ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client)
  
          yt_query = {'videoId': video_id}
          yt_query.update(self._generate_player_context(sts))
@@ -2500,7 +2521,7 @@ def _extract_player_ytcfg(self, client, video_id):
          webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
          return self.extract_ytcfg(video_id, webpage) or {}
  
-    def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token):
+    def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg):
          initial_pr = None
          if webpage:
              initial_pr = self._extract_yt_initial_variable(
@@ -2509,6 +2530,7 @@ def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, pl
  
          original_clients = clients
          clients = clients[::-1]
+        prs = []
  
          def append_client(client_name):
              if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
@@ -2518,23 +2540,33 @@ def append_client(client_name):
          # extraction of some data. So we return the initial_pr with formats
          # stripped out even if not requested by the user
          # See: https://github.com/yt-dlp/yt-dlp/issues/501
-        yielded_pr = False
          if initial_pr:
              pr = dict(initial_pr)
              pr['streamingData'] = None
-            yielded_pr = True
-            yield pr
+            prs.append(pr)
  
          last_error = None
+        tried_iframe_fallback = False
+        player_url = None
          while clients:
              client = clients.pop()
              player_ytcfg = master_ytcfg if client == 'web' else {}
              if 'configs' not in self._configuration_arg('player_skip'):
                  player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
  
+            player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage)
+            require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER')
+            if 'js' in self._configuration_arg('player_skip'):
+                require_js_player = False
+                player_url = None
+
+            if not player_url and not tried_iframe_fallback and require_js_player:
+                player_url = self._download_player_url(video_id)
+                tried_iframe_fallback = True
+
              try:
                  pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
-                    client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr)
+                    client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr)
              except ExtractorError as e:
                  if last_error:
                      self.report_warning(last_error)
@@ -2542,19 +2574,19 @@ def append_client(client_name):
                  continue
  
              if pr:
-                yielded_pr = True
-                yield pr
+                prs.append(pr)
  
              # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
-            if client.endswith('_agegate') and self._is_unplayable(pr) and self._generate_sapisidhash_header():
+            if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated:
                  append_client(client.replace('_agegate', '_creator'))
              elif self._is_agegated(pr):
                  append_client(f'{client}_agegate')
  
          if last_error:
-            if not yielded_pr:
+            if not len(prs):
                  raise last_error
              self.report_warning(last_error)
+        return prs, player_url
  
      def _extract_formats(self, streaming_data, video_id, player_url, is_live):
          itags, stream_ids = [], []
@@ -2621,7 +2653,8 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live):
                  'filesize': int_or_none(fmt.get('contentLength')),
                  'format_id': itag,
                  'format_note': ', '.join(filter(None, (
-                    audio_track.get('displayName'),
+                    '%s%s' % (audio_track.get('displayName') or '',
+                              ' (default)' if audio_track.get('audioIsDefault') else ''),
                      fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))),
                  'fps': int_or_none(fmt.get('fps')),
                  'height': height,
@@ -2630,6 +2663,7 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live):
                  'url': fmt_url,
                  'width': int_or_none(fmt.get('width')),
                  'language': audio_track.get('id', '').split('.')[0],
+                'language_preference': 1 if audio_track.get('audioIsDefault') else -1,
              }
              mime_mobj = re.match(
                  r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
@@ -2699,16 +2733,16 @@ def _real_extract(self, url):
  
          base_url = self.http_scheme() + '//www.youtube.com/'
          webpage_url = base_url + 'watch?v=' + video_id
-        webpage = self._download_webpage(
-            webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
+        webpage = None
+        if 'webpage' not in self._configuration_arg('player_skip'):
+            webpage = self._download_webpage(
+                webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
  
          master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
-        player_url = self._extract_player_url(master_ytcfg, webpage)
-        identity_token = self._extract_identity_token(webpage, video_id)
  
-        player_responses = list(self._extract_player_responses(
+        player_responses, player_url = self._extract_player_responses(
              self._get_requested_clients(url, smuggled_data),
-            video_id, webpage, master_ytcfg, player_url, identity_token))
+            video_id, webpage, master_ytcfg)
  
          get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
  
@@ -2791,8 +2825,7 @@ def feed_entry(name):
  
          if not formats:
              if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
-                self.raise_no_formats(
-                    'This video is DRM protected.', expected=True)
+                self.report_drm(video_id)
              pemr = get_first(
                  playability_statuses,
                  ('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {}
@@ -2817,7 +2850,7 @@ def feed_entry(name):
  
          # Source is given priority since formats that throttle are given lower source_preference
          # When throttling issue is fully fixed, remove this
-        self._sort_formats(formats, ('quality', 'height', 'fps', 'source'))
+        self._sort_formats(formats, ('quality', 'res', 'fps', 'source', 'codec:vp9.2', 'lang'))
  
          keywords = get_first(video_details, 'keywords', expected_type=list) or []
          if not keywords and webpage:
@@ -2975,7 +3008,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
                          continue
                      process_language(
                          subtitles, base_url, lang_code,
-                        traverse_obj(caption_track, ('name', 'simpleText')),
+                        traverse_obj(caption_track, ('name', 'simpleText'), ('name', 'runs', ..., 'text'), get_all=False),
                          {})
                      continue
                  automatic_captions = {}
@@ -3022,13 +3055,12 @@ def process_language(container, base_url, lang_code, sub_name, query):
                  webpage, self._YT_INITIAL_DATA_RE, video_id,
                  'yt initial data')
          if not initial_data:
-            headers = self.generate_api_headers(
-                master_ytcfg, identity_token, self._extract_account_syncid(master_ytcfg),
-                session_index=self._extract_session_index(master_ytcfg))
-
+            query = {'videoId': video_id}
+            query.update(self._get_checkok_params())
              initial_data = self._extract_response(
                  item_id=video_id, ep='next', fatal=False,
-                ytcfg=master_ytcfg, headers=headers, query={'videoId': video_id},
+                ytcfg=master_ytcfg, query=query,
+                headers=self.generate_api_headers(ytcfg=master_ytcfg),
                  note='Downloading initial data API JSON')
  
          try:
@@ -3167,40 +3199,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
              needs_auth=info['age_limit'] >= 18,
              is_unlisted=None if is_private is None else is_unlisted)
  
-        # get xsrf for annotations or comments
-        get_annotations = self.get_param('writeannotations', False)
-        get_comments = self.get_param('getcomments', False)
-        if get_annotations or get_comments:
-            xsrf_token = None
-            if master_ytcfg:
-                xsrf_token = try_get(master_ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
-            if not xsrf_token:
-                xsrf_token = self._search_regex(
-                    r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
-                    webpage, 'xsrf token', group='xsrf_token', fatal=False)
-
-        # annotations
-        if get_annotations:
-            invideo_url = get_first(
-                player_responses,
-                ('annotations', 0, 'playerAnnotationsUrlsRenderer', 'invideoUrl'),
-                expected_type=str)
-            if xsrf_token and invideo_url:
-                xsrf_field_name = None
-                if master_ytcfg:
-                    xsrf_field_name = try_get(master_ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
-                if not xsrf_field_name:
-                    xsrf_field_name = self._search_regex(
-                        r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
-                        webpage, 'xsrf field name',
-                        group='xsrf_field_name', default='session_token')
-                info['annotations'] = self._download_webpage(
-                    self._proto_relative_url(invideo_url),
-                    video_id, note='Downloading annotations',
-                    errnote='Unable to download video annotations', fatal=False,
-                    data=urlencode_postdata({xsrf_field_name: xsrf_token}))
-
-        if get_comments:
+        if self.get_param('getcomments', False):
              info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
  
          self.mark_watched(video_id, player_responses)
@@ -3833,7 +3832,7 @@ def _rich_grid_entries(self, contents):
                  if entry:
                      yield entry
      '''
-    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
+    def _entries(self, tab, item_id, account_syncid, ytcfg):
  
      def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
              contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
@@ -3890,7 +3889,8 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
          for page_num in itertools.count(1):
              if not continuation:
                  break
-            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
+            headers = self.generate_api_headers(
+                ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data)
              response = self._extract_response(
                  item_id='%s page %s' % (item_id, page_num),
                  query=continuation, headers=headers, ytcfg=ytcfg,
@@ -4044,7 +4044,6 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs):
          return self.playlist_result(
              self._entries(
                  selected_tab, playlist_id,
-                self._extract_identity_token(webpage, item_id),
                  self._extract_account_syncid(ytcfg, data), ytcfg),
              **metadata)
  
@@ -4052,8 +4051,7 @@ def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
          first_id = last_id = None
          ytcfg = self.extract_ytcfg(playlist_id, webpage)
          headers = self.generate_api_headers(
-            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
-            identity_token=self._extract_identity_token(webpage, item_id=playlist_id))
+            ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data))
          for page_num in itertools.count(1):
              videos = list(self._playlist_entries(playlist))
              if not videos:
@@ -4169,10 +4167,8 @@ def _reload_with_unavailable_videos(self, item_id, data, webpage):
  
          ytcfg = self.extract_ytcfg(item_id, webpage)
          headers = self.generate_api_headers(
-            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
-            identity_token=self._extract_identity_token(webpage, item_id=item_id),
-            visitor_data=try_get(
-                self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
+            ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
+            visitor_data=try_get(self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
          query = {
              'params': params or 'wgYCCAA=',
              'browseId': browse_id or 'VL%s' % item_id
@@ -4280,7 +4276,7 @@ def get_mobj(url):
          if video_id and playlist_id:
              if self.get_param('noplaylist'):
                  self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
-                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
+                return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id)
              self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))
  
          webpage, data = self._extract_webpage(url, item_id)
@@ -4316,7 +4312,7 @@ def get_mobj(url):
          # YouTube sometimes provides a button to reload playlist with unavailable videos.
          if 'no-youtube-unavailable-videos' not in compat_opts:
              data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
-        self._extract_and_report_alerts(data)
+        self._extract_and_report_alerts(data, only_once=True)
          tabs = try_get(
              data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
          if tabs:
@@ -4333,7 +4329,7 @@ def get_mobj(url):
          if video_id:
              if mobj['tab'] != '/live':  # live tab is expected to redirect to video
                  self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
-            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
+            return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id)
  
          raise ExtractorError('Unable to recognize tab page')
  
@@ -4723,6 +4719,16 @@ def _real_extract(self, url):
              expected=True)
  
  
+class YoutubeClipIE(InfoExtractor):  # Stub for youtube.com/clip/ URLs: warns, then re-dispatches the URL (see _real_extract)
+    IE_NAME = 'youtube:clip'
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/'  # No capture group: the clip ID after /clip/ is not extracted
+
+    def _real_extract(self, url):
+        self.report_warning('YouTube clips are not currently supported. The entire video will be downloaded instead')
+        return self.url_result(url, 'Generic')  # URL is passed on unchanged; 'Generic' is presumably the target extractor key - confirm against url_result
+
+
  class YoutubeTruncatedIDIE(InfoExtractor):
      IE_NAME = 'youtube:truncated_id'
      IE_DESC = False  # Do not list