[ie/crunchyroll] Fix stream extraction (#10005)

[yt-dlp.git] / yt_dlp / extractor / youtube.py
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index a5fe179c293c42461927e2b57392925698d05c59..e676c5cde24c96e4f49935b765ca9ebaafdff2f5 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2353,6 +2353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'format': '17',  # 3gp format available on android
                  'extractor_args': {'youtube': {'player_client': ['android']}},
              },
+            'skip': 'android client broken',
          },
          {
              # Skip download of additional client configs (remix client config in this case)
@@ -2730,7 +2731,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'heatmap': 'count:100',
              },
              'params': {
-                'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}},
+                'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}},
              },
          },
      ]
@@ -3317,7 +3318,36 @@ def _extract_heatmap(self, data):
                  'value': ('intensityScoreNormalized', {float_or_none}),
              })) or None
  
-    def _extract_comment(self, comment_renderer, parent=None):
+    def _extract_comment(self, entities, parent=None):
+        comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
+        if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))):
+            return
+
+        toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
+        time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
+
+        return {
+            'id': comment_id,
+            'parent': parent or 'root',
+            **traverse_obj(comment_entity_payload, {
+                'text': ('properties', 'content', 'content', {str}),
+                'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
+                'author_id': ('author', 'channelId', {self.ucid_or_none}),
+                'author': ('author', 'displayName', {str}),
+                'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
+                'author_is_uploader': ('author', 'isCreator', {bool}),
+                'author_is_verified': ('author', 'isVerified', {bool}),
+                'author_url': ('author', 'channelCommand', 'innertubeCommand', (
+                    ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url')
+                ), {lambda x: urljoin('https://www.youtube.com', x)}),
+            }, get_all=False),
+            'is_favorited': (None if toolbar_entity_payload is None else
+                             toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'),
+            '_time_text': time_text,  # FIXME: non-standard, but we need a way of showing that it is an estimate.
+            'timestamp': self._parse_time_text(time_text),
+        }
+
+    def _extract_comment_old(self, comment_renderer, parent=None):
          comment_id = comment_renderer.get('commentId')
          if not comment_id:
              return
@@ -3398,21 +3428,39 @@ def extract_header(contents):
                  break
              return _continuation
  
-        def extract_thread(contents):
+        def extract_thread(contents, entity_payloads):
              if not parent:
                  tracker['current_page_thread'] = 0
              for content in contents:
                  if not parent and tracker['total_parent_comments'] >= max_parents:
                      yield
                  comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
-                comment_renderer = get_first(
-                    (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
-                    expected_type=dict, default={})
  
-                comment = self._extract_comment(comment_renderer, parent)
+                # old comment format: comment data is inlined as a commentRenderer
+                if not entity_payloads:
+                    comment_renderer = get_first(
+                        (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
+                        expected_type=dict, default={})
+
+                    comment = self._extract_comment_old(comment_renderer, parent)
+
+                # new comment format: commentViewModel keys reference entries in entity_payloads
+                else:
+                    view_model = (
+                        traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
+                        or traverse_obj(content, ('commentViewModel', {dict})))
+                    comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
+                    if not comment_keys:
+                        continue
+                    entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
+                    comment = self._extract_comment(entities, parent)
+                    if comment:
+                        comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None
+
                  if not comment:
                      continue
                  comment_id = comment['id']
+
                  if comment.get('is_pinned'):
                      tracker['pinned_comment_ids'].add(comment_id)
                  # Sometimes YouTube may break and give us infinite looping comments.
@@ -3505,7 +3553,7 @@ def extract_thread(contents):
              check_get_keys = None
              if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
                  check_get_keys = [[*continuation_items_path, ..., (
-                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
+                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
              try:
                  response = self._extract_response(
                      item_id=None, query=continuation,
@@ -3529,6 +3577,7 @@ def extract_thread(contents):
                  raise
              is_forced_continuation = False
              continuation = None
+            mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
              for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
                  if is_first_continuation:
                      continuation = extract_header(continuation_items)
@@ -3537,7 +3586,7 @@ def extract_thread(contents):
                          break
                      continue
  
-                for entry in extract_thread(continuation_items):
+                for entry in extract_thread(continuation_items, mutations):
                      if not entry:
                          return
                      yield entry
@@ -3614,8 +3663,6 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
          yt_query = {
              'videoId': video_id,
          }
-        if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'):
-            yt_query['params'] = 'CgIIAQ=='
  
          pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
          if pp_arg:
@@ -3631,19 +3678,24 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
  
      def _get_requested_clients(self, url, smuggled_data):
          requested_clients = []
-        default = ['ios', 'android', 'web']
+        android_clients = []
+        default = ['ios', 'web']
          allowed_clients = sorted(
              (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'),
              key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
          for client in self._configuration_arg('player_client'):
-            if client in allowed_clients:
-                requested_clients.append(client)
-            elif client == 'default':
+            if client == 'default':
                  requested_clients.extend(default)
              elif client == 'all':
                  requested_clients.extend(allowed_clients)
-            else:
+            elif client not in allowed_clients:
                  self.report_warning(f'Skipping unsupported client {client}')
+            elif client.startswith('android'):
+                android_clients.append(client)
+            else:
+                requested_clients.append(client)
+        # Force deprioritization of broken Android clients for format de-duplication
+        requested_clients.extend(android_clients)
          if not requested_clients:
              requested_clients = default
  
@@ -3862,6 +3914,14 @@ def build_fragments(f):
                      f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
  
              client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
+            # Android client formats are broken due to integrity check enforcement
+            # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
+            is_broken = client_name and client_name.startswith(short_client_name('android'))
+            if is_broken:
+                self.report_warning(
+                    f'{video_id}: Android client formats are broken and may yield HTTP Error 403. '
+                    'They will be deprioritized', only_once=True)
+
              name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
              fps = int_or_none(fmt.get('fps')) or 0
              dct = {
@@ -3874,7 +3934,7 @@ def build_fragments(f):
                      name, fmt.get('isDrc') and 'DRC',
                      try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                      try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
-                    throttled and 'THROTTLED', is_damaged and 'DAMAGED',
+                    throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN',
                      (self.get_param('verbose') or all_formats) and client_name,
                      delim=', '),
                  # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
@@ -3892,8 +3952,8 @@ def build_fragments(f):
                  'language': join_nonempty(audio_track.get('id', '').split('.')[0],
                                            'desc' if language_preference < -1 else '') or None,
                  'language_preference': language_preference,
-                # Strictly de-prioritize damaged and 3gp formats
-                'preference': -10 if is_damaged else -2 if itag == '17' else None,
+                # Strictly de-prioritize broken, damaged and 3gp formats
+                'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None,
              }
              mime_mobj = re.match(
                  r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')