[extractor/youtube] Fix comment loop detection for pinned comments (#6714)
index 4165d795cbf7cad12b281dd08984d8b237173e64..6dc36f9b998db35131d1e8daf5f1ed84d23a3134 100644
@@ -3316,9 +3316,17 @@ def extract_thread(contents):
                 comment = self._extract_comment(comment_renderer, parent)
                 if not comment:
                     continue
+                is_pinned = bool(traverse_obj(comment_renderer, 'pinnedCommentBadge'))
+                comment_id = comment['id']
+                if is_pinned:
+                    tracker['pinned_comment_ids'].add(comment_id)
                 # Sometimes YouTube may break and give us infinite looping comments.
                 # See: https://github.com/yt-dlp/yt-dlp/issues/6290
-                if comment['id'] in tracker['seen_comment_ids']:
+                if comment_id in tracker['seen_comment_ids']:
+                    if comment_id in tracker['pinned_comment_ids'] and not is_pinned:
+                        # Pinned comments may appear a second time in newest first sort
+                        # See: https://github.com/yt-dlp/yt-dlp/issues/6712
+                        continue
                     self.report_warning('Detected YouTube comments looping. Stopping comment extraction as we probably cannot get any more.')
                     yield
                 else:
@@ -3348,7 +3356,9 @@ def extract_thread(contents):
                 current_page_thread=0,
                 total_parent_comments=0,
                 total_reply_comments=0,
-                seen_comment_ids=set())
+                seen_comment_ids=set(),
+                pinned_comment_ids=set()
+            )
 
         # TODO: Deprecated
         # YouTube comments have a max depth of 2
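The exemption added above boils down to: a comment ID first seen with a pinned badge may reappear once without the badge (newest-first sort repeats the pinned comment inside the regular feed), while any other repeated ID still trips the looping guard. A minimal standalone sketch of that rule, with plain dicts standing in for the renderer objects (the field names here are illustrative, not the yt-dlp API):

    def dedupe(comments):
        # comments: iterable of {'id': str, 'is_pinned': bool}
        seen_ids, pinned_ids = set(), set()
        for c in comments:
            if c['is_pinned']:
                pinned_ids.add(c['id'])
            if c['id'] in seen_ids:
                if c['id'] in pinned_ids and not c['is_pinned']:
                    continue  # pinned comment shown again inline; skip, don't abort
                return  # any other repeat: assume the looping bug and stop
            seen_ids.add(c['id'])
            yield c

    # Newest-first sort lists the pinned comment twice; only one copy survives:
    feed = [{'id': 'a', 'is_pinned': True},
            {'id': 'b', 'is_pinned': False},
            {'id': 'a', 'is_pinned': False}]
    assert [c['id'] for c in dedupe(feed)] == ['a', 'b']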
@@ -3630,6 +3640,7 @@ def _needs_live_processing(self, live_status, duration):
             return live_status
 
     def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration):
+        CHUNK_SIZE = 10 << 20
         itags, stream_ids = collections.defaultdict(set), []
         itag_qualities, res_qualities = {}, {0: None}
         q = qualities([
@@ -3642,6 +3653,13 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l
         streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...))
         all_formats = self._configuration_arg('include_duplicate_formats')
 
+        def build_fragments(f):
+            return LazyList({
+                'url': update_url_query(f['url'], {
+                    'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, f["filesize"])}'
+                })
+            } for range_start in range(0, f['filesize'], CHUNK_SIZE))
+
         for fmt in streaming_formats:
             if fmt.get('targetDurationSec'):
                 continue
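The hoisted build_fragments helper splits a progressive download into inclusive HTTP byte ranges of CHUNK_SIZE (10 MiB) each. The arithmetic can be checked in isolation; this sketch mirrors the range expression from the diff with a plain list instead of LazyList and leaves out the URL handling:

    CHUNK_SIZE = 10 << 20  # 10 MiB, as above

    def fragment_ranges(filesize):
        # Upper bound mirrors the diff: min(start + CHUNK_SIZE - 1, filesize),
        # i.e. the final fragment's end is clamped to filesize itself.
        return [f'{start}-{min(start + CHUNK_SIZE - 1, filesize)}'
                for start in range(0, filesize, CHUNK_SIZE)]

    print(fragment_ranges(25 << 20))
    # ['0-10485759', '10485760-20971519', '20971520-26214400']

Lifting the generator (and CHUNK_SIZE) to function scope lets the fragment list be built wherever a format dict needs chunked download, instead of repeating the inline generator expression that the hunk below removes.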
@@ -3771,17 +3789,12 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l
             if single_stream and dct.get('ext'):
                 dct['container'] = dct['ext'] + '_dash'
 
-            CHUNK_SIZE = 10 << 20
             if dct['filesize']:
                 yield {
                     **dct,
                     'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'],
                     'protocol': 'http_dash_segments',
-                    'fragments': LazyList({
-                        'url': update_url_query(dct['url'], {
-                            'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, dct["filesize"])}'
-                        })
-                    } for range_start in range(0, dct['filesize'], CHUNK_SIZE))
+                    'fragments': build_fragments(dct),
                 }
                 if not all_formats:
                     continue
@@ -4251,22 +4264,25 @@ def process_language(container, base_url, lang_code, sub_name, query):
         initial_data = None
         if webpage:
             initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False)
+            if not traverse_obj(initial_data, 'contents'):
+                self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.')
+                initial_data = None
         if not initial_data:
             query = {'videoId': video_id}
             query.update(self._get_checkok_params())
             initial_data = self._extract_response(
                 item_id=video_id, ep='next', fatal=False,
-                ytcfg=master_ytcfg, query=query,
+                ytcfg=master_ytcfg, query=query, check_get_keys='contents',
                 headers=self.generate_api_headers(ytcfg=master_ytcfg),
                 note='Downloading initial data API JSON')
 
         info['comment_count'] = traverse_obj(initial_data, (
             'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'itemSectionRenderer',
-            'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount', 'simpleText'
+            'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount'
         ), (
             'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] == 'comment-item-section',
-            'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo', 'runs', ..., 'text'
-        ), expected_type=int_or_none, get_all=False)
+            'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo'
+        ), expected_type=self._get_count, get_all=False)
 
         try:  # This will error if there is no livechat
             initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
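The comment-count change stops the traversal one level higher and swaps int_or_none for the extractor's _get_count helper, so abbreviated strings such as '1.4K' (which int_or_none rejects) are parsed too. A rough standalone illustration using yt-dlp's public parse_count as a stand-in for _get_count (the sample nodes are fabricated for the demo):

    from yt_dlp.utils import parse_count, traverse_obj

    def get_count(node):
        # stand-in for the extractor's _get_count: dig out any text, then parse it
        text = traverse_obj(node, 'simpleText', ('runs', ..., 'text'), get_all=False)
        return parse_count(text)  # '1.4K' -> 1400

    node = {'commentCount': {'simpleText': '1.4K'}}          # fabricated sample
    info = {'contextualInfo': {'runs': [{'text': '1.4K'}]}}  # fabricated sample
    assert traverse_obj(node, 'commentCount', expected_type=get_count) == 1400
    assert traverse_obj(info, 'contextualInfo', expected_type=get_count) == 1400

When expected_type is a callable, traverse_obj applies it to each matched value and discards None results, which is what lets a parser double as a type filter here.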