[extractor/youtube] Fix continuation loop with no comments (#7148)

[yt-dlp.git] / yt_dlp / extractor / youtube.py
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 80edcd77dac8094976da18e6bf5d7acc1ebf5106..ae4b58205fdbd853f5369e4d9327a4bbbd211b4e 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -1038,6 +1038,13 @@ def _extract_video(self, renderer):
                        else self._get_count({'simpleText': view_count_text}))
          view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count'
  
+        channel = (self._get_text(renderer, 'ownerText', 'shortBylineText')
+                   or self._get_text(reel_header_renderer, 'channelTitleText'))
+
+        channel_handle = traverse_obj(renderer, (
+            'shortBylineText', 'runs', ..., 'navigationEndpoint',
+            (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl'))),
+            expected_type=self.handle_from_url, get_all=False)
          return {
              '_type': 'url',
              'ie_key': YoutubeIE.ie_key(),
@@ -1047,9 +1054,11 @@ def _extract_video(self, renderer):
              'description': description,
              'duration': duration,
              'channel_id': channel_id,
-            'channel': (self._get_text(renderer, 'ownerText', 'shortBylineText')
-                        or self._get_text(reel_header_renderer, 'channelTitleText')),
+            'channel': channel,
              'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
+            'uploader': channel,
+            'uploader_id': channel_handle,
+            'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
              'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'),
              'timestamp': (self._parse_time_text(time_text)
                            if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE)
@@ -3305,7 +3314,7 @@ def extract_header(contents):
                  expected_comment_count = self._get_count(
                      comments_header_renderer, 'countText', 'commentsCount')
  
-                if expected_comment_count:
+                if expected_comment_count is not None:
                      tracker['est_total'] = expected_comment_count
                      self.to_screen(f'Downloading ~{expected_comment_count} comments')
                  comment_sort_index = int(get_single_config_arg('comment_sort') != 'top')  # 1 = new, 0 = top
@@ -3376,7 +3385,7 @@ def extract_thread(contents):
          if not tracker:
              tracker = dict(
                  running_total=0,
-                est_total=0,
+                est_total=None,
                  current_page_thread=0,
                  total_parent_comments=0,
                  total_reply_comments=0,
@@ -3409,11 +3418,13 @@ def extract_thread(contents):
              continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id))
              is_forced_continuation = True
  
+        continuation_items_path = (
+            'onResponseReceivedEndpoints', ..., ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems')
          for page_num in itertools.count(0):
              if not continuation:
                  break
              headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))
-            comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})"
+            comment_prog_str = f"({tracker['running_total']}/~{tracker['est_total']})"
              if page_num == 0:
                  if is_first_continuation:
                      note_prefix = 'Downloading comment section API JSON'
@@ -3424,11 +3435,18 @@ def extract_thread(contents):
                  note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                      '       ' if parent else '', ' replies' if parent else '',
                      page_num, comment_prog_str)
+
+            # Do a deep check for incomplete data, as YouTube sometimes returns no comments for a continuation.
+            # Skip the check if YouTube reports that the comment count is 0.
+            check_get_keys = None
+            if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
+                check_get_keys = [[*continuation_items_path, ..., (
+                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
              try:
                  response = self._extract_response(
                      item_id=None, query=continuation,
                      ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
-                    check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None)
+                    check_get_keys=check_get_keys)
              except ExtractorError as e:
                  # Ignore incomplete data error for replies if retries didn't work.
                  # This is to allow any other parent comments and comment threads to be downloaded.
@@ -3440,15 +3458,8 @@ def extract_thread(contents):
                  else:
                      raise
              is_forced_continuation = False
-            continuation_contents = traverse_obj(
-                response, 'onResponseReceivedEndpoints', expected_type=list, default=[])
-
              continuation = None
-            for continuation_section in continuation_contents:
-                continuation_items = traverse_obj(
-                    continuation_section,
-                    (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'),
-                    get_all=False, expected_type=list) or []
+            for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
                  if is_first_continuation:
                      continuation = extract_header(continuation_items)
                      is_first_continuation = False
@@ -5851,7 +5862,25 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
              'uploader_id': '@colethedj1894',
              'uploader': 'colethedj',
          },
+        'playlist': [{
+            'info_dict': {
+                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+                'id': 'BaW_jenozKc',
+                '_type': 'url',
+                'ie_key': 'Youtube',
+                'duration': 10,
+                'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+                'channel_url': 'https://www.youtube.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
+                'view_count': int,
+                'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
+                'channel': 'Philipp Hagemeister',
+                'uploader_id': '@PhilippHagemeister',
+                'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
+                'uploader': 'Philipp Hagemeister',
+            }
+        }],
          'playlist_count': 1,
+        'params': {'extract_flat': True},
      }, {
          'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData',
          'url': 'https://www.youtube.com/feed/recommended',
@@ -6152,6 +6181,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
                  'channel_url': str,
                  'concurrent_view_count': int,
                  'channel': str,
+                'uploader': str,
+                'uploader_url': str,
+                'uploader_id': str
              }
          }],
          'params': {'extract_flat': True, 'playlist_items': '1'},