jfr.im git - yt-dlp.git/commitdiff
[extractor] Generalize `getcomments` implementation
authorpukkandan <redacted>
Tue, 12 Oct 2021 09:50:50 +0000 (15:20 +0530)
committerpukkandan <redacted>
Tue, 12 Oct 2021 09:51:30 +0000 (15:21 +0530)
yt_dlp/extractor/bannedvideo.py
yt_dlp/extractor/common.py
yt_dlp/extractor/youtube.py

index 8f8f5ef5f2ef22504d3a56c08fb9f80261e0d32c..3db1151f6d3a628b0823bdb9fc7cb8084fd9956a 100644 (file)
@@ -97,21 +97,16 @@ def _call_api(self, video_id, id, operation, note):
                 'query': self._GRAPHQL_QUERIES[operation]
             }).encode('utf8')).get('data')
 
-    def _extract_comments(self, video_id, comments, comment_data):
+    def _get_comments(self, video_id, comments, comment_data):
+        yield from comments
         for comment in comment_data.copy():
             comment_id = comment.get('_id')
             if comment.get('replyCount') > 0:
                 reply_json = self._call_api(
                     video_id, comment_id, 'GetCommentReplies',
                     f'Downloading replies for comment {comment_id}')
-                comments.extend(
-                    self._parse_comment(reply, comment_id)
-                    for reply in reply_json.get('getCommentReplies'))
-
-        return {
-            'comments': comments,
-            'comment_count': len(comments),
-        }
+                for reply in reply_json.get('getCommentReplies'):
+                    yield self._parse_comment(reply, comment_id)
 
     @staticmethod
     def _parse_comment(comment_data, parent):
@@ -159,7 +154,5 @@ def _real_extract(self, url):
             'tags': [tag.get('name') for tag in video_info.get('tags')],
             'availability': self._availability(is_unlisted=video_info.get('unlisted')),
             'comments': comments,
-            '__post_extractor': (
-                (lambda: self._extract_comments(video_id, comments, video_json.get('getVideoComments')))
-                if self.get_param('getcomments') else None)
+            '__post_extractor': self.extract_comments(video_id, comments, video_json.get('getVideoComments'))
         }
index d02a808b6ba90308ea9115b89e35082ebc8adef8..5b7b8891aa48e6dbfc82aacfb4d96c7bdd93c507 100644 (file)
@@ -3502,6 +3502,32 @@ def extract_subtitles(self, *args, **kwargs):
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')
 
+    def extract_comments(self, *args, **kwargs):
+        if not self.get_param('getcomments'):
+            return None
+        generator = self._get_comments(*args, **kwargs)
+
+        def extractor():
+            comments = []
+            try:
+                while True:
+                    comments.append(next(generator))
+            except KeyboardInterrupt:
+                interrupted = True
+                self.to_screen('Interrupted by user')
+            except StopIteration:
+                interrupted = False
+            comment_count = len(comments)
+            self.to_screen(f'Extracted {comment_count} comments')
+            return {
+                'comments': comments,
+                'comment_count': None if interrupted else comment_count
+            }
+        return extractor
+
+    def _get_comments(self, *args, **kwargs):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
     @staticmethod
     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
         """ Merge subtitle items for one language. Items with duplicated URLs
index 41fd0aef7e8f2cd1f0cdca6e61b277af2388e34f..3e93c99342da9d2ffaf52a517598815786e87be7 100644 (file)
@@ -2241,7 +2241,6 @@ def _extract_comment(self, comment_renderer, parent=None):
     def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None):
 
         def extract_header(contents):
-            _total_comments = 0
             _continuation = None
             for content in contents:
                 comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
@@ -2251,7 +2250,6 @@ def extract_header(contents):
                 if expected_comment_count:
                     comment_counts[1] = expected_comment_count
                     self.to_screen('Downloading ~%d comments' % expected_comment_count)
-                    _total_comments = comment_counts[1]
                 sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                 comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top
 
@@ -2271,7 +2269,7 @@ def extract_header(contents):
                     sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                 self.to_screen('Sorting comments by %s' % sort_text)
                 break
-            return _total_comments, _continuation
+            return _continuation
 
         def extract_thread(contents):
             if not parent:
@@ -2359,9 +2357,7 @@ def extract_thread(contents):
                          lambda x: x['appendContinuationItemsAction']['continuationItems']),
                         list) or []
                     if is_first_continuation:
-                        total_comments, continuation = extract_header(continuation_items)
-                        if total_comments:
-                            yield total_comments
+                        continuation = extract_header(continuation_items)
                         is_first_continuation = False
                         if continuation:
                             break
@@ -2389,9 +2385,7 @@ def extract_thread(contents):
                         continue
                     if is_first_continuation:
                         header_continuation_items = [continuation_renderer.get('header') or {}]
-                        total_comments, continuation = extract_header(header_continuation_items)
-                        if total_comments:
-                            yield total_comments
+                        continuation = extract_header(header_continuation_items)
                         is_first_continuation = False
                         if continuation:
                             break
@@ -2419,35 +2413,19 @@ def _generate_comment_continuation(video_id):
             [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
         return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
 
-    def _extract_comments(self, ytcfg, video_id, contents, webpage):
+    def _get_comments(self, ytcfg, video_id, contents, webpage):
         """Entry for comment extraction"""
         def _real_comment_extract(contents):
             yield from self._comment_entries(
                 traverse_obj(contents, (..., 'itemSectionRenderer'), get_all=False), ytcfg, video_id)
 
-        comments = []
-        estimated_total = 0
-        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
+        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0])
         # Force English regardless of account setting to prevent parsing issues
         # See: https://github.com/yt-dlp/yt-dlp/issues/532
         ytcfg = copy.deepcopy(ytcfg)
         traverse_obj(
             ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
-        try:
-            for comment in _real_comment_extract(contents):
-                if len(comments) >= max_comments:
-                    break
-                if isinstance(comment, int):
-                    estimated_total = comment
-                    continue
-                comments.append(comment)
-        except KeyboardInterrupt:
-            self.to_screen('Interrupted by user')
-        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
-        return {
-            'comments': comments,
-            'comment_count': len(comments),
-        }
+        return itertools.islice(_real_comment_extract(contents), 0, max_comments)
 
     @staticmethod
     def _get_checkok_params():
@@ -3209,8 +3187,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
             needs_auth=info['age_limit'] >= 18,
             is_unlisted=None if is_private is None else is_unlisted)
 
-        if self.get_param('getcomments', False):
-            info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
+        info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage)
 
         self.mark_watched(video_id, player_responses)