Extract comments only when needed #95 (Closes #94)

author pukkandan <redacted>

Sun, 28 Feb 2021 14:56:08 +0000 (20:26 +0530)

committer GitHub <redacted>

Sun, 28 Feb 2021 14:56:08 +0000 (20:26 +0530)
author pukkandan <redacted>
Sun, 28 Feb 2021 14:56:08 +0000 (20:26 +0530)
committer GitHub <redacted>
Sun, 28 Feb 2021 14:56:08 +0000 (20:26 +0530)
diff --git a/README.md b/README.md

index 8def97e4c1202ea945493ca88f6b68d0084ee2e4..4501ba426b7db72e6bac2c7d52521e3b18587d1a 100644 (file)
--- a/README.md
+++ b/README.md
@@ -245,7 +245,7 @@ ## Video Selection:
                                       "OUTPUT TEMPLATE" for a list of available
                                       keys) to match if the key is present, !key
                                       to check if the key is not present,
-                                     key>NUMBER (like "comment_count > 12", also
+                                     key>NUMBER (like "view_count > 12", also
                                       works with >=, <, <=, !=, =) to compare
                                       against a number, key = 'LITERAL' (like
                                       "uploader = 'Mike Smith'", also works with
@@ -403,7 +403,9 @@ ## Filesystem Options:
      --no-write-playlist-metafiles    Do not write playlist metadata when using
                                       --write-info-json, --write-description etc.
      --get-comments                   Retrieve video comments to be placed in the
-                                     .info.json file
+                                     .info.json file. The comments are fetched
+                                     even without this option if the extraction
+                                     is known to be quick
      --load-info-json FILE            JSON file containing the video information
                                       (created with the "--write-info-json"
                                       option)
@@ -814,7 +816,7 @@ # OUTPUT TEMPLATE
   - `dislike_count` (numeric): Number of negative ratings of the video
   - `repost_count` (numeric): Number of reposts of the video
   - `average_rating` (numeric): Average rating give by users, the scale used depends on the webpage
- - `comment_count` (numeric): Number of comments on the video
+ - `comment_count` (numeric): Number of comments on the video (For some extractors, comments are only downloaded at the end, and so this field cannot be used)
   - `age_limit` (numeric): Age restriction for the video (years)
   - `is_live` (boolean): Whether this video is a live stream or a fixed-length video
   - `was_live` (boolean): Whether this video was originally a live stream
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 3c53f4cd88a3edeb97b9ba8448822fcbece4f0b8..e9cb7e18767f0d0ca0047871d2e17d308d82dc40 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -2041,6 +2041,7 @@ def print_optional(field):
              self.to_stdout(formatSeconds(info_dict['duration']))
          print_mandatory('format')
          if self.params.get('forcejson', False):
+            self.post_extract(info_dict)
              self.to_stdout(json.dumps(info_dict))
  
      def process_info(self, info_dict):
@@ -2064,6 +2065,7 @@ def process_info(self, info_dict):
          if self._match_entry(info_dict, incomplete=False) is not None:
              return
  
+        self.post_extract(info_dict)
          self._num_downloads += 1
  
          info_dict = self.pre_process(info_dict)
@@ -2497,6 +2499,7 @@ def download(self, url_list):
                  raise
              else:
                  if self.params.get('dump_single_json', False):
+                    self.post_extract(res)
                      self.to_stdout(json.dumps(res))
  
          return self._download_retcode
@@ -2545,6 +2548,24 @@ def run_pp(self, pp, infodict, files_to_move={}):
                      del files_to_move[old_filename]
          return files_to_move, infodict
  
+    @staticmethod
+    def post_extract(info_dict):
+        def actual_post_extract(info_dict):
+            if info_dict.get('_type') in ('playlist', 'multi_video'):
+                for video_dict in info_dict.get('entries', {}):
+                    actual_post_extract(video_dict)
+                return
+
+            if '__post_extractor' not in info_dict:
+                return
+            post_extractor = info_dict['__post_extractor']
+            if post_extractor:
+                info_dict.update(post_extractor().items())
+            del info_dict['__post_extractor']
+            return
+
+        actual_post_extract(info_dict)
+
      def pre_process(self, ie_info):
          info = dict(ie_info)
          for pp in self._pps['beforedl']:
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py

index be117a2bb44e3259178367dab673947ffbe12457..764ac4d3c9256da84023b25a90b7a003a9c482e5 100644 (file)
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -255,10 +255,6 @@ def _real_extract(self, url):
              info['uploader'] = self._html_search_meta(
                  'author', webpage, 'uploader', default=None)
  
-        comments = None
-        if self._downloader.params.get('getcomments', False):
-            comments = self._get_all_comment_pages(video_id)
-
          raw_danmaku = self._get_raw_danmaku(video_id, cid)
  
          raw_tags = self._get_tags(video_id)
@@ -266,11 +262,18 @@ def _real_extract(self, url):
  
          top_level_info = {
              'raw_danmaku': raw_danmaku,
-            'comments': comments,
-            'comment_count': len(comments) if comments is not None else None,
              'tags': tags,
              'raw_tags': raw_tags,
          }
+        if self._downloader.params.get('getcomments', False):
+            def get_comments():
+                comments = self._get_all_comment_pages(video_id)
+                return {
+                    'comments': comments,
+                    'comment_count': len(comments)
+                }
+
+            top_level_info['__post_extractor'] = get_comments
  
          '''
          # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 47b91a00a7a6e0c18c434ce8dcf0c8ca48785055..3326d436bb817821b9cec6e1f88718cbaef00e84 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -294,6 +294,14 @@ class InfoExtractor(object):
                      players on other sites. Can be True (=always allowed),
                      False (=never allowed), None (=unknown), or a string
                      specifying the criteria for embedability (Eg: 'whitelist').
+    __post_extractor: A function to be called just before the metadata is
+                    written to either disk, logger or console. The function
+                    must return a dict which will be added to the info_dict.
+                    This is useful for additional information that is
+                    time-consuming to extract. Note that the fields thus
+                    extracted will not be available to output template and
+                    match_filter. So, only "comments" and "comment_count" are
+                    currently allowed to be extracted via this method.
  
      The following fields should only be used when the video belongs to some logical
      chapter or section:
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 41b894776dbd4be95cdf9d1f63c7fbb7609e2449..804186b851e0520c56300193cda8acb765e84e7c 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2012,9 +2012,10 @@ def chapter_time(mmlir):
  
          # Get comments
          # TODO: Refactor and move to seperate function
-        if get_comments:
+        def extract_comments():
              expected_video_comment_count = 0
              video_comments = []
+            comment_xsrf = xsrf_token
  
              def find_value(html, key, num_chars=2, separator='"'):
                  pos_begin = html.find(key) + len(key) + num_chars
@@ -2083,7 +2084,7 @@ def get_continuation(continuation, session_token, replies=False):
              self.to_screen('Downloading comments')
              while continuations:
                  continuation = continuations.pop()
-                comment_response = get_continuation(continuation, xsrf_token)
+                comment_response = get_continuation(continuation, comment_xsrf)
                  if not comment_response:
                      continue
                  if list(search_dict(comment_response, 'externalErrorMessage')):
@@ -2094,7 +2095,7 @@ def get_continuation(continuation, session_token, replies=False):
                      continue
                  # not sure if this actually helps
                  if 'xsrf_token' in comment_response:
-                    xsrf_token = comment_response['xsrf_token']
+                    comment_xsrf = comment_response['xsrf_token']
  
                  item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
                  if first_continuation:
@@ -2123,7 +2124,7 @@ def get_continuation(continuation, session_token, replies=False):
                      while reply_continuations:
                          time.sleep(1)
                          continuation = reply_continuations.pop()
-                        replies_data = get_continuation(continuation, xsrf_token, True)
+                        replies_data = get_continuation(continuation, comment_xsrf, True)
                          if not replies_data or 'continuationContents' not in replies_data[1]['response']:
                              continue
  
@@ -2152,10 +2153,13 @@ def get_continuation(continuation, session_token, replies=False):
                  time.sleep(1)
  
              self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
-            info.update({
+            return {
                  'comments': video_comments,
                  'comment_count': expected_video_comment_count
-            })
+            }
+
+        if get_comments:
+            info['__post_extractor'] = extract_comments
  
          self.mark_watched(video_id, player_response)
  
diff --git a/yt_dlp/options.py b/yt_dlp/options.py

index ae11e6b8bc5ecbda35d4a4916b0cf20e47f2ce60..2694990228b20ebf940e765d69e49ef460c9b553 100644 (file)
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -347,7 +347,7 @@ def _dict_from_multiple_values_options_callback(
              'Specify any key (see "OUTPUT TEMPLATE" for a list of available keys) to '
              'match if the key is present, '
              '!key to check if the key is not present, '
-            'key>NUMBER (like "comment_count > 12", also works with '
+            'key>NUMBER (like "view_count > 12", also works with '
              '>=, <, <=, !=, =) to compare against a number, '
              'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) '
              'to match against a string literal '
@@ -985,7 +985,9 @@ def _dict_from_multiple_values_options_callback(
      filesystem.add_option(
          '--get-comments',
          action='store_true', dest='getcomments', default=False,
-        help='Retrieve video comments to be placed in the .info.json file')
+        help=(
+            'Retrieve video comments to be placed in the .info.json file. '
+            'The comments are fetched even without this option if the extraction is known to be quick'))
      filesystem.add_option(
          '--load-info-json', '--load-info',
          dest='load_info_filename', metavar='FILE',
author	pukkandan <redacted>
	Sun, 28 Feb 2021 14:56:08 +0000 (20:26 +0530)
committer	GitHub <redacted>
	Sun, 28 Feb 2021 14:56:08 +0000 (20:26 +0530)
README.md		patch \| blob \| blame \| history
yt_dlp/YoutubeDL.py		patch \| blob \| blame \| history
yt_dlp/extractor/bilibili.py		patch \| blob \| blame \| history
yt_dlp/extractor/common.py		patch \| blob \| blame \| history
yt_dlp/extractor/youtube.py		patch \| blob \| blame \| history
yt_dlp/options.py		patch \| blob \| blame \| history