[extractor/biliIntl] Add comment extraction (#6079)

author HobbyistDev <redacted>

Wed, 12 Apr 2023 18:21:57 +0000 (03:21 +0900)

committer GitHub <redacted>

Wed, 12 Apr 2023 18:21:57 +0000 (23:51 +0530)
author HobbyistDev <redacted>
Wed, 12 Apr 2023 18:21:57 +0000 (03:21 +0900)
committer GitHub <redacted>
Wed, 12 Apr 2023 18:21:57 +0000 (23:51 +0530)
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py

index c344397792f44fdcd355aacbb227c4c3702e65cc..91d436dd851f70c0ec5ae513fc4c487799b203d3 100644 (file)
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -26,6 +26,7 @@
      srt_subtitles_timecode,
      str_or_none,
      traverse_obj,
+    unified_timestamp,
      unsmuggle_url,
      url_or_none,
      urlencode_postdata,
@@ -996,6 +997,53 @@ class BiliIntlIE(BiliIntlBaseIE):
              'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
              'upload_date': '20221212',
              'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
+        },
+    }, {
+        # episode comment extraction
+        'url': 'https://www.bilibili.tv/en/play/34580/340317',
+        'info_dict': {
+            'id': '340317',
+            'ext': 'mp4',
+            'timestamp': 1604057820,
+            'upload_date': '20201030',
+            'episode_number': 5,
+            'title': 'E5 - My Own Steel',
+            'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2',
+            'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$',
+            'episode': 'Episode 5',
+            'comment_count': int,
+            'chapters': [{
+                'start_time': 0,
+                'end_time': 61.0,
+                'title': '<Untitled Chapter 1>'
+            }, {
+                'start_time': 61.0,
+                'end_time': 134.0,
+                'title': 'Intro'
+            }, {
+                'start_time': 1290.0,
+                'end_time': 1379.0,
+                'title': 'Outro'
+            }],
+        },
+        'params': {
+            'getcomments': True
+        }
+    }, {
+        # user generated content comment extraction
+        'url': 'https://www.bilibili.tv/en/video/2045730385',
+        'info_dict': {
+            'id': '2045730385',
+            'ext': 'mp4',
+            'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
+            'timestamp': 1667891924,
+            'upload_date': '20221108',
+            'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
+            'comment_count': int,
+            'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
+        },
+        'params': {
+            'getcomments': True
          }
      }, {
          # episode id without intro and outro
@@ -1055,11 +1103,69 @@ def _extract_video_metadata(self, url, video_id, season_id):
  
          # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found
          return merge_dicts(
-            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), {
+            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
                  'title': self._html_search_meta('og:title', webpage),
                  'description': self._html_search_meta('og:description', webpage)
              })
  
+    def _get_comments_reply(self, root_id, next_id=0, display_id=None):
+        comment_api_raw_data = self._download_json(
+            'https://api.bilibili.tv/reply/web/detail', display_id,
+            note=f'Downloading reply comment of {root_id} - {next_id}',
+            query={
+                'platform': 'web',
+                'ps': 20,  # comment's reply per page (default: 3)
+                'root': root_id,
+                'next': next_id,
+            })
+
+        for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
+            yield {
+                'author': traverse_obj(replies, ('member', 'name')),
+                'author_id': traverse_obj(replies, ('member', 'mid')),
+                'author_thumbnail': traverse_obj(replies, ('member', 'face')),
+                'text': traverse_obj(replies, ('content', 'message')),
+                'id': replies.get('rpid'),
+                'like_count': int_or_none(replies.get('like_count')),
+                'parent': replies.get('parent'),
+                'timestamp': unified_timestamp(replies.get('ctime_text'))
+            }
+
+        if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
+            yield from self._get_comments_reply(
+                root_id, comment_api_raw_data['data']['cursor']['next'], display_id)
+
+    def _get_comments(self, video_id, ep_id):
+        for i in itertools.count(0):
+            comment_api_raw_data = self._download_json(
+                'https://api.bilibili.tv/reply/web/root', video_id,
+                note=f'Downloading comment page {i + 1}',
+                query={
+                    'platform': 'web',
+                    'pn': i,  # page number
+                    'ps': 20,  # comment per page (default: 20)
+                    'oid': video_id,
+                    'type': 3 if ep_id else 1,  # 1: user generated content, 3: series content
+                    'sort_type': 1,  # 1: best, 2: recent
+                })
+
+            for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
+                yield {
+                    'author': traverse_obj(replies, ('member', 'name')),
+                    'author_id': traverse_obj(replies, ('member', 'mid')),
+                    'author_thumbnail': traverse_obj(replies, ('member', 'face')),
+                    'text': traverse_obj(replies, ('content', 'message')),
+                    'id': replies.get('rpid'),
+                    'like_count': int_or_none(replies.get('like_count')),
+                    'timestamp': unified_timestamp(replies.get('ctime_text')),
+                    'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))),
+                }
+                if replies.get('count'):
+                    yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id)
+
+            if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
+                break
+
      def _real_extract(self, url):
          season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
          video_id = ep_id or aid
@@ -1087,7 +1193,8 @@ def _real_extract(self, url):
              **self._extract_video_metadata(url, video_id, season_id),
              'formats': self._get_formats(ep_id=ep_id, aid=aid),
              'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
-            'chapters': chapters
+            'chapters': chapters,
+            '__post_extractor': self.extract_comments(video_id, ep_id)
          }
author	HobbyistDev <redacted>
	Wed, 12 Apr 2023 18:21:57 +0000 (03:21 +0900)
committer	GitHub <redacted>
	Wed, 12 Apr 2023 18:21:57 +0000 (23:51 +0530)