jfr.im git - yt-dlp.git/commitdiff
[bilibili] Add anthology support
author: pukkandan <redacted>
Sat, 20 Mar 2021 09:07:43 +0000 (14:37 +0530)
committer: pukkandan <redacted>
Sat, 20 Mar 2021 09:26:42 +0000 (14:56 +0530)
Closes: #118
Co-authored-by: animelover1984

yt_dlp/extractor/bilibili.py

index 6fcc4ac9323153d533330225cc2a92d48f6a95ec..554a5700593924d5b15be0c14180257fd7c5ccd5 100644 (file)
@@ -7,6 +7,7 @@
 
 from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
+    compat_str,
     compat_parse_qs,
     compat_urlparse,
 )
@@ -15,6 +16,7 @@
     int_or_none,
     float_or_none,
     parse_iso8601,
+    try_get,
     smuggle_url,
     str_or_none,
     strip_jsonp,
@@ -113,6 +115,13 @@ class BiliBiliIE(InfoExtractor):
         # new BV video id format
         'url': 'https://www.bilibili.com/video/BV1JE411F741',
         'only_matching': True,
+    }, {
+        # Anthology
+        'url': 'https://www.bilibili.com/video/BV1bK411W797',
+        'info_dict': {
+            'id': 'BV1bK411W797',
+        },
+        'playlist_count': 17,
     }]
 
     _APP_KEY = 'iVGUTjsxvpLeuDCf'
@@ -139,9 +148,19 @@ def _real_extract(self, url):
         page_id = mobj.group('page')
         webpage = self._download_webpage(url, video_id)
 
+        # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
+        # If the video has no page argument, check to see if it's an anthology
+        if page_id is None:
+            if not self._downloader.params.get('noplaylist'):
+                r = self._extract_anthology_entries(bv_id, video_id, webpage)
+                if r is not None:
+                    self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id)
+                    return r
+            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
         if 'anime/' not in url:
             cid = self._search_regex(
-                r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid',
+                r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + compat_str(page_id), webpage, 'cid',
                 default=None
             ) or self._search_regex(
                 r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
@@ -224,7 +243,18 @@ def _real_extract(self, url):
         title = self._html_search_regex(
             (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
              r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
-            group='title') + ('_p' + str(page_id) if page_id is not None else '')
+            group='title')
+
+        # Get part title for anthologies
+        if page_id is not None:
+            # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
+            part_title = try_get(
+                self._download_json(
+                    "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" % bv_id,
+                    video_id, note='Extracting videos in anthology'),
+                lambda x: x['data'][int(page_id) - 1]['part'])
+            title = part_title or title
+
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
@@ -234,7 +264,7 @@ def _real_extract(self, url):
 
         # TODO 'view_count' requires deobfuscating Javascript
         info = {
-            'id': str(video_id) if page_id is None else '%s_p%s' % (video_id, page_id),
+            'id': compat_str(video_id) if page_id is None else '%s_p%s' % (video_id, page_id),
             'cid': cid,
             'title': title,
             'description': description,
@@ -300,7 +330,7 @@ def get_comments():
 
             global_info = {
                 '_type': 'multi_video',
-                'id': video_id,
+                'id': compat_str(video_id),
                 'bv_id': bv_id,
                 'title': title,
                 'description': description,
@@ -312,6 +342,20 @@ def get_comments():
 
             return global_info
 
+    def _extract_anthology_entries(self, bv_id, video_id, webpage):
+        title = self._html_search_regex(
+            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+            group='title')
+        json_data = self._download_json(
+            "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" % bv_id,
+            video_id, note='Extracting videos in anthology')
+
+        if len(json_data['data']) > 1:
+            return self.playlist_from_matches(
+                json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(),
+                getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page']))
+
     def _get_video_id_set(self, id, is_bv):
         query = {'bvid': id} if is_bv else {'aid': id}
         response = self._download_json(
@@ -506,7 +550,7 @@ def _get_n_results(self, query, n):
 
             videos = data['result']
             for video in videos:
-                e = self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
+                e = self.url_result(video['arcurl'], 'BiliBili', compat_str(video['aid']))
                 entries.append(e)
 
             if(len(entries) >= n or len(videos) >= BiliBiliSearchIE.MAX_NUMBER_OF_RESULTS):