[ie/crunchyroll] Fix stream extraction (#10005)

[yt-dlp.git] / yt_dlp / extractor / bilibili.py
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py

index bc25dc75e2b22556d4f069f4af60b25817d53b77..b38c90b1d1041cce4710fd2ed5e4783fea26b534 100644 (file)
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -7,6 +7,7 @@
  import re
  import time
  import urllib.parse
+import uuid
  
  from .common import InfoExtractor, SearchInfoExtractor
  from ..dependencies import Cryptodome
@@ -18,6 +19,7 @@
      OnDemandPagedList,
      bool_or_none,
      clean_html,
+    determine_ext,
      filter_dict,
      float_or_none,
      format_field,
@@ -91,11 +93,11 @@ def extract_formats(self, play_info):
  
          return formats
  
-    def _download_playinfo(self, video_id, cid):
+    def _download_playinfo(self, video_id, cid, headers=None):
          return self._download_json(
              'https://api.bilibili.com/x/player/playurl', video_id,
              query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
-            note=f'Downloading video formats for cid {cid}')['data']
+            note=f'Downloading video formats for cid {cid}', headers=headers)['data']
  
      def json2srt(self, json_data):
          srt_data = ''
@@ -491,7 +493,8 @@ class BiliBiliIE(BilibiliBaseIE):
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
-        webpage, urlh = self._download_webpage_handle(url, video_id)
+        headers = self.geo_verification_headers()
+        webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers)
          if not self._match_valid_url(urlh.url):
              return self.url_result(urlh.url)
  
@@ -529,7 +532,7 @@ def _real_extract(self, url):
              self._download_json(
                  'https://api.bilibili.com/x/player/pagelist', video_id,
                  fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
-                note='Extracting videos in anthology'),
+                note='Extracting videos in anthology', headers=headers),
              'data', expected_type=list) or []
          is_anthology = len(page_list_json) > 1
  
@@ -550,7 +553,7 @@ def _real_extract(self, url):
  
          festival_info = {}
          if is_festival:
-            play_info = self._download_playinfo(video_id, cid)
+            play_info = self._download_playinfo(video_id, cid, headers=headers)
  
              festival_info = traverse_obj(initial_state, {
                  'uploader': ('videoInfo', 'upName'),
@@ -664,14 +667,15 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
  
      def _real_extract(self, url):
          episode_id = self._match_id(url)
-        webpage = self._download_webpage(url, episode_id)
+        headers = self.geo_verification_headers()
+        webpage = self._download_webpage(url, episode_id, headers=headers)
  
          if '您所在的地区无法观看本片' in webpage:
              raise GeoRestrictedError('This video is restricted')
          elif '正在观看预览，大会员免费看全片' in webpage:
              self.raise_login_required('This video is for premium members only')
  
-        headers = {'Referer': url, **self.geo_verification_headers()}
+        headers['Referer'] = url
          play_info = self._download_json(
              'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id,
              'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
@@ -722,7 +726,7 @@ def _real_extract(self, url):
              'duration': float_or_none(play_info.get('timelength'), scale=1000),
              'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid),
              '__post_extractor': self.extract_comments(aid),
-            'http_headers': headers,
+            'http_headers': {'Referer': url},
          }
  
  
@@ -1041,15 +1045,17 @@ def fetch_page(page_idx):
  
              try:
                  response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search',
-                                               playlist_id, note=f'Downloading page {page_idx}', query=query)
+                                               playlist_id, note=f'Downloading page {page_idx}', query=query,
+                                               headers={'referer': url})
              except ExtractorError as e:
                  if isinstance(e.cause, HTTPError) and e.cause.status == 412:
                      raise ExtractorError(
                          'Request is blocked by server (412), please add cookies, wait and try later.', expected=True)
                  raise
-            if response['code'] == -401:
+            if response['code'] in (-352, -401):
                  raise ExtractorError(
-                    'Request is blocked by server (401), please add cookies, wait and try later.', expected=True)
+                    f'Request is blocked by server ({-response["code"]}), '
+                    'please add cookies, wait and try later.', expected=True)
              return response['data']
  
          def get_metadata(page_data):
@@ -1303,6 +1309,26 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
              'upload_date': '20211127',
          },
          'playlist_mincount': 513,
+    }, {
+        'url': 'https://www.bilibili.com/list/1958703906?sid=547718&oid=687146339&bvid=BV1DU4y1r7tz',
+        'info_dict': {
+            'id': 'BV1DU4y1r7tz',
+            'ext': 'mp4',
+            'title': '【直播回放】8.20晚9:30 3d发布喵 2022年8月20日21点场',
+            'upload_date': '20220820',
+            'description': '',
+            'timestamp': 1661016330,
+            'uploader_id': '1958703906',
+            'uploader': '靡烟miya',
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+            'duration': 9552.903,
+            'tags': list,
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            '_old_archive_ids': ['bilibili 687146339_part1'],
+        },
+        'params': {'noplaylist': True},
      }, {
          'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1',
          'info_dict': {
@@ -1354,6 +1380,11 @@ def _extract_medialist(self, query, list_id):
  
      def _real_extract(self, url):
          list_id = self._match_id(url)
+
+        bvid = traverse_obj(parse_qs(url), ('bvid', 0))
+        if not self._yes_playlist(list_id, bvid):
+            return self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE)
+
          webpage = self._download_webpage(url, list_id)
          initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id)
          if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200:
@@ -1463,8 +1494,37 @@ class BiliBiliSearchIE(SearchInfoExtractor):
      IE_DESC = 'Bilibili video search'
      _MAX_RESULTS = 100000
      _SEARCH_KEY = 'bilisearch'
+    _TESTS = [{
+        'url': 'bilisearch3:靡烟 出道一年，我怎么还在等你单推的女人睡觉后开播啊',
+        'playlist_count': 3,
+        'info_dict': {
+            'id': '靡烟 出道一年，我怎么还在等你单推的女人睡觉后开播啊',
+            'title': '靡烟 出道一年，我怎么还在等你单推的女人睡觉后开播啊',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'BV1n44y1Q7sc',
+                'ext': 'mp4',
+                'title': '“出道一年，我怎么还在等你单推的女人睡觉后开播啊？”【一分钟了解靡烟miya】',
+                'timestamp': 1669889987,
+                'upload_date': '20221201',
+                'description': 'md5:43343c0973defff527b5a4b403b4abf9',
+                'tags': list,
+                'uploader': '靡烟miya',
+                'duration': 123.156,
+                'uploader_id': '1958703906',
+                'comment_count': int,
+                'view_count': int,
+                'like_count': int,
+                'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+                '_old_archive_ids': ['bilibili 988222410_part1'],
+            },
+        }],
+    }]
  
      def _search_results(self, query):
+        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
+            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
          for page_num in itertools.count(1):
              videos = self._download_json(
                  'https://api.bilibili.com/x/web-interface/search/type', query,
@@ -1621,6 +1681,7 @@ def _real_extract(self, url):
  class BiliIntlBaseIE(InfoExtractor):
      _API_URL = 'https://api.bilibili.tv/intl/gateway'
      _NETRC_MACHINE = 'biliintl'
+    _HEADERS = {'Referer': 'https://www.bilibili.com/'}
  
      def _call_api(self, endpoint, *args, **kwargs):
          json = self._download_json(self._API_URL + endpoint, *args, **kwargs)
@@ -1658,19 +1719,34 @@ def _get_subtitles(self, *, ep_id=None, aid=None):
                  'aid': aid,
              })) or {}
          subtitles = {}
-        for sub in sub_json.get('subtitles') or []:
-            sub_url = sub.get('url')
-            if not sub_url:
-                continue
-            sub_data = self._download_json(
-                sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,
-                note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')
-            if not sub_data:
-                continue
-            subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
-                'ext': 'srt',
-                'data': self.json2srt(sub_data)
-            })
+        fetched_urls = set()
+        for sub in traverse_obj(sub_json, (('subtitles', 'video_subtitle'), ..., {dict})):
+            for url in traverse_obj(sub, ((None, 'ass', 'srt'), 'url', {url_or_none})):
+                if url in fetched_urls:
+                    continue
+                fetched_urls.add(url)
+                sub_ext = determine_ext(url)
+                sub_lang = sub.get('lang_key') or 'en'
+
+                if sub_ext == 'ass':
+                    subtitles.setdefault(sub_lang, []).append({
+                        'ext': 'ass',
+                        'url': url,
+                    })
+                elif sub_ext == 'json':
+                    sub_data = self._download_json(
+                        url, ep_id or aid, fatal=False,
+                        note=f'Downloading subtitles{format_field(sub, "lang", " for %s")} ({sub_lang})',
+                        errnote='Unable to download subtitles')
+
+                    if sub_data:
+                        subtitles.setdefault(sub_lang, []).append({
+                            'ext': 'srt',
+                            'data': self.json2srt(sub_data),
+                        })
+                else:
+                    self.report_warning('Unexpected subtitle extension', ep_id or aid)
+
          return subtitles
  
      def _get_formats(self, *, ep_id=None, aid=None):
@@ -1716,7 +1792,9 @@ def _get_formats(self, *, ep_id=None, aid=None):
      def _parse_video_metadata(self, video_data):
          return {
              'title': video_data.get('title_display') or video_data.get('title'),
+            'description': video_data.get('desc'),
              'thumbnail': video_data.get('cover'),
+            'timestamp': unified_timestamp(video_data.get('formatted_pub_date')),
              'episode_number': int_or_none(self._search_regex(
                  r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
          }
@@ -1813,17 +1891,6 @@ class BiliIntlIE(BiliIntlBaseIE):
              'episode_number': 140,
          },
          'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
-    }, {
-        'url': 'https://www.bilibili.tv/en/video/2041863208',
-        'info_dict': {
-            'id': '2041863208',
-            'ext': 'mp4',
-            'timestamp': 1670874843,
-            'description': 'Scheduled for April 2023.\nStudio: ufotable',
-            'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
-            'upload_date': '20221212',
-            'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
-        },
      }, {
          # episode comment extraction
          'url': 'https://www.bilibili.tv/en/play/34580/340317',
@@ -1864,9 +1931,9 @@ class BiliIntlIE(BiliIntlBaseIE):
              'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
              'timestamp': 1667891924,
              'upload_date': '20221108',
-            'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
+            'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan',
              'comment_count': int,
-            'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
+            'thumbnail': r're:https://pic\.bstarstatic\.(?:com|net)/ugc/f6c363659efd2eabe5683fbb906b1582\.jpg',
          },
          'params': {
              'getcomments': True
@@ -1902,6 +1969,7 @@ class BiliIntlIE(BiliIntlBaseIE):
          'only_matching': True,
      }]
  
+    @staticmethod
      def _make_url(video_id, series_id=None):
          if series_id:
              return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}'
@@ -1929,10 +1997,12 @@ def _extract_video_metadata(self, url, video_id, season_id):
  
          # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found
          return merge_dicts(
-            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
-                'title': self._html_search_meta('og:title', webpage),
-                'description': self._html_search_meta('og:description', webpage)
-            })
+            self._parse_video_metadata(video_data), {
+                'title': get_element_by_class(
+                    'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage),
+                'description': get_element_by_class(
+                    'bstar-meta__desc', webpage) or self._html_search_meta('og:description', webpage),
+            }, self._search_json_ld(webpage, video_id, default={}))
  
      def _get_comments_reply(self, root_id, next_id=0, display_id=None):
          comment_api_raw_data = self._download_json(
@@ -2020,7 +2090,8 @@ def _real_extract(self, url):
              'formats': self._get_formats(ep_id=ep_id, aid=aid),
              'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
              'chapters': chapters,
-            '__post_extractor': self.extract_comments(video_id, ep_id)
+            '__post_extractor': self.extract_comments(video_id, ep_id),
+            'http_headers': self._HEADERS,
          }