import re
import time
import urllib.parse
+import uuid
from .common import InfoExtractor, SearchInfoExtractor
from ..dependencies import Cryptodome
OnDemandPagedList,
bool_or_none,
clean_html,
+ determine_ext,
filter_dict,
float_or_none,
format_field,
return formats
- def _download_playinfo(self, video_id, cid):
+ def _download_playinfo(self, video_id, cid, headers=None):
return self._download_json(
'https://api.bilibili.com/x/player/playurl', video_id,
query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
- note=f'Downloading video formats for cid {cid}')['data']
+ note=f'Downloading video formats for cid {cid}', headers=headers)['data']
def json2srt(self, json_data):
srt_data = ''
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage, urlh = self._download_webpage_handle(url, video_id)
+ headers = self.geo_verification_headers()
+ webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers)
if not self._match_valid_url(urlh.url):
return self.url_result(urlh.url)
self._download_json(
'https://api.bilibili.com/x/player/pagelist', video_id,
fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
- note='Extracting videos in anthology'),
+ note='Extracting videos in anthology', headers=headers),
'data', expected_type=list) or []
is_anthology = len(page_list_json) > 1
festival_info = {}
if is_festival:
- play_info = self._download_playinfo(video_id, cid)
+ play_info = self._download_playinfo(video_id, cid, headers=headers)
festival_info = traverse_obj(initial_state, {
'uploader': ('videoInfo', 'upName'),
def _real_extract(self, url):
episode_id = self._match_id(url)
- webpage = self._download_webpage(url, episode_id)
+ headers = self.geo_verification_headers()
+ webpage = self._download_webpage(url, episode_id, headers=headers)
if '您所在的地区无法观看本片' in webpage:
raise GeoRestrictedError('This video is restricted')
elif '正在观看预览,大会员免费看全片' in webpage:
self.raise_login_required('This video is for premium members only')
- headers = {'Referer': url, **self.geo_verification_headers()}
+ headers['Referer'] = url
play_info = self._download_json(
'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id,
'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid),
'__post_extractor': self.extract_comments(aid),
- 'http_headers': headers,
+ 'http_headers': {'Referer': url},
}
try:
response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search',
- playlist_id, note=f'Downloading page {page_idx}', query=query)
+ playlist_id, note=f'Downloading page {page_idx}', query=query,
+ headers={'referer': url})
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 412:
raise ExtractorError(
'Request is blocked by server (412), please add cookies, wait and try later.', expected=True)
raise
- if response['code'] == -401:
+ if response['code'] in (-352, -401):
raise ExtractorError(
- 'Request is blocked by server (401), please add cookies, wait and try later.', expected=True)
+ f'Request is blocked by server ({-response["code"]}), '
+ 'please add cookies, wait and try later.', expected=True)
return response['data']
def get_metadata(page_data):
'upload_date': '20211127',
},
'playlist_mincount': 513,
+ }, {
+ 'url': 'https://www.bilibili.com/list/1958703906?sid=547718&oid=687146339&bvid=BV1DU4y1r7tz',
+ 'info_dict': {
+ 'id': 'BV1DU4y1r7tz',
+ 'ext': 'mp4',
+ 'title': '【直播回放】8.20晚9:30 3d发布喵 2022年8月20日21点场',
+ 'upload_date': '20220820',
+ 'description': '',
+ 'timestamp': 1661016330,
+ 'uploader_id': '1958703906',
+ 'uploader': '靡烟miya',
+ 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+ 'duration': 9552.903,
+ 'tags': list,
+ 'comment_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ '_old_archive_ids': ['bilibili 687146339_part1'],
+ },
+ 'params': {'noplaylist': True},
}, {
'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1',
'info_dict': {
def _real_extract(self, url):
list_id = self._match_id(url)
+
+ bvid = traverse_obj(parse_qs(url), ('bvid', 0))
+ if not self._yes_playlist(list_id, bvid):
+ return self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE)
+
webpage = self._download_webpage(url, list_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id)
if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200:
IE_DESC = 'Bilibili video search'
_MAX_RESULTS = 100000
_SEARCH_KEY = 'bilisearch'
+ _TESTS = [{
+ 'url': 'bilisearch3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
+ 'playlist_count': 3,
+ 'info_dict': {
+ 'id': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
+ 'title': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'BV1n44y1Q7sc',
+ 'ext': 'mp4',
+ 'title': '“出道一年,我怎么还在等你单推的女人睡觉后开播啊?”【一分钟了解靡烟miya】',
+ 'timestamp': 1669889987,
+ 'upload_date': '20221201',
+ 'description': 'md5:43343c0973defff527b5a4b403b4abf9',
+ 'tags': list,
+ 'uploader': '靡烟miya',
+ 'duration': 123.156,
+ 'uploader_id': '1958703906',
+ 'comment_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+ '_old_archive_ids': ['bilibili 988222410_part1'],
+ },
+ }],
+ }]
def _search_results(self, query):
+ if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
+ self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
for page_num in itertools.count(1):
videos = self._download_json(
'https://api.bilibili.com/x/web-interface/search/type', query,
class BiliIntlBaseIE(InfoExtractor):
_API_URL = 'https://api.bilibili.tv/intl/gateway'
_NETRC_MACHINE = 'biliintl'
+ _HEADERS = {'Referer': 'https://www.bilibili.com/'}
def _call_api(self, endpoint, *args, **kwargs):
json = self._download_json(self._API_URL + endpoint, *args, **kwargs)
'aid': aid,
})) or {}
subtitles = {}
- for sub in sub_json.get('subtitles') or []:
- sub_url = sub.get('url')
- if not sub_url:
- continue
- sub_data = self._download_json(
- sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,
- note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')
- if not sub_data:
- continue
- subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
- 'ext': 'srt',
- 'data': self.json2srt(sub_data)
- })
+ fetched_urls = set()
+ for sub in traverse_obj(sub_json, (('subtitles', 'video_subtitle'), ..., {dict})):
+ for url in traverse_obj(sub, ((None, 'ass', 'srt'), 'url', {url_or_none})):
+ if url in fetched_urls:
+ continue
+ fetched_urls.add(url)
+ sub_ext = determine_ext(url)
+ sub_lang = sub.get('lang_key') or 'en'
+
+ if sub_ext == 'ass':
+ subtitles.setdefault(sub_lang, []).append({
+ 'ext': 'ass',
+ 'url': url,
+ })
+ elif sub_ext == 'json':
+ sub_data = self._download_json(
+ url, ep_id or aid, fatal=False,
+ note=f'Downloading subtitles{format_field(sub, "lang", " for %s")} ({sub_lang})',
+ errnote='Unable to download subtitles')
+
+ if sub_data:
+ subtitles.setdefault(sub_lang, []).append({
+ 'ext': 'srt',
+ 'data': self.json2srt(sub_data),
+ })
+ else:
+ self.report_warning('Unexpected subtitle extension', ep_id or aid)
+
return subtitles
def _get_formats(self, *, ep_id=None, aid=None):
def _parse_video_metadata(self, video_data):
return {
'title': video_data.get('title_display') or video_data.get('title'),
+ 'description': video_data.get('desc'),
'thumbnail': video_data.get('cover'),
+ 'timestamp': unified_timestamp(video_data.get('formatted_pub_date')),
'episode_number': int_or_none(self._search_regex(
r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
}
'episode_number': 140,
},
'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
- }, {
- 'url': 'https://www.bilibili.tv/en/video/2041863208',
- 'info_dict': {
- 'id': '2041863208',
- 'ext': 'mp4',
- 'timestamp': 1670874843,
- 'description': 'Scheduled for April 2023.\nStudio: ufotable',
- 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
- 'upload_date': '20221212',
- 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
- },
}, {
# episode comment extraction
'url': 'https://www.bilibili.tv/en/play/34580/340317',
'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
'timestamp': 1667891924,
'upload_date': '20221108',
- 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
+ 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan',
'comment_count': int,
- 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
+ 'thumbnail': r're:https://pic\.bstarstatic\.(?:com|net)/ugc/f6c363659efd2eabe5683fbb906b1582\.jpg',
},
'params': {
'getcomments': True
'only_matching': True,
}]
+ @staticmethod
def _make_url(video_id, series_id=None):
if series_id:
return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}'
# XXX: webpage metadata may not be accurate; it is only used to avoid crashing when video_data is not found
return merge_dicts(
- self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
- 'title': self._html_search_meta('og:title', webpage),
- 'description': self._html_search_meta('og:description', webpage)
- })
+ self._parse_video_metadata(video_data), {
+ 'title': get_element_by_class(
+ 'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage),
+ 'description': get_element_by_class(
+ 'bstar-meta__desc', webpage) or self._html_search_meta('og:description', webpage),
+ }, self._search_json_ld(webpage, video_id, default={}))
def _get_comments_reply(self, root_id, next_id=0, display_id=None):
comment_api_raw_data = self._download_json(
'formats': self._get_formats(ep_id=ep_id, aid=aid),
'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
'chapters': chapters,
- '__post_extractor': self.extract_comments(video_id, ep_id)
+ '__post_extractor': self.extract_comments(video_id, ep_id),
+ 'http_headers': self._HEADERS,
}