yt_dlp/extractor/bilibili.py
1 import base64
2 import functools
3 import hashlib
4 import itertools
5 import math
6 import time
7 import urllib.parse
8
9 from .common import InfoExtractor, SearchInfoExtractor
10 from ..dependencies import Cryptodome
11 from ..networking.exceptions import HTTPError
12 from ..utils import (
13 ExtractorError,
14 GeoRestrictedError,
15 InAdvancePagedList,
16 OnDemandPagedList,
17 filter_dict,
18 float_or_none,
19 format_field,
20 int_or_none,
21 join_nonempty,
22 make_archive_id,
23 merge_dicts,
24 mimetype2ext,
25 parse_count,
26 parse_qs,
27 qualities,
28 smuggle_url,
29 srt_subtitles_timecode,
30 str_or_none,
31 traverse_obj,
32 try_call,
33 unified_timestamp,
34 unsmuggle_url,
35 url_or_none,
36 urlencode_postdata,
37 )
38
39
40 class BilibiliBaseIE(InfoExtractor):
41 def extract_formats(self, play_info):
42 format_names = {
43 r['quality']: traverse_obj(r, 'new_description', 'display_desc')
44 for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
45 }
46
47 audios = traverse_obj(play_info, ('dash', 'audio', ...))
48 flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
49 if flac_audio:
50 audios.append(flac_audio)
51 formats = [{
52 'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
53 'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
54 'acodec': audio.get('codecs'),
55 'vcodec': 'none',
56 'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
57 'filesize': int_or_none(audio.get('size'))
58 } for audio in audios]
59
60 formats.extend({
61 'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'),
62 'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')),
63 'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')),
64 'width': int_or_none(video.get('width')),
65 'height': int_or_none(video.get('height')),
66 'vcodec': video.get('codecs'),
67 'acodec': 'none' if audios else None,
68 'tbr': float_or_none(video.get('bandwidth'), scale=1000),
69 'filesize': int_or_none(video.get('size')),
70 'quality': int_or_none(video.get('id')),
71 'format': format_names.get(video.get('id')),
72 } for video in traverse_obj(play_info, ('dash', 'video', ...)))
73
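# Qualities advertised in support_formats but absent from the DASH stream lists are typically login- or premium-gated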
74 missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality')))
75 if missing_formats:
76 self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; '
77 f'you have to log in or become a premium member to download them. {self._login_hint()}')
78
79 return formats
80
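# Convert Bilibili's JSON subtitle payload into SRT-formatted text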
81 def json2srt(self, json_data):
82 srt_data = ''
83 for idx, line in enumerate(json_data.get('body') or []):
84 srt_data += (f'{idx + 1}\n'
85 f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n'
86 f'{line["content"]}\n\n')
87 return srt_data
88
89 def _get_subtitles(self, video_id, aid, cid):
90 subtitles = {
91 'danmaku': [{
92 'ext': 'xml',
93 'url': f'https://comment.bilibili.com/{cid}.xml',
94 }]
95 }
96
97 video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id)
98 for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)):
99 subtitles.setdefault(s['lan'], []).append({
100 'ext': 'srt',
101 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
102 })
103 return subtitles
104
105 def _get_chapters(self, aid, cid):
106 chapters = aid and cid and self._download_json(
107 'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid},
108 note='Extracting chapters', fatal=False)
109 return traverse_obj(chapters, ('data', 'view_points', ..., {
110 'title': 'content',
111 'start_time': 'from',
112 'end_time': 'to',
113 })) or None
114
115 def _get_comments(self, aid):
116 for idx in itertools.count(1):
117 replies = traverse_obj(
118 self._download_json(
119 f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
120 aid, note=f'Extracting comments from page {idx}', fatal=False),
121 ('data', 'replies'))
122 if not replies:
123 return
124 for children in map(self._get_all_children, replies):
125 yield from children
126
127 def _get_all_children(self, reply):
128 yield {
129 'author': traverse_obj(reply, ('member', 'uname')),
130 'author_id': traverse_obj(reply, ('member', 'mid')),
131 'id': reply.get('rpid'),
132 'text': traverse_obj(reply, ('content', 'message')),
133 'timestamp': reply.get('ctime'),
134 'parent': reply.get('parent') or 'root',
135 }
136 for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))):
137 yield from children
138
139 def _get_episodes_from_season(self, ss_id, url):
140 season_info = self._download_json(
141 'https://api.bilibili.com/pgc/web/season/section', ss_id,
142 note='Downloading season info', query={'season_id': ss_id},
143 headers={'Referer': url, **self.geo_verification_headers()})
144
145 for entry in traverse_obj(season_info, (
146 'result', 'main_section', 'episodes',
147 lambda _, v: url_or_none(v['share_url']) and v['id'])):
148 yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}')
149
150
151 class BiliBiliIE(BilibiliBaseIE):
152 _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
153
154 _TESTS = [{
155 'url': 'https://www.bilibili.com/video/BV13x41117TL',
156 'info_dict': {
157 'id': 'BV13x41117TL',
158 'title': '阿滴英文|英文歌分享#6 "Closer',
159 'ext': 'mp4',
160 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
161 'uploader_id': '65880958',
162 'uploader': '阿滴英文',
163 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
164 'duration': 554.117,
165 'tags': list,
166 'comment_count': int,
167 'upload_date': '20170301',
168 'timestamp': 1488353834,
169 'like_count': int,
170 'view_count': int,
171 },
172 }, {
173 # old av URL version
174 'url': 'http://www.bilibili.com/video/av1074402/',
175 'info_dict': {
176 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$',
177 'ext': 'mp4',
178 'uploader': '菊子桑',
179 'uploader_id': '156160',
180 'id': 'BV11x411K7CN',
181 'title': '【金坷垃】金泡沫',
182 'duration': 308.36,
183 'upload_date': '20140420',
184 'timestamp': 1397983878,
185 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
186 'like_count': int,
187 'comment_count': int,
188 'view_count': int,
189 'tags': list,
190 },
191 'params': {'skip_download': True},
192 }, {
193 'note': 'Anthology',
194 'url': 'https://www.bilibili.com/video/BV1bK411W797',
195 'info_dict': {
196 'id': 'BV1bK411W797',
197 'title': '物语中的人物是如何吐槽自己的OP的'
198 },
199 'playlist_count': 18,
200 'playlist': [{
201 'info_dict': {
202 'id': 'BV1bK411W797_p1',
203 'ext': 'mp4',
204 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
205 'tags': 'count:11',
206 'timestamp': 1589601697,
207 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
208 'uploader': '打牌还是打桩',
209 'uploader_id': '150259984',
210 'like_count': int,
211 'comment_count': int,
212 'upload_date': '20200516',
213 'view_count': int,
214 'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
215 'duration': 90.314,
216 }
217 }]
218 }, {
219 'note': 'Specific page of Anthology',
220 'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1',
221 'info_dict': {
222 'id': 'BV1bK411W797_p1',
223 'ext': 'mp4',
224 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
225 'tags': 'count:11',
226 'timestamp': 1589601697,
227 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
228 'uploader': '打牌还是打桩',
229 'uploader_id': '150259984',
230 'like_count': int,
231 'comment_count': int,
232 'upload_date': '20200516',
233 'view_count': int,
234 'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
235 'duration': 90.314,
236 }
237 }, {
238 'note': 'video has subtitles',
239 'url': 'https://www.bilibili.com/video/BV12N4y1M7rh',
240 'info_dict': {
241 'id': 'BV12N4y1M7rh',
242 'ext': 'mp4',
243 'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1',
244 'tags': list,
245 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
246 'duration': 313.557,
247 'upload_date': '20220709',
248 'uploader': '小夫Tech',
249 'timestamp': 1657347907,
250 'uploader_id': '1326814124',
251 'comment_count': int,
252 'view_count': int,
253 'like_count': int,
254 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
255 'subtitles': 'count:2'
256 },
257 'params': {'listsubtitles': True},
258 }, {
259 'url': 'https://www.bilibili.com/video/av8903802/',
260 'info_dict': {
261 'id': 'BV13x41117TL',
262 'ext': 'mp4',
263 'title': '阿滴英文|英文歌分享#6 "Closer',
264 'upload_date': '20170301',
265 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
266 'timestamp': 1488353834,
267 'uploader_id': '65880958',
268 'uploader': '阿滴英文',
269 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
270 'duration': 554.117,
271 'tags': list,
272 'comment_count': int,
273 'view_count': int,
274 'like_count': int,
275 },
276 'params': {
277 'skip_download': True,
278 },
279 }, {
280 'note': 'video has chapter',
281 'url': 'https://www.bilibili.com/video/BV1vL411G7N7/',
282 'info_dict': {
283 'id': 'BV1vL411G7N7',
284 'ext': 'mp4',
285 'title': '如何为你的B站视频添加进度条分段',
286 'timestamp': 1634554558,
287 'upload_date': '20211018',
288 'description': 'md5:a9a3d6702b3a94518d419b2e9c320a6d',
289 'tags': list,
290 'uploader': '爱喝咖啡的当麻',
291 'duration': 669.482,
292 'uploader_id': '1680903',
293 'chapters': 'count:6',
294 'comment_count': int,
295 'view_count': int,
296 'like_count': int,
297 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
298 },
299 'params': {'skip_download': True},
300 }, {
301 'note': 'video redirects to festival page',
302 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h',
303 'info_dict': {
304 'id': 'BV1wP4y1P72h',
305 'ext': 'mp4',
306 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】',
307 'timestamp': 1643947497,
308 'upload_date': '20220204',
309 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6',
310 'uploader': '叨叨冯聊音乐',
311 'duration': 246.719,
312 'uploader_id': '528182630',
313 'view_count': int,
314 'like_count': int,
315 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
316 },
317 'params': {'skip_download': True},
318 }, {
319 'note': 'newer festival video',
320 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f',
321 'info_dict': {
322 'id': 'BV1ay4y1d77f',
323 'ext': 'mp4',
324 'title': '【崩坏3新春剧场】为特别的你送上祝福!',
325 'timestamp': 1674273600,
326 'upload_date': '20230121',
327 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8',
328 'uploader': '果蝇轰',
329 'duration': 1111.722,
330 'uploader_id': '8469526',
331 'view_count': int,
332 'like_count': int,
333 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
334 },
335 'params': {'skip_download': True},
336 }]
337
338 def _real_extract(self, url):
339 video_id = self._match_id(url)
340 webpage = self._download_webpage(url, video_id)
341 initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
342
343 is_festival = 'videoData' not in initial_state
344 if is_festival:
345 video_data = initial_state['videoInfo']
346 else:
347 play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
348 video_data = initial_state['videoData']
349
350 video_id, title = video_data['bvid'], video_data.get('title')
351
352 # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
353 page_list_json = not is_festival and traverse_obj(
354 self._download_json(
355 'https://api.bilibili.com/x/player/pagelist', video_id,
356 fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
357 note='Extracting videos in anthology'),
358 'data', expected_type=list) or []
359 is_anthology = len(page_list_json) > 1
360
361 part_id = int_or_none(parse_qs(url).get('p', [None])[-1])
362 if is_anthology and not part_id and self._yes_playlist(video_id, video_id):
363 return self.playlist_from_matches(
364 page_list_json, video_id, title, ie=BiliBiliIE,
365 getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}')
366
367 if is_anthology:
368 part_id = part_id or 1
369 title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}'
370
371 aid = video_data.get('aid')
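# Keep an archive id in the legacy aid-based form (<aid>_part<N>) so download archives written with the old ids still match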
372 old_video_id = format_field(aid, None, f'%s_part{part_id or 1}')
373
374 cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid')
375
376 festival_info = {}
377 if is_festival:
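# Festival pages do not embed window.__playinfo__, so fetch the formats from the playurl API instead
# (fnval=4048 is assumed here to request the full DASH stream set)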
378 play_info = self._download_json(
379 'https://api.bilibili.com/x/player/playurl', video_id,
380 query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
381 note='Extracting festival video formats')['data']
382
383 festival_info = traverse_obj(initial_state, {
384 'uploader': ('videoInfo', 'upName'),
385 'uploader_id': ('videoInfo', 'upMid', {str_or_none}),
386 'like_count': ('videoStatus', 'like', {int_or_none}),
387 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'),
388 }, get_all=False)
389
390 return {
391 **traverse_obj(initial_state, {
392 'uploader': ('upData', 'name'),
393 'uploader_id': ('upData', 'mid', {str_or_none}),
394 'like_count': ('videoData', 'stat', 'like', {int_or_none}),
395 'tags': ('tags', ..., 'tag_name'),
396 'thumbnail': ('videoData', 'pic', {url_or_none}),
397 }),
398 **festival_info,
399 **traverse_obj(video_data, {
400 'description': 'desc',
401 'timestamp': ('pubdate', {int_or_none}),
402 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}),
403 'comment_count': ('stat', 'reply', {int_or_none}),
404 }, get_all=False),
405 'id': f'{video_id}{format_field(part_id, None, "_p%d")}',
406 'formats': self.extract_formats(play_info),
407 '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None,
408 'title': title,
409 'duration': float_or_none(play_info.get('timelength'), scale=1000),
410 'chapters': self._get_chapters(aid, cid),
411 'subtitles': self.extract_subtitles(video_id, aid, cid),
412 '__post_extractor': self.extract_comments(aid),
413 'http_headers': {'Referer': url},
414 }
415
416
417 class BiliBiliBangumiIE(BilibiliBaseIE):
418 _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)'
419
420 _TESTS = [{
421 'url': 'https://www.bilibili.com/bangumi/play/ep267851',
422 'info_dict': {
423 'id': '267851',
424 'ext': 'mp4',
425 'series': '鬼灭之刃',
426 'series_id': '4358',
427 'season': '鬼灭之刃',
428 'season_id': '26801',
429 'season_number': 1,
430 'episode': '残酷',
431 'episode_id': '267851',
432 'episode_number': 1,
433 'title': '1 残酷',
434 'duration': 1425.256,
435 'timestamp': 1554566400,
436 'upload_date': '20190406',
437 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
438 },
439 'skip': 'According to the copyright owner\'s request, you may only watch the video after you are a premium member.'
440 }]
441
442 def _real_extract(self, url):
443 video_id = self._match_id(url)
444 episode_id = video_id[2:]
445 webpage = self._download_webpage(url, video_id)
446
447 if '您所在的地区无法观看本片' in webpage:
448 raise GeoRestrictedError('This video is restricted')
449 elif '正在观看预览,大会员免费看全片' in webpage:
450 self.raise_login_required('This video is for premium members only')
451
452 headers = {'Referer': url, **self.geo_verification_headers()}
453 play_info = self._download_json(
454 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id,
455 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
456 headers=headers)
457 premium_only = play_info.get('code') == -10403
458 play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {}
459
460 formats = self.extract_formats(play_info)
461 if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage):
462 self.raise_login_required('This video is for premium members only')
463
464 bangumi_info = self._download_json(
465 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details',
466 query={'ep_id': episode_id}, headers=headers)['result']
467
468 episode_number, episode_info = next((
469 (idx, ep) for idx, ep in enumerate(traverse_obj(
470 bangumi_info, ('episodes', ..., {dict})), 1)
471 if str_or_none(ep.get('id')) == episode_id), (1, {}))
472
473 season_id = bangumi_info.get('season_id')
474 season_number = season_id and next((
475 idx + 1 for idx, e in enumerate(
476 traverse_obj(bangumi_info, ('seasons', ...)))
477 if e.get('season_id') == season_id
478 ), None)
479
480 aid = episode_info.get('aid')
481
482 return {
483 'id': video_id,
484 'formats': formats,
485 **traverse_obj(bangumi_info, {
486 'series': ('series', 'series_title', {str}),
487 'series_id': ('series', 'series_id', {str_or_none}),
488 'thumbnail': ('square_cover', {url_or_none}),
489 }),
490 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info),
491 'episode': episode_info.get('long_title'),
492 'episode_id': episode_id,
493 'episode_number': int_or_none(episode_info.get('title')) or episode_number,
494 'season_id': str_or_none(season_id),
495 'season_number': season_number,
496 'timestamp': int_or_none(episode_info.get('pub_time')),
497 'duration': float_or_none(play_info.get('timelength'), scale=1000),
498 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')),
499 '__post_extractor': self.extract_comments(aid),
500 'http_headers': headers,
501 }
502
503
504 class BiliBiliBangumiMediaIE(BilibiliBaseIE):
505 _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)'
506 _TESTS = [{
507 'url': 'https://www.bilibili.com/bangumi/media/md24097891',
508 'info_dict': {
509 'id': '24097891',
510 },
511 'playlist_mincount': 25,
512 }]
513
514 def _real_extract(self, url):
515 media_id = self._match_id(url)
516 webpage = self._download_webpage(url, media_id)
517 ss_id = self._search_json(
518 r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id']
519
520 return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id)
521
522
523 class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
524 _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)'
525 _TESTS = [{
526 'url': 'https://www.bilibili.com/bangumi/play/ss26801',
527 'info_dict': {
528 'id': '26801'
529 },
530 'playlist_mincount': 26
531 }]
532
533 def _real_extract(self, url):
534 ss_id = self._match_id(url)
535
536 return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id)
537
538
539 class BilibiliSpaceBaseIE(InfoExtractor):
540 def _extract_playlist(self, fetch_page, get_metadata, get_entries):
541 first_page = fetch_page(0)
542 metadata = get_metadata(first_page)
543
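# Reuse the already-fetched first page and fetch the remaining pages lazily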
544 paged_list = InAdvancePagedList(
545 lambda idx: get_entries(fetch_page(idx) if idx else first_page),
546 metadata['page_count'], metadata['page_size'])
547
548 return metadata, paged_list
549
550
551 class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
552 _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)(?P<video>/video)?/?(?:[?#]|$)'
553 _TESTS = [{
554 'url': 'https://space.bilibili.com/3985676/video',
555 'info_dict': {
556 'id': '3985676',
557 },
558 'playlist_mincount': 178,
559 }, {
560 'url': 'https://space.bilibili.com/313580179/video',
561 'info_dict': {
562 'id': '313580179',
563 },
564 'playlist_mincount': 92,
565 }]
566
567 def _extract_signature(self, playlist_id):
568 session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False)
569
570 key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0]
571 img_key = traverse_obj(
572 session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100'
573 sub_key = traverse_obj(
574 session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6'
575
576 session_key = img_key + sub_key
577
578 signature_values = []
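# Derive the WBI 'mixed key': pick characters of img_key + sub_key in the fixed order below, keeping the first 32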
579 for position in (
580 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39,
581 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63,
582 57, 62, 11, 36, 20, 34, 44, 52
583 ):
584 char_at_position = try_call(lambda: session_key[position])
585 if char_at_position:
586 signature_values.append(char_at_position)
587
588 return ''.join(signature_values)[:32]
589
590 def _real_extract(self, url):
591 playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video')
592 if not is_video_url:
593 self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. '
594 'To download audio, append "/audio" to the URL')
595
596 signature = self._extract_signature(playlist_id)
597
598 def fetch_page(page_idx):
599 query = {
600 'keyword': '',
601 'mid': playlist_id,
602 'order': 'pubdate',
603 'order_avoided': 'true',
604 'platform': 'web',
605 'pn': page_idx + 1,
606 'ps': 30,
607 'tid': 0,
608 'web_location': 1550101,
609 'wts': int(time.time()),
610 }
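# Sign the request: w_rid is the MD5 of the url-encoded query (the keys above are already in the sorted order the WBI scheme expects) followed by the mixed key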
611 query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest()
612
613 try:
614 response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search',
615 playlist_id, note=f'Downloading page {page_idx}', query=query)
616 except ExtractorError as e:
617 if isinstance(e.cause, HTTPError) and e.cause.status == 412:
618 raise ExtractorError(
619 'Request was blocked by the server (412); please add cookies, wait, and try again later.', expected=True)
620 raise
621 if response['code'] == -401:
622 raise ExtractorError(
623 'Request was blocked by the server (401); please add cookies, wait, and try again later.', expected=True)
624 return response['data']
625
626 def get_metadata(page_data):
627 page_size = page_data['page']['ps']
628 entry_count = page_data['page']['count']
629 return {
630 'page_count': math.ceil(entry_count / page_size),
631 'page_size': page_size,
632 }
633
634 def get_entries(page_data):
635 for entry in traverse_obj(page_data, ('list', 'vlist')) or []:
636 yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', BiliBiliIE, entry['bvid'])
637
638 metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
639 return self.playlist_result(paged_list, playlist_id)
640
641
642 class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE):
643 _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio'
644 _TESTS = [{
645 'url': 'https://space.bilibili.com/313580179/audio',
646 'info_dict': {
647 'id': '313580179',
648 },
649 'playlist_mincount': 1,
650 }]
651
652 def _real_extract(self, url):
653 playlist_id = self._match_id(url)
654
655 def fetch_page(page_idx):
656 return self._download_json(
657 'https://api.bilibili.com/audio/music-service/web/song/upper', playlist_id,
658 note=f'Downloading page {page_idx}',
659 query={'uid': playlist_id, 'pn': page_idx + 1, 'ps': 30, 'order': 1, 'jsonp': 'jsonp'})['data']
660
661 def get_metadata(page_data):
662 return {
663 'page_count': page_data['pageCount'],
664 'page_size': page_data['pageSize'],
665 }
666
667 def get_entries(page_data):
668 for entry in page_data.get('data', []):
669 yield self.url_result(f'https://www.bilibili.com/audio/au{entry["id"]}', BilibiliAudioIE, entry['id'])
670
671 metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
672 return self.playlist_result(paged_list, playlist_id)
673
674
675 class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
676 _VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)'
677 _TESTS = [{
678 'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',
679 'info_dict': {
680 'id': '2142762_57445',
681 'title': '《底特律 变人》'
682 },
683 'playlist_mincount': 31,
684 }]
685
686 def _real_extract(self, url):
687 mid, sid = self._match_valid_url(url).group('mid', 'sid')
688 playlist_id = f'{mid}_{sid}'
689
690 def fetch_page(page_idx):
691 return self._download_json(
692 'https://api.bilibili.com/x/polymer/space/seasons_archives_list',
693 playlist_id, note=f'Downloading page {page_idx}',
694 query={'mid': mid, 'season_id': sid, 'page_num': page_idx + 1, 'page_size': 30})['data']
695
696 def get_metadata(page_data):
697 page_size = page_data['page']['page_size']
698 entry_count = page_data['page']['total']
699 return {
700 'page_count': math.ceil(entry_count / page_size),
701 'page_size': page_size,
702 'title': traverse_obj(page_data, ('meta', 'name'))
703 }
704
705 def get_entries(page_data):
706 for entry in page_data.get('archives', []):
707 yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}',
708 BiliBiliIE, entry['bvid'])
709
710 metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
711 return self.playlist_result(paged_list, playlist_id, metadata['title'])
712
713
714 class BilibiliCategoryIE(InfoExtractor):
715 IE_NAME = 'Bilibili category extractor'
716 _MAX_RESULTS = 1000000
717 _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
718 _TESTS = [{
719 'url': 'https://www.bilibili.com/v/kichiku/mad',
720 'info_dict': {
721 'id': 'kichiku: mad',
722 'title': 'kichiku: mad'
723 },
724 'playlist_mincount': 45,
725 'params': {
726 'playlistend': 45
727 }
728 }]
729
730 def _fetch_page(self, api_url, num_pages, query, page_num):
731 parsed_json = self._download_json(
732 api_url, query, query={'Search_key': query, 'pn': page_num},
733 note='Extracting results from page %s of %s' % (page_num, num_pages))
734
735 video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list)
736 if not video_list:
737 raise ExtractorError('Failed to retrieve video list for page %d' % page_num)
738
739 for video in video_list:
740 yield self.url_result(
741 'https://www.bilibili.com/video/%s' % video['bvid'], 'BiliBili', video['bvid'])
742
743 def _entries(self, category, subcategory, query):
744 # map of categories : subcategories : RIDs
745 rid_map = {
746 'kichiku': {
747 'mad': 26,
748 'manual_vocaloid': 126,
749 'guide': 22,
750 'theatre': 216,
751 'course': 127
752 },
753 }
754
755 if category not in rid_map:
756 raise ExtractorError(
757 f'The category {category} isn\'t supported. Supported categories: {list(rid_map.keys())}')
758 if subcategory not in rid_map[category]:
759 raise ExtractorError(
760 f'The subcategory {subcategory} isn\'t supported for this category. Supported subcategories: {list(rid_map[category].keys())}')
761 rid_value = rid_map[category][subcategory]
762
763 api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
764 page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
765 page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict)
766 count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
767 if count is None or not size:
768 raise ExtractorError('Failed to calculate either page count or size')
769
770 num_pages = math.ceil(count / size)
771
772 return OnDemandPagedList(functools.partial(
773 self._fetch_page, api_url, num_pages, query), size)
774
775 def _real_extract(self, url):
776 category, subcategory = urllib.parse.urlparse(url).path.split('/')[2:4]
777 query = '%s: %s' % (category, subcategory)
778
779 return self.playlist_result(self._entries(category, subcategory, query), query, query)
780
781
782 class BiliBiliSearchIE(SearchInfoExtractor):
783 IE_DESC = 'Bilibili video search'
784 _MAX_RESULTS = 100000
785 _SEARCH_KEY = 'bilisearch'
786
787 def _search_results(self, query):
788 for page_num in itertools.count(1):
789 videos = self._download_json(
790 'https://api.bilibili.com/x/web-interface/search/type', query,
791 note=f'Extracting results from page {page_num}', query={
792 'Search_key': query,
793 'keyword': query,
794 'page': page_num,
795 'context': '',
796 'duration': 0,
797 'tids_2': '',
798 '__refresh__': 'true',
799 'search_type': 'video',
800 'tids': 0,
801 'highlight': 1,
802 })['data'].get('result')
803 if not videos:
804 break
805 for video in videos:
806 yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
807
808
809 class BilibiliAudioBaseIE(InfoExtractor):
810 def _call_api(self, path, sid, query=None):
811 if not query:
812 query = {'sid': sid}
813 return self._download_json(
814 'https://www.bilibili.com/audio/music-service-c/web/' + path,
815 sid, query=query)['data']
816
817
818 class BilibiliAudioIE(BilibiliAudioBaseIE):
819 _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
820 _TEST = {
821 'url': 'https://www.bilibili.com/audio/au1003142',
822 'md5': 'fec4987014ec94ef9e666d4d158ad03b',
823 'info_dict': {
824 'id': '1003142',
825 'ext': 'm4a',
826 'title': '【tsukimi】YELLOW / 神山羊',
827 'artist': 'tsukimi',
828 'comment_count': int,
829 'description': 'YELLOW的mp3版!',
830 'duration': 183,
831 'subtitles': {
832 'origin': [{
833 'ext': 'lrc',
834 }],
835 },
836 'thumbnail': r're:^https?://.+\.jpg',
837 'timestamp': 1564836614,
838 'upload_date': '20190803',
839 'uploader': 'tsukimi-つきみぐー',
840 'view_count': int,
841 },
842 }
843
844 def _real_extract(self, url):
845 au_id = self._match_id(url)
846
847 play_data = self._call_api('url', au_id)
848 formats = [{
849 'url': play_data['cdns'][0],
850 'filesize': int_or_none(play_data.get('size')),
851 'vcodec': 'none'
852 }]
853
854 for a_format in formats:
855 a_format.setdefault('http_headers', {}).update({
856 'Referer': url,
857 })
858
859 song = self._call_api('song/info', au_id)
860 title = song['title']
861 statistic = song.get('statistic') or {}
862
863 subtitles = None
864 lyric = song.get('lyric')
865 if lyric:
866 subtitles = {
867 'origin': [{
868 'url': lyric,
869 }]
870 }
871
872 return {
873 'id': au_id,
874 'title': title,
875 'formats': formats,
876 'artist': song.get('author'),
877 'comment_count': int_or_none(statistic.get('comment')),
878 'description': song.get('intro'),
879 'duration': int_or_none(song.get('duration')),
880 'subtitles': subtitles,
881 'thumbnail': song.get('cover'),
882 'timestamp': int_or_none(song.get('passtime')),
883 'uploader': song.get('uname'),
884 'view_count': int_or_none(statistic.get('play')),
885 }
886
887
888 class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
889 _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
890 _TEST = {
891 'url': 'https://www.bilibili.com/audio/am10624',
892 'info_dict': {
893 'id': '10624',
894 'title': '每日新曲推荐(每日11:00更新)',
895 'description': '每天11:00更新,为你推送最新音乐',
896 },
897 'playlist_count': 19,
898 }
899
900 def _real_extract(self, url):
901 am_id = self._match_id(url)
902
903 songs = self._call_api(
904 'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']
905
906 entries = []
907 for song in songs:
908 sid = str_or_none(song.get('id'))
909 if not sid:
910 continue
911 entries.append(self.url_result(
912 'https://www.bilibili.com/audio/au' + sid,
913 BilibiliAudioIE.ie_key(), sid))
914
915 if entries:
916 album_data = self._call_api('menu/info', am_id) or {}
917 album_title = album_data.get('title')
918 if album_title:
919 for entry in entries:
920 entry['album'] = album_title
921 return self.playlist_result(
922 entries, am_id, album_title, album_data.get('intro'))
923
924 return self.playlist_result(entries, am_id)
925
926
927 class BiliBiliPlayerIE(InfoExtractor):
928 _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)'
929 _TEST = {
930 'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1',
931 'only_matching': True,
932 }
933
934 def _real_extract(self, url):
935 video_id = self._match_id(url)
936 return self.url_result(
937 'http://www.bilibili.tv/video/av%s/' % video_id,
938 ie=BiliBiliIE.ie_key(), video_id=video_id)
939
940
941 class BiliIntlBaseIE(InfoExtractor):
942 _API_URL = 'https://api.bilibili.tv/intl/gateway'
943 _NETRC_MACHINE = 'biliintl'
944
945 def _call_api(self, endpoint, *args, **kwargs):
946 json = self._download_json(self._API_URL + endpoint, *args, **kwargs)
947 if json.get('code'):
948 if json['code'] in (10004004, 10004005, 10023006):
949 self.raise_login_required()
950 elif json['code'] == 10004001:
951 self.raise_geo_restricted()
952 else:
953 if json.get('message') and str(json['code']) != json['message']:
954 errmsg = f'{kwargs.get("errnote", "Unable to download JSON metadata")}: {self.IE_NAME} said: {json["message"]}'
955 else:
956 errmsg = kwargs.get('errnote', 'Unable to download JSON metadata')
957 if kwargs.get('fatal'):
958 raise ExtractorError(errmsg)
959 else:
960 self.report_warning(errmsg)
961 return json.get('data')
962
963 def json2srt(self, json):
964 data = '\n\n'.join(
965 f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}'
966 for i, line in enumerate(traverse_obj(json, (
967 'body', lambda _, l: l['content'] and l['from'] and l['to']))))
968 return data
969
970 def _get_subtitles(self, *, ep_id=None, aid=None):
971 sub_json = self._call_api(
972 '/web/v2/subtitle', ep_id or aid, fatal=False,
973 note='Downloading subtitles list', errnote='Unable to download subtitles list',
974 query=filter_dict({
975 'platform': 'web',
976 's_locale': 'en_US',
977 'episode_id': ep_id,
978 'aid': aid,
979 })) or {}
980 subtitles = {}
981 for sub in sub_json.get('subtitles') or []:
982 sub_url = sub.get('url')
983 if not sub_url:
984 continue
985 sub_data = self._download_json(
986 sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,
987 note='Downloading subtitles%s' % (f' for {sub["lang"]}' if sub.get('lang') else ''))
988 if not sub_data:
989 continue
990 subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
991 'ext': 'srt',
992 'data': self.json2srt(sub_data)
993 })
994 return subtitles
995
996 def _get_formats(self, *, ep_id=None, aid=None):
997 video_json = self._call_api(
998 '/web/playurl', ep_id or aid, note='Downloading video formats',
999 errnote='Unable to download video formats', query=filter_dict({
1000 'platform': 'web',
1001 'ep_id': ep_id,
1002 'aid': aid,
1003 }))
1004 video_json = video_json['playurl']
1005 formats = []
1006 for vid in video_json.get('video') or []:
1007 video_res = vid.get('video_resource') or {}
1008 video_info = vid.get('stream_info') or {}
1009 if not video_res.get('url'):
1010 continue
1011 formats.append({
1012 'url': video_res['url'],
1013 'ext': 'mp4',
1014 'format_note': video_info.get('desc_words'),
1015 'width': video_res.get('width'),
1016 'height': video_res.get('height'),
1017 'vbr': video_res.get('bandwidth'),
1018 'acodec': 'none',
1019 'vcodec': video_res.get('codecs'),
1020 'filesize': video_res.get('size'),
1021 })
1022 for aud in video_json.get('audio_resource') or []:
1023 if not aud.get('url'):
1024 continue
1025 formats.append({
1026 'url': aud['url'],
1027 'ext': 'mp4',
1028 'abr': aud.get('bandwidth'),
1029 'acodec': aud.get('codecs'),
1030 'vcodec': 'none',
1031 'filesize': aud.get('size'),
1032 })
1033
1034 return formats
1035
1036 def _parse_video_metadata(self, video_data):
1037 return {
1038 'title': video_data.get('title_display') or video_data.get('title'),
1039 'thumbnail': video_data.get('cover'),
1040 'episode_number': int_or_none(self._search_regex(
1041 r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
1042 }
1043
1044 def _perform_login(self, username, password):
1045 if not Cryptodome.RSA:
1046 raise ExtractorError('pycryptodomex not found. Please install', expected=True)
1047
1048 key_data = self._download_json(
1049 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None,
1050 note='Downloading login key', errnote='Unable to download login key')['data']
1051
1052 public_key = Cryptodome.RSA.importKey(key_data['key'])
1053 password_hash = Cryptodome.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
1054 login_post = self._download_json(
1055 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
1056 'username': username,
1057 'password': base64.b64encode(password_hash).decode('ascii'),
1058 'keep_me': 'true',
1059 's_locale': 'en_US',
1060 'isTrusted': 'true'
1061 }), note='Logging in', errnote='Unable to log in')
1062 if login_post.get('code'):
1063 if login_post.get('message'):
1064 raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True)
1065 else:
1066 raise ExtractorError('Unable to log in')
1067
1068
1069 class BiliIntlIE(BiliIntlBaseIE):
1070 _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'
1071 _TESTS = [{
1072 # Bstation page
1073 'url': 'https://www.bilibili.tv/en/play/34613/341736',
1074 'info_dict': {
1075 'id': '341736',
1076 'ext': 'mp4',
1077 'title': 'E2 - The First Night',
1078 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
1079 'episode_number': 2,
1080 'upload_date': '20201009',
1081 'episode': 'Episode 2',
1082 'timestamp': 1602259500,
1083 'description': 'md5:297b5a17155eb645e14a14b385ab547e',
1084 'chapters': [{
1085 'start_time': 0,
1086 'end_time': 76.242,
1087 'title': '<Untitled Chapter 1>'
1088 }, {
1089 'start_time': 76.242,
1090 'end_time': 161.161,
1091 'title': 'Intro'
1092 }, {
1093 'start_time': 1325.742,
1094 'end_time': 1403.903,
1095 'title': 'Outro'
1096 }],
1097 }
1098 }, {
1099 # Non-Bstation page
1100 'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
1101 'info_dict': {
1102 'id': '11005006',
1103 'ext': 'mp4',
1104 'title': 'E3 - Who?',
1105 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
1106 'episode_number': 3,
1107 'description': 'md5:e1a775e71a35c43f141484715470ad09',
1108 'episode': 'Episode 3',
1109 'upload_date': '20211219',
1110 'timestamp': 1639928700,
1111 'chapters': [{
1112 'start_time': 0,
1113 'end_time': 88.0,
1114 'title': '<Untitled Chapter 1>'
1115 }, {
1116 'start_time': 88.0,
1117 'end_time': 156.0,
1118 'title': 'Intro'
1119 }, {
1120 'start_time': 1173.0,
1121 'end_time': 1259.535,
1122 'title': 'Outro'
1123 }],
1124 }
1125 }, {
1126 # Subtitle with empty content
1127 'url': 'https://www.bilibili.tv/en/play/1005144/10131790',
1128 'info_dict': {
1129 'id': '10131790',
1130 'ext': 'mp4',
1131 'title': 'E140 - Two Heartbeats: Kabuto\'s Trap',
1132 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
1133 'episode_number': 140,
1134 },
1135 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
1136 }, {
1137 'url': 'https://www.bilibili.tv/en/video/2041863208',
1138 'info_dict': {
1139 'id': '2041863208',
1140 'ext': 'mp4',
1141 'timestamp': 1670874843,
1142 'description': 'Scheduled for April 2023.\nStudio: ufotable',
1143 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
1144 'upload_date': '20221212',
1145 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
1146 },
1147 }, {
1148 # episode comment extraction
1149 'url': 'https://www.bilibili.tv/en/play/34580/340317',
1150 'info_dict': {
1151 'id': '340317',
1152 'ext': 'mp4',
1153 'timestamp': 1604057820,
1154 'upload_date': '20201030',
1155 'episode_number': 5,
1156 'title': 'E5 - My Own Steel',
1157 'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2',
1158 'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$',
1159 'episode': 'Episode 5',
1160 'comment_count': int,
1161 'chapters': [{
1162 'start_time': 0,
1163 'end_time': 61.0,
1164 'title': '<Untitled Chapter 1>'
1165 }, {
1166 'start_time': 61.0,
1167 'end_time': 134.0,
1168 'title': 'Intro'
1169 }, {
1170 'start_time': 1290.0,
1171 'end_time': 1379.0,
1172 'title': 'Outro'
1173 }],
1174 },
1175 'params': {
1176 'getcomments': True
1177 }
1178 }, {
1179 # user generated content comment extraction
1180 'url': 'https://www.bilibili.tv/en/video/2045730385',
1181 'info_dict': {
1182 'id': '2045730385',
1183 'ext': 'mp4',
1184 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
1185 'timestamp': 1667891924,
1186 'upload_date': '20221108',
1187 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
1188 'comment_count': int,
1189 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
1190 },
1191 'params': {
1192 'getcomments': True
1193 }
1194 }, {
1195 # episode id without intro and outro
1196 'url': 'https://www.bilibili.tv/en/play/1048837/11246489',
1197 'info_dict': {
1198 'id': '11246489',
1199 'ext': 'mp4',
1200 'title': 'E1 - Operation \'Strix\' <Owl>',
1201 'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17',
1202 'timestamp': 1649516400,
1203 'thumbnail': 'https://pic.bstarstatic.com/ogv/62cb1de23ada17fb70fbe7bdd6ff29c29da02a64.png',
1204 'episode': 'Episode 1',
1205 'episode_number': 1,
1206 'upload_date': '20220409',
1207 },
1208 }, {
1209 'url': 'https://www.biliintl.com/en/play/34613/341736',
1210 'only_matching': True,
1211 }, {
1212 # User-generated content (as opposed to a series licensed from a studio)
1213 'url': 'https://bilibili.tv/en/video/2019955076',
1214 'only_matching': True,
1215 }, {
1216 # No language in URL
1217 'url': 'https://www.bilibili.tv/video/2019955076',
1218 'only_matching': True,
1219 }, {
1220 # Uppercase language in URL
1221 'url': 'https://www.bilibili.tv/EN/video/2019955076',
1222 'only_matching': True,
1223 }]
1224
1225 def _make_url(video_id, series_id=None):
1226 if series_id:
1227 return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}'
1228 return f'https://www.bilibili.tv/en/video/{video_id}'
1229
1230 def _extract_video_metadata(self, url, video_id, season_id):
1231 url, smuggled_data = unsmuggle_url(url, {})
1232 if smuggled_data.get('title'):
1233 return smuggled_data
1234
1235 webpage = self._download_webpage(url, video_id)
1236 # Bstation layout
1237 initial_data = (
1238 self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={})
1239 or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None))
1240 video_data = traverse_obj(
1241 initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) or {}
1242
1243 if season_id and not video_data:
1244 # Non-Bstation layout, read through episode list
1245 season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
1246 video_data = traverse_obj(season_json, (
1247 'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id
1248 ), expected_type=dict, get_all=False)
1249
1250 # XXX: webpage metadata may not be accurate; it is only used so extraction does not crash when video_data is not found
1251 return merge_dicts(
1252 self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
1253 'title': self._html_search_meta('og:title', webpage),
1254 'description': self._html_search_meta('og:description', webpage)
1255 })
1256
1257 def _get_comments_reply(self, root_id, next_id=0, display_id=None):
1258 comment_api_raw_data = self._download_json(
1259 'https://api.bilibili.tv/reply/web/detail', display_id,
1260 note=f'Downloading reply comment of {root_id} - {next_id}',
1261 query={
1262 'platform': 'web',
1263 'ps': 20, # replies per page for a single comment (default: 3)
1264 'root': root_id,
1265 'next': next_id,
1266 })
1267
1268 for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
1269 yield {
1270 'author': traverse_obj(replies, ('member', 'name')),
1271 'author_id': traverse_obj(replies, ('member', 'mid')),
1272 'author_thumbnail': traverse_obj(replies, ('member', 'face')),
1273 'text': traverse_obj(replies, ('content', 'message')),
1274 'id': replies.get('rpid'),
1275 'like_count': int_or_none(replies.get('like_count')),
1276 'parent': replies.get('parent'),
1277 'timestamp': unified_timestamp(replies.get('ctime_text'))
1278 }
1279
1280 if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
1281 yield from self._get_comments_reply(
1282 root_id, comment_api_raw_data['data']['cursor']['next'], display_id)
1283
1284 def _get_comments(self, video_id, ep_id):
1285 for i in itertools.count(0):
1286 comment_api_raw_data = self._download_json(
1287 'https://api.bilibili.tv/reply/web/root', video_id,
1288 note=f'Downloading comment page {i + 1}',
1289 query={
1290 'platform': 'web',
1291 'pn': i, # page number
1292 'ps': 20, # comments per page (default: 20)
1293 'oid': video_id,
1294 'type': 3 if ep_id else 1, # 1: user generated content, 3: series content
1295 'sort_type': 1, # 1: best, 2: recent
1296 })
1297
1298 for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
1299 yield {
1300 'author': traverse_obj(replies, ('member', 'name')),
1301 'author_id': traverse_obj(replies, ('member', 'mid')),
1302 'author_thumbnail': traverse_obj(replies, ('member', 'face')),
1303 'text': traverse_obj(replies, ('content', 'message')),
1304 'id': replies.get('rpid'),
1305 'like_count': int_or_none(replies.get('like_count')),
1306 'timestamp': unified_timestamp(replies.get('ctime_text')),
1307 'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))),
1308 }
1309 if replies.get('count'):
1310 yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id)
1311
1312 if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
1313 break
1314
1315 def _real_extract(self, url):
1316 season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
1317 video_id = ep_id or aid
1318 chapters = None
1319
1320 if ep_id:
1321 intro_ending_json = self._call_api(
1322 f'/web/v2/ogv/play/episode?episode_id={ep_id}&platform=web',
1323 video_id, fatal=False) or {}
1324 if intro_ending_json.get('skip'):
1325 # FIXME: start and end times seem to be off by a few seconds even when they are correct per ogv.*.js
1326 # ref: https://p.bstarstatic.com/fe-static/bstar-web-new/assets/ogv.2b147442.js
1327 chapters = [{
1328 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_start_time')), 1000),
1329 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_end_time')), 1000),
1330 'title': 'Intro'
1331 }, {
1332 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_start_time')), 1000),
1333 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_end_time')), 1000),
1334 'title': 'Outro'
1335 }]
1336
1337 return {
1338 'id': video_id,
1339 **self._extract_video_metadata(url, video_id, season_id),
1340 'formats': self._get_formats(ep_id=ep_id, aid=aid),
1341 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
1342 'chapters': chapters,
1343 '__post_extractor': self.extract_comments(video_id, ep_id)
1344 }
1345
1346
1347 class BiliIntlSeriesIE(BiliIntlBaseIE):
1348 IE_NAME = 'biliIntl:series'
1349 _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(?:play|media)/(?P<id>\d+)/?(?:[?#]|$)'
1350 _TESTS = [{
1351 'url': 'https://www.bilibili.tv/en/play/34613',
1352 'playlist_mincount': 15,
1353 'info_dict': {
1354 'id': '34613',
1355 'title': 'TONIKAWA: Over the Moon For You',
1356 'description': 'md5:297b5a17155eb645e14a14b385ab547e',
1357 'categories': ['Slice of life', 'Comedy', 'Romance'],
1358 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
1359 'view_count': int,
1360 },
1361 'params': {
1362 'skip_download': True,
1363 },
1364 }, {
1365 'url': 'https://www.bilibili.tv/en/media/1048837',
1366 'info_dict': {
1367 'id': '1048837',
1368 'title': 'SPY×FAMILY',
1369 'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17',
1370 'categories': ['Adventure', 'Action', 'Comedy'],
1371 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.jpg$',
1372 'view_count': int,
1373 },
1374 'playlist_mincount': 25,
1375 }, {
1376 'url': 'https://www.biliintl.com/en/play/34613',
1377 'only_matching': True,
1378 }, {
1379 'url': 'https://www.biliintl.com/EN/play/34613',
1380 'only_matching': True,
1381 }]
1382
1383 def _entries(self, series_id):
1384 series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
1385 for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict):
1386 episode_id = str(episode['episode_id'])
1387 yield self.url_result(smuggle_url(
1388 BiliIntlIE._make_url(episode_id, series_id),
1389 self._parse_video_metadata(episode)
1390 ), BiliIntlIE, episode_id)
1391
1392 def _real_extract(self, url):
1393 series_id = self._match_id(url)
1394 series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {}
1395 return self.playlist_result(
1396 self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
1397 categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),
1398 thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))
1399
1400
1401 class BiliLiveIE(InfoExtractor):
1402 _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)'
1403
1404 _TESTS = [{
1405 'url': 'https://live.bilibili.com/196',
1406 'info_dict': {
1407 'id': '33989',
1408 'description': "周六杂谈回,其他时候随机游戏。 | \n录播:@下播型泛式录播组。 | \n直播通知群(全员禁言):666906670,902092584,59971⑧481 (功能一样,别多加)",
1409 'ext': 'flv',
1410 'title': "太空狼人杀联动,不被爆杀就算赢",
1411 'thumbnail': "https://i0.hdslb.com/bfs/live/new_room_cover/e607bc1529057ef4b332e1026e62cf46984c314d.jpg",
1412 'timestamp': 1650802769,
1413 },
1414 'skip': 'not live'
1415 }, {
1416 'url': 'https://live.bilibili.com/196?broadcast_type=0&is_room_feed=1?spm_id_from=333.999.space_home.strengthen_live_card.click',
1417 'only_matching': True
1418 }, {
1419 'url': 'https://live.bilibili.com/blanc/196',
1420 'only_matching': True
1421 }]
1422
1423 _FORMATS = {
1424 80: {'format_id': 'low', 'format_note': '流畅'},
1425 150: {'format_id': 'high_res', 'format_note': '高清'},
1426 250: {'format_id': 'ultra_high_res', 'format_note': '超清'},
1427 400: {'format_id': 'blue_ray', 'format_note': '蓝光'},
1428 10000: {'format_id': 'source', 'format_note': '原画'},
1429 20000: {'format_id': '4K', 'format_note': '4K'},
1430 30000: {'format_id': 'dolby', 'format_note': '杜比'},
1431 }
1432
1433 _quality = staticmethod(qualities(list(_FORMATS)))
1434
1435 def _call_api(self, path, room_id, query):
1436 api_result = self._download_json(f'https://api.live.bilibili.com/{path}', room_id, query=query)
1437 if api_result.get('code') != 0:
1438 raise ExtractorError(api_result.get('message') or 'Unable to download JSON metadata')
1439 return api_result.get('data') or {}
1440
1441 def _parse_formats(self, qn, fmt):
1442 for codec in fmt.get('codec') or []:
1443 if codec.get('current_qn') != qn:
1444 continue
1445 for url_info in codec['url_info']:
1446 yield {
1447 'url': f'{url_info["host"]}{codec["base_url"]}{url_info["extra"]}',
1448 'ext': fmt.get('format_name'),
1449 'vcodec': codec.get('codec_name'),
1450 'quality': self._quality(qn),
1451 **self._FORMATS[qn],
1452 }
1453
1454 def _real_extract(self, url):
1455 room_id = self._match_id(url)
1456 room_data = self._call_api('room/v1/Room/get_info', room_id, {'id': room_id})
1457 if room_data.get('live_status') == 0:
1458 raise ExtractorError('Streamer is not live', expected=True)
1459
1460 formats = []
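# Request play info once per known quality value (qn); each response may contain several codec/CDN variants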
1461 for qn in self._FORMATS.keys():
1462 stream_data = self._call_api('xlive/web-room/v2/index/getRoomPlayInfo', room_id, {
1463 'room_id': room_id,
1464 'qn': qn,
1465 'codec': '0,1',
1466 'format': '0,2',
1467 'mask': '0',
1468 'no_playurl': '0',
1469 'platform': 'web',
1470 'protocol': '0,1',
1471 })
1472 for fmt in traverse_obj(stream_data, ('playurl_info', 'playurl', 'stream', ..., 'format', ...)) or []:
1473 formats.extend(self._parse_formats(qn, fmt))
1474
1475 return {
1476 'id': room_id,
1477 'title': room_data.get('title'),
1478 'description': room_data.get('description'),
1479 'thumbnail': room_data.get('user_cover'),
1480 'timestamp': stream_data.get('live_time'),
1481 'formats': formats,
1482 'is_live': True,
1483 'http_headers': {
1484 'Referer': url,
1485 },
1486 }