]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bilibili.py
[ie/BiliBiliBangumi] Fix extractors (#7337)
[yt-dlp.git] / yt_dlp / extractor / bilibili.py
CommitLineData
cfcf60ea 1import base64
c34f505b 2import functools
6f10cdcf 3import hashlib
ad974876 4import itertools
c34f505b 5import math
6f10cdcf 6import time
ad974876
L
7import urllib.error
8import urllib.parse
28746fbd 9
06167fbb 10from .common import InfoExtractor, SearchInfoExtractor
f6a765ce 11from ..dependencies import Cryptodome
28746fbd 12from ..utils import (
bd8f48c7 13 ExtractorError,
ad974876 14 GeoRestrictedError,
2b9d0216
L
15 InAdvancePagedList,
16 OnDemandPagedList,
f5f15c99 17 filter_dict,
6461f2b7 18 float_or_none,
ad974876 19 format_field,
2b9d0216 20 int_or_none,
bdd0b75e 21 join_nonempty,
ad974876 22 make_archive_id,
d37422f1 23 merge_dicts,
f8580bf0 24 mimetype2ext,
2b9d0216 25 parse_count,
ad974876 26 parse_qs,
b4f53662 27 qualities,
26fdfc37 28 smuggle_url,
efc947fb 29 srt_subtitles_timecode,
4bc15a68 30 str_or_none,
2b9d0216 31 traverse_obj,
6f10cdcf 32 try_call,
b093c38c 33 unified_timestamp,
26fdfc37 34 unsmuggle_url,
c62ecf0d 35 url_or_none,
ad974876 36 urlencode_postdata,
28746fbd
PH
37)
38
39
ad974876
L
class BilibiliBaseIE(InfoExtractor):
    """Helpers shared by the Bilibili video extractors: formats, subtitles, chapters, comments."""

    def extract_formats(self, play_info):
        """Build the formats list from the ``dash`` section of a play-info payload.

        Audio streams (plus the optional FLAC stream) are listed first, followed by
        the video streams. When a quality listed under ``support_formats`` has no
        matching video stream, a hint about logging in is printed.
        """
        format_names = {
            r['quality']: traverse_obj(r, 'new_description', 'display_desc')
            for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
        }

        audios = traverse_obj(play_info, ('dash', 'audio', ...))
        flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
        if flac_audio:
            audios.append(flac_audio)
        formats = [{
            'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
            'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
            'acodec': audio.get('codecs'),
            'vcodec': 'none',
            'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
            'filesize': int_or_none(audio.get('size'))
        } for audio in audios]

        formats.extend({
            'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'),
            'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')),
            'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')),
            'width': int_or_none(video.get('width')),
            'height': int_or_none(video.get('height')),
            'vcodec': video.get('codecs'),
            # Video streams are only marked audio-less when separate audio streams exist
            'acodec': 'none' if audios else None,
            'tbr': float_or_none(video.get('bandwidth'), scale=1000),
            'filesize': int_or_none(video.get('size')),
            'quality': int_or_none(video.get('id')),
            'format': format_names.get(video.get('id')),
        } for video in traverse_obj(play_info, ('dash', 'video', ...)))

        missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality')))
        if missing_formats:
            # A quality may come without any description; str.join() would raise
            # TypeError on None, so fall back to the numeric quality id.
            missing_names = ', '.join(str(format_names[i] or i) for i in missing_formats)
            self.to_screen(f'Format(s) {missing_names} are missing; '
                           f'you have to login or become premium member to download them. {self._login_hint()}')

        return formats

    def json2srt(self, json_data):
        """Convert a subtitle JSON document (cues listed under ``body``) into SRT text.

        Each cue must provide ``from``, ``to`` and ``content``. A missing or empty
        ``body`` yields an empty string.
        """
        return ''.join(
            f'{idx}\n'
            f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n'
            f'{line["content"]}\n\n'
            for idx, line in enumerate(json_data.get('body') or [], 1))

    def _get_subtitles(self, video_id, aid, cid):
        """Return the subtitles dict: the danmaku XML plus every subtitle track converted to SRT."""
        subtitles = {
            'danmaku': [{
                'ext': 'xml',
                'url': f'https://comment.bilibili.com/{cid}.xml',
            }]
        }

        video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id)
        # Skip tracks lacking a language code or URL instead of failing the whole
        # extraction with a KeyError.
        for s in traverse_obj(video_info_json, (
                'data', 'subtitle', 'subtitles', lambda _, v: v['subtitle_url'] and v['lan'])):
            subtitles.setdefault(s['lan'], []).append({
                'ext': 'srt',
                'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
            })
        return subtitles

    def _get_chapters(self, aid, cid):
        """Return the chapter list built from ``view_points``, or None when there is none.

        No request is made unless both ``aid`` and ``cid`` are truthy; a failed
        download is non-fatal.
        """
        chapters = aid and cid and self._download_json(
            'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid},
            note='Extracting chapters', fatal=False)
        return traverse_obj(chapters, ('data', 'view_points', ..., {
            'title': 'content',
            'start_time': 'from',
            'end_time': 'to',
        })) or None

    def _get_comments(self, aid):
        """Yield comments page by page, stopping at the first page without replies."""
        for idx in itertools.count(1):
            replies = traverse_obj(
                self._download_json(
                    f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
                    aid, note=f'Extracting comments from page {idx}', fatal=False),
                ('data', 'replies'))
            if not replies:
                return
            for children in map(self._get_all_children, replies):
                yield from children

    def _get_all_children(self, reply):
        """Yield ``reply`` as a comment dict, then all of its nested replies depth-first."""
        yield {
            'author': traverse_obj(reply, ('member', 'uname')),
            'author_id': traverse_obj(reply, ('member', 'mid')),
            'id': reply.get('rpid'),
            'text': traverse_obj(reply, ('content', 'message')),
            'timestamp': reply.get('ctime'),
            'parent': reply.get('parent') or 'root',
        }
        for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))):
            yield from children

    def _get_episodes_from_season(self, ss_id, url):
        """Yield a url_result for each main-section episode of season ``ss_id``.

        Episodes without a valid ``share_url`` or without an ``id`` are skipped.
        """
        season_info = self._download_json(
            'https://api.bilibili.com/pgc/web/season/section', ss_id,
            note='Downloading season info', query={'season_id': ss_id},
            headers={'Referer': url, **self.geo_verification_headers()})

        for entry in traverse_obj(season_info, (
                'result', 'main_section', 'episodes',
                lambda _, v: url_or_none(v['share_url']) and v['id'])):
            yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}')
149
ad974876
L
150
class BiliBiliIE(BilibiliBaseIE):
    """Extractor for regular bilibili.com video pages and festival pages carrying a ``bvid``."""

    _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'

    _TESTS = [{
        'url': 'https://www.bilibili.com/video/BV13x41117TL',
        'info_dict': {
            'id': 'BV13x41117TL',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'ext': 'mp4',
            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
            'uploader_id': '65880958',
            'uploader': '阿滴英文',
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'duration': 554.117,
            'tags': list,
            'comment_count': int,
            'upload_date': '20170301',
            'timestamp': 1488353834,
            'like_count': int,
            'view_count': int,
        },
    }, {
        # old av URL version
        'url': 'http://www.bilibili.com/video/av1074402/',
        'info_dict': {
            'thumbnail': r're:^https?://.*\.(jpg|jpeg)$',
            'ext': 'mp4',
            'uploader': '菊子桑',
            'uploader_id': '156160',
            'id': 'BV11x411K7CN',
            'title': '【金坷垃】金泡沫',
            'duration': 308.36,
            'upload_date': '20140420',
            'timestamp': 1397983878,
            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
            'like_count': int,
            'comment_count': int,
            'view_count': int,
            'tags': list,
        },
        'params': {'skip_download': True},
    }, {
        'note': 'Anthology',
        'url': 'https://www.bilibili.com/video/BV1bK411W797',
        'info_dict': {
            'id': 'BV1bK411W797',
            'title': '物语中的人物是如何吐槽自己的OP的'
        },
        'playlist_count': 18,
        'playlist': [{
            'info_dict': {
                'id': 'BV1bK411W797_p1',
                'ext': 'mp4',
                'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
                'tags': 'count:11',
                'timestamp': 1589601697,
                'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
                'uploader': '打牌还是打桩',
                'uploader_id': '150259984',
                'like_count': int,
                'comment_count': int,
                'upload_date': '20200516',
                'view_count': int,
                'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
                'duration': 90.314,
            }
        }]
    }, {
        'note': 'Specific page of Anthology',
        'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1',
        'info_dict': {
            'id': 'BV1bK411W797_p1',
            'ext': 'mp4',
            'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
            'tags': 'count:11',
            'timestamp': 1589601697,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'uploader': '打牌还是打桩',
            'uploader_id': '150259984',
            'like_count': int,
            'comment_count': int,
            'upload_date': '20200516',
            'view_count': int,
            'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
            'duration': 90.314,
        }
    }, {
        'note': 'video has subtitles',
        'url': 'https://www.bilibili.com/video/BV12N4y1M7rh',
        'info_dict': {
            'id': 'BV12N4y1M7rh',
            'ext': 'mp4',
            'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1',
            'tags': list,
            'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
            'duration': 313.557,
            'upload_date': '20220709',
            'uploader': '小夫Tech',
            'timestamp': 1657347907,
            'uploader_id': '1326814124',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'subtitles': 'count:2'
        },
        'params': {'listsubtitles': True},
    }, {
        'url': 'https://www.bilibili.com/video/av8903802/',
        'info_dict': {
            'id': 'BV13x41117TL',
            'ext': 'mp4',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'upload_date': '20170301',
            'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
            'timestamp': 1488353834,
            'uploader_id': '65880958',
            'uploader': '阿滴英文',
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'duration': 554.117,
            'tags': list,
            'comment_count': int,
            'view_count': int,
            'like_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'note': 'video has chapter',
        'url': 'https://www.bilibili.com/video/BV1vL411G7N7/',
        'info_dict': {
            'id': 'BV1vL411G7N7',
            'ext': 'mp4',
            'title': '如何为你的B站视频添加进度条分段',
            'timestamp': 1634554558,
            'upload_date': '20211018',
            'description': 'md5:a9a3d6702b3a94518d419b2e9c320a6d',
            'tags': list,
            'uploader': '爱喝咖啡的当麻',
            'duration': 669.482,
            'uploader_id': '1680903',
            'chapters': 'count:6',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
        },
        'params': {'skip_download': True},
    }, {
        'note': 'video redirects to festival page',
        'url': 'https://www.bilibili.com/video/BV1wP4y1P72h',
        'info_dict': {
            'id': 'BV1wP4y1P72h',
            'ext': 'mp4',
            'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】',
            'timestamp': 1643947497,
            'upload_date': '20220204',
            'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6',
            'uploader': '叨叨冯聊音乐',
            'duration': 246.719,
            'uploader_id': '528182630',
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
        },
        'params': {'skip_download': True},
    }, {
        'note': 'newer festival video',
        'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f',
        'info_dict': {
            'id': 'BV1ay4y1d77f',
            'ext': 'mp4',
            'title': '【崩坏3新春剧场】为特别的你送上祝福!',
            'timestamp': 1674273600,
            'upload_date': '20230121',
            'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8',
            'uploader': '果蝇轰',
            'duration': 1111.722,
            'uploader_id': '8469526',
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
        },
        'params': {'skip_download': True},
    }]

    def _real_extract(self, url):
        """Extract one video, or return a playlist when the URL is an anthology without ``p=``."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)

        # Pages whose initial state has no 'videoData' are handled as festival pages;
        # their play info is fetched from the API further below.
        is_festival = 'videoData' not in initial_state
        if not is_festival:
            play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
            video_data = initial_state['videoData']
        else:
            video_data = initial_state['videoInfo']

        video_id = video_data['bvid']
        title = video_data.get('title')

        # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
        pages = []
        if not is_festival:
            pagelist_response = self._download_json(
                'https://api.bilibili.com/x/player/pagelist', video_id,
                fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
                note='Extracting videos in anthology')
            pages = traverse_obj(pagelist_response, 'data', expected_type=list) or []
        is_anthology = len(pages) > 1

        part_id = int_or_none(parse_qs(url).get('p', [None])[-1])
        if is_anthology and not part_id and self._yes_playlist(video_id, video_id):
            return self.playlist_from_matches(
                pages, video_id, title, ie=BiliBiliIE,
                getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}')

        if is_anthology:
            part_id = part_id or 1
            part_name = traverse_obj(pages, (part_id - 1, "part")) or ""
            title += f' p{part_id:02d} {part_name}'

        aid = video_data.get('aid')
        old_video_id = format_field(aid, None, f'%s_part{part_id or 1}')

        if part_id:
            cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid'))
        else:
            cid = video_data.get('cid')

        festival_info = {}
        if is_festival:
            play_info = self._download_json(
                'https://api.bilibili.com/x/player/playurl', video_id,
                query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
                note='Extracting festival video formats')['data']

            festival_info = traverse_obj(initial_state, {
                'uploader': ('videoInfo', 'upName'),
                'uploader_id': ('videoInfo', 'upMid', {str_or_none}),
                'like_count': ('videoStatus', 'like', {int_or_none}),
                'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'),
            }, get_all=False)

        regular_info = traverse_obj(initial_state, {
            'uploader': ('upData', 'name'),
            'uploader_id': ('upData', 'mid', {str_or_none}),
            'like_count': ('videoData', 'stat', 'like', {int_or_none}),
            'tags': ('tags', ..., 'tag_name'),
            'thumbnail': ('videoData', 'pic', {url_or_none}),
        })
        common_info = traverse_obj(video_data, {
            'description': 'desc',
            'timestamp': ('pubdate', {int_or_none}),
            'view_count': (('viewCount', ('stat', 'view')), {int_or_none}),
            'comment_count': ('stat', 'reply', {int_or_none}),
        }, get_all=False)

        # Later entries win: festival fields override the regular ones.
        return {
            **regular_info,
            **festival_info,
            **common_info,
            'id': f'{video_id}{format_field(part_id, None, "_p%d")}',
            'formats': self.extract_formats(play_info),
            '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None,
            'title': title,
            'duration': float_or_none(play_info.get('timelength'), scale=1000),
            'chapters': self._get_chapters(aid, cid),
            'subtitles': self.extract_subtitles(video_id, aid, cid),
            '__post_extractor': self.extract_comments(aid),
            'http_headers': {'Referer': url},
        }
277d6ff5 415
06167fbb 416
class BiliBiliBangumiIE(BilibiliBaseIE):
    """Extractor for a single bangumi episode (``/bangumi/play/ep<N>``)."""

    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)'

    _TESTS = [{
        'url': 'https://www.bilibili.com/bangumi/play/ep267851',
        'info_dict': {
            'id': '267851',
            'ext': 'mp4',
            'series': '鬼灭之刃',
            'series_id': '4358',
            'season': '鬼灭之刃',
            'season_id': '26801',
            'season_number': 1,
            'episode': '残酷',
            'episode_id': '267851',
            'episode_number': 1,
            'title': '1 残酷',
            'duration': 1425.256,
            'timestamp': 1554566400,
            'upload_date': '20190406',
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
        },
        'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.'
    }]

    def _real_extract(self, url):
        """Extract formats and episode/season metadata for one episode."""
        video_id = self._match_id(url)
        # The matched id has the form 'ep<digits>'; the API expects only the digits
        episode_id = video_id[2:]
        webpage = self._download_webpage(url, video_id)

        if '您所在的地区无法观看本片' in webpage:
            raise GeoRestrictedError('This video is restricted')
        elif '正在观看预览,大会员免费看全片' in webpage:
            self.raise_login_required('This video is for premium members only')

        headers = {'Referer': url, **self.geo_verification_headers()}
        playurl_response = self._download_json(
            'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id,
            'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
            headers=headers)
        premium_only = playurl_response.get('code') == -10403
        play_info = traverse_obj(playurl_response, ('result', 'video_info', {dict})) or {}

        formats = self.extract_formats(play_info)
        premium_page = '成为大会员抢先看' in webpage or '开通大会员观看' in webpage
        if not formats and (premium_only or premium_page):
            self.raise_login_required('This video is for premium members only')

        bangumi_info = self._download_json(
            'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details',
            query={'ep_id': episode_id}, headers=headers)['result']

        # Find this episode in the season listing; default to position 1 with no details
        episode_number, episode_info = 1, {}
        for position, candidate in enumerate(traverse_obj(bangumi_info, ('episodes', ..., {dict})), 1):
            if str_or_none(candidate.get('id')) == episode_id:
                episode_number, episode_info = position, candidate
                break

        # The season number is the 1-based position of this season among 'seasons'
        season_id = bangumi_info.get('season_id')
        season_number = season_id
        if season_id:
            season_number = next((
                position for position, season in enumerate(traverse_obj(bangumi_info, ('seasons', ...)), 1)
                if season.get('season_id') == season_id), None)

        aid = episode_info.get('aid')

        return {
            'id': video_id,
            'formats': formats,
            **traverse_obj(bangumi_info, {
                'series': ('series', 'series_title', {str}),
                'series_id': ('series', 'series_id', {str_or_none}),
                'thumbnail': ('square_cover', {url_or_none}),
            }),
            'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info),
            'episode': episode_info.get('long_title'),
            'episode_id': episode_id,
            'episode_number': int_or_none(episode_info.get('title')) or episode_number,
            'season_id': str_or_none(season_id),
            'season_number': season_number,
            'timestamp': int_or_none(episode_info.get('pub_time')),
            'duration': float_or_none(play_info.get('timelength'), scale=1000),
            'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')),
            '__post_extractor': self.extract_comments(aid),
            'http_headers': headers,
        }
bd8f48c7 502
bd8f48c7 503
bdd0b75e 504class BiliBiliBangumiMediaIE(BilibiliBaseIE):
ad974876 505 _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)'
bd8f48c7 506 _TESTS = [{
ad974876 507 'url': 'https://www.bilibili.com/bangumi/media/md24097891',
bd8f48c7 508 'info_dict': {
ad974876 509 'id': '24097891',
bd8f48c7 510 },
ad974876 511 'playlist_mincount': 25,
bd8f48c7
YCH
512 }]
513
bd8f48c7 514 def _real_extract(self, url):
ad974876
L
515 media_id = self._match_id(url)
516 webpage = self._download_webpage(url, media_id)
bdd0b75e
GS
517 ss_id = self._search_json(
518 r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id']
519
520 return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id)
521
bd8f48c7 522
bdd0b75e
GS
class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
    """Playlist extractor for bangumi season pages (``/bangumi/play/ss<N>``)."""

    _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.bilibili.com/bangumi/play/ss26801',
        'info_dict': {
            'id': '26801'
        },
        'playlist_mincount': 26
    }]

    def _real_extract(self, url):
        """Return a playlist of the season's episodes, using the season id from the URL."""
        ss_id = self._match_id(url)
        episodes = self._get_episodes_from_season(ss_id, url)
        return self.playlist_result(episodes, ss_id)
4bc15a68
RA
537
538
2b9d0216
L
class BilibiliSpaceBaseIE(InfoExtractor):
    """Base for the space.bilibili.com playlist extractors; provides shared paging logic."""

    def _extract_playlist(self, fetch_page, get_metadata, get_entries):
        """Return ``(metadata, paged_list)`` for a paginated listing.

        ``fetch_page(idx)`` downloads page ``idx`` (0-based), ``get_metadata(page)``
        must return a dict with ``page_count`` and ``page_size``, and
        ``get_entries(page)`` yields the entries of a page. Page 0 is downloaded
        once here and reused for the first page of the list.
        """
        first_page = fetch_page(0)
        metadata = get_metadata(first_page)

        def page_entries(idx):
            # Index 0 is served from the already downloaded first page
            page = first_page if not idx else fetch_page(idx)
            return get_entries(page)

        paged_list = InAdvancePagedList(page_entries, metadata['page_count'], metadata['page_size'])
        return metadata, paged_list
549
550
class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
    """Playlist extractor for all videos uploaded by a user (space.bilibili.com/<id>[/video])."""

    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)(?P<video>/video)?/?(?:[?#]|$)'
    _TESTS = [{
        'url': 'https://space.bilibili.com/3985676/video',
        'info_dict': {
            'id': '3985676',
        },
        'playlist_mincount': 178,
    }, {
        'url': 'https://space.bilibili.com/313580179/video',
        'info_dict': {
            'id': '313580179',
        },
        'playlist_mincount': 92,
    }]

    # Order in which characters of img_key + sub_key are picked to form the signing key
    _SIGNATURE_POSITIONS = (
        46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39,
        12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63,
        57, 62, 11, 36, 20, 34, 44, 52
    )

    def _extract_signature(self, playlist_id):
        """Derive the 32-character key used to sign search requests.

        The two key halves are the basenames (without extension) of the
        ``wbi_img`` URLs returned by the nav endpoint; hard-coded fallbacks are
        used when they cannot be obtained.
        """
        session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False)

        def key_from_url(x):
            return x[x.rfind('/') + 1:].split('.')[0]

        img_key = traverse_obj(
            session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100'
        sub_key = traverse_obj(
            session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6'

        session_key = img_key + sub_key
        key_length = len(session_key)

        # Positions beyond the end of the combined key are skipped
        shuffled = ''.join(
            session_key[position] for position in self._SIGNATURE_POSITIONS if position < key_length)
        return shuffled[:32]

    def _real_extract(self, url):
        """Return a playlist with every video of the channel, newest first (``order=pubdate``)."""
        playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video')
        if not is_video_url:
            self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. '
                           'To download audios, add a "/audio" to the URL')

        signature = self._extract_signature(playlist_id)

        def fetch_page(page_idx):
            query = {
                'keyword': '',
                'mid': playlist_id,
                'order': 'pubdate',
                'order_avoided': 'true',
                'platform': 'web',
                'pn': page_idx + 1,
                'ps': 30,
                'tid': 0,
                'web_location': 1550101,
                'wts': int(time.time()),
            }
            # w_rid is the MD5 of the urlencoded query followed by the signing key
            signed_payload = f'{urllib.parse.urlencode(query)}{signature}'
            query['w_rid'] = hashlib.md5(signed_payload.encode()).hexdigest()

            try:
                response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search',
                                               playlist_id, note=f'Downloading page {page_idx}', query=query)
            except ExtractorError as e:
                if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412:
                    raise ExtractorError(
                        'Request is blocked by server (412), please add cookies, wait and try later.', expected=True)
                raise
            if response['code'] == -401:
                raise ExtractorError(
                    'Request is blocked by server (401), please add cookies, wait and try later.', expected=True)
            return response['data']

        def get_metadata(page_data):
            page_info = page_data['page']
            page_size = page_info['ps']
            entry_count = page_info['count']
            return {
                'page_count': math.ceil(entry_count / page_size),
                'page_size': page_size,
            }

        def get_entries(page_data):
            for entry in traverse_obj(page_data, ('list', 'vlist')) or []:
                bvid = entry['bvid']
                yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid)

        _, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
        return self.playlist_result(paged_list, playlist_id)
6efb0711 640
6efb0711 641
2b9d0216
L
class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE):
    """Playlist extractor for all audio tracks of a user (space.bilibili.com/<id>/audio)."""

    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio'
    _TESTS = [{
        'url': 'https://space.bilibili.com/313580179/audio',
        'info_dict': {
            'id': '313580179',
        },
        'playlist_mincount': 1,
    }]

    def _real_extract(self, url):
        """Return a playlist of the user's audio tracks."""
        playlist_id = self._match_id(url)

        def fetch_page(page_idx):
            response = self._download_json(
                'https://api.bilibili.com/audio/music-service/web/song/upper', playlist_id,
                note=f'Downloading page {page_idx}',
                query={'uid': playlist_id, 'pn': page_idx + 1, 'ps': 30, 'order': 1, 'jsonp': 'jsonp'})
            return response['data']

        def get_metadata(page_data):
            return {
                'page_count': page_data['pageCount'],
                'page_size': page_data['pageSize'],
            }

        def get_entries(page_data):
            for entry in page_data.get('data', []):
                audio_id = entry['id']
                yield self.url_result(f'https://www.bilibili.com/audio/au{audio_id}', BilibiliAudioIE, audio_id)

        _, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
        return self.playlist_result(paged_list, playlist_id)
673
674
class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
    """Playlist extractor for a user's collection (``/channel/collectiondetail?sid=<N>``)."""

    # The dot after "space" is escaped so that only the real host name matches
    # (an unescaped '.' would accept any character in that position).
    _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)'
    _TESTS = [{
        'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',
        'info_dict': {
            'id': '2142762_57445',
            'title': '《底特律 变人》'
        },
        'playlist_mincount': 31,
    }]

    def _real_extract(self, url):
        """Return the collection as a playlist with id ``<mid>_<sid>`` and the collection name as title."""
        mid, sid = self._match_valid_url(url).group('mid', 'sid')
        playlist_id = f'{mid}_{sid}'

        def fetch_page(page_idx):
            # The API numbers pages from 1 while page_idx is 0-based
            return self._download_json(
                'https://api.bilibili.com/x/polymer/space/seasons_archives_list',
                playlist_id, note=f'Downloading page {page_idx}',
                query={'mid': mid, 'season_id': sid, 'page_num': page_idx + 1, 'page_size': 30})['data']

        def get_metadata(page_data):
            page_size = page_data['page']['page_size']
            entry_count = page_data['page']['total']
            return {
                'page_count': math.ceil(entry_count / page_size),
                'page_size': page_size,
                'title': traverse_obj(page_data, ('meta', 'name'))
            }

        def get_entries(page_data):
            for entry in page_data.get('archives', []):
                yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}',
                                      BiliBiliIE, entry['bvid'])

        metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
        return self.playlist_result(paged_list, playlist_id, metadata['title'])
06167fbb 712
713
class BilibiliCategoryIE(InfoExtractor):
    """Playlist extractor for category listings (``/v/<category>/<subcategory>``)."""

    IE_NAME = 'Bilibili category extractor'
    _MAX_RESULTS = 1000000
    _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
    _TESTS = [{
        'url': 'https://www.bilibili.com/v/kichiku/mad',
        'info_dict': {
            'id': 'kichiku: mad',
            'title': 'kichiku: mad'
        },
        'playlist_mincount': 45,
        'params': {
            'playlistend': 45
        }
    }]

    def _fetch_page(self, api_url, num_pages, query, page_num):
        """Yield a url_result for every video on page ``page_num`` of the listing.

        Raises ExtractorError when the page contains no video list.
        """
        # NOTE(review): page_num is passed to the API as given by the paged list;
        # confirm whether the API expects a 0- or 1-based 'pn'.
        parsed_json = self._download_json(
            api_url, query, query={'Search_key': query, 'pn': page_num},
            note=f'Extracting results from page {page_num} of {num_pages}')

        video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list)
        if not video_list:
            raise ExtractorError(f'Failed to retrieve video list for page {page_num}')

        for video in video_list:
            yield self.url_result(
                f'https://www.bilibili.com/video/{video["bvid"]}', 'BiliBili', video['bvid'])

    def _entries(self, category, subcategory, query):
        """Return a lazily paged list of the videos in ``category``/``subcategory``.

        Raises ExtractorError for unsupported (sub)categories or when the paging
        information cannot be determined.
        """
        # map of categories : subcategories : RIDs
        rid_map = {
            'kichiku': {
                'mad': 26,
                'manual_vocaloid': 126,
                'guide': 22,
                'theatre': 216,
                'course': 127
            },
        }

        if category not in rid_map:
            raise ExtractorError(
                f'The category {category} isn\'t supported. Supported categories: {list(rid_map.keys())}')
        if subcategory not in rid_map[category]:
            raise ExtractorError(
                f'The subcategory {subcategory} isn\'t supported for this category. Supported subcategories: {list(rid_map[category].keys())}')
        rid_value = rid_map[category][subcategory]

        api_url = f'https://api.bilibili.com/x/web-interface/newlist?rid={rid_value}&type=1&ps=20&jsonp=jsonp'
        page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
        # traverse_obj returns None when the paging section is absent; default to an
        # empty dict so the ExtractorError below is raised instead of AttributeError.
        page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict) or {}
        count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
        if count is None or not size:
            raise ExtractorError('Failed to calculate either page count or size')

        num_pages = math.ceil(count / size)

        return OnDemandPagedList(functools.partial(
            self._fetch_page, api_url, num_pages, query), size)

    def _real_extract(self, url):
        """Return the category listing as a playlist titled ``'<category>: <subcategory>'``."""
        category, subcategory = urllib.parse.urlparse(url).path.split('/')[2:4]
        query = f'{category}: {subcategory}'

        return self.playlist_result(self._entries(category, subcategory, query), query, query)
780
781
class BiliBiliSearchIE(SearchInfoExtractor):
    """Search extractor for Bilibili videos (search key ``bilisearch``)."""

    IE_DESC = 'Bilibili video search'
    _MAX_RESULTS = 100000
    _SEARCH_KEY = 'bilisearch'

    def _search_results(self, query):
        """Yield a url_result per search hit, page by page, until a page has no results."""
        for page_num in itertools.count(1):
            request_params = {
                'Search_key': query,
                'keyword': query,
                'page': page_num,
                'context': '',
                'duration': 0,
                'tids_2': '',
                '__refresh__': 'true',
                'search_type': 'video',
                'tids': 0,
                'highlight': 1,
            }
            response = self._download_json(
                'https://api.bilibili.com/x/web-interface/search/type', query,
                note=f'Extracting results from page {page_num}', query=request_params)
            videos = response['data'].get('result')
            if not videos:
                return
            for video in videos:
                yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
06167fbb 807
808
4bc15a68
RA
class BilibiliAudioBaseIE(InfoExtractor):
    """Base for the Bilibili audio extractors; wraps the music-service web API."""

    def _call_api(self, path, sid, query=None):
        """Request ``path`` from the audio API and return the ``data`` member of the response.

        When ``query`` is empty or omitted, ``{'sid': sid}`` is sent instead.
        """
        params = query if query else {'sid': sid}
        response = self._download_json(
            'https://www.bilibili.com/audio/music-service-c/web/' + path,
            sid, query=params)
        return response['data']
816
817
class BilibiliAudioIE(BilibiliAudioBaseIE):
    """Extractor for a single audio track (``/audio/au<N>``)."""

    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
    _TEST = {
        'url': 'https://www.bilibili.com/audio/au1003142',
        'md5': 'fec4987014ec94ef9e666d4d158ad03b',
        'info_dict': {
            'id': '1003142',
            'ext': 'm4a',
            'title': '【tsukimi】YELLOW / 神山羊',
            'artist': 'tsukimi',
            'comment_count': int,
            'description': 'YELLOW的mp3版!',
            'duration': 183,
            'subtitles': {
                'origin': [{
                    'ext': 'lrc',
                }],
            },
            'thumbnail': r're:^https?://.+\.jpg',
            'timestamp': 1564836614,
            'upload_date': '20190803',
            'uploader': 'tsukimi-つきみぐー',
            'view_count': int,
        },
    }

    def _real_extract(self, url):
        """Return the info dict of one track: a single audio-only format plus song metadata."""
        au_id = self._match_id(url)

        play_data = self._call_api('url', au_id)
        # Only the first CDN URL is used; the page URL is sent as Referer with it
        formats = [{
            'url': play_data['cdns'][0],
            'filesize': int_or_none(play_data.get('size')),
            'vcodec': 'none',
            'http_headers': {'Referer': url},
        }]

        song = self._call_api('song/info', au_id)
        title = song['title']
        statistic = song.get('statistic') or {}

        lyric = song.get('lyric')
        subtitles = {'origin': [{'url': lyric}]} if lyric else None

        return {
            'id': au_id,
            'title': title,
            'formats': formats,
            'artist': song.get('author'),
            'comment_count': int_or_none(statistic.get('comment')),
            'description': song.get('intro'),
            'duration': int_or_none(song.get('duration')),
            'subtitles': subtitles,
            'thumbnail': song.get('cover'),
            'timestamp': int_or_none(song.get('passtime')),
            'uploader': song.get('uname'),
            'view_count': int_or_none(statistic.get('play')),
        }
886
887
class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
    """Extractor for audio menus/albums (`/audio/am<id>` URLs), returned as playlists."""

    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
    _TEST = {
        'url': 'https://www.bilibili.com/audio/am10624',
        'info_dict': {
            'id': '10624',
            'title': '每日新曲推荐(每日11:00更新)',
            'description': '每天11:00更新,为你推送最新音乐',
        },
        'playlist_count': 19,
    }

    def _real_extract(self, url):
        """Return a playlist of the menu's songs, titled from 'menu/info' when available."""
        am_id = self._match_id(url)

        # Only the first page (up to 100 songs) is requested.
        songs = self._call_api(
            'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']

        # Songs without an id are dropped.
        song_ids = (str_or_none(song.get('id')) for song in songs)
        entries = [
            self.url_result(
                'https://www.bilibili.com/audio/au' + sid,
                BilibiliAudioIE.ie_key(), sid)
            for sid in song_ids if sid]

        if not entries:
            return self.playlist_result(entries, am_id)

        album_data = self._call_api('menu/info', am_id) or {}
        album_title = album_data.get('title')
        if not album_title:
            return self.playlist_result(entries, am_id)

        for entry in entries:
            entry['album'] = album_title
        return self.playlist_result(
            entries, am_id, album_title, album_data.get('intro'))
63dce309
S
925
926
class BiliBiliPlayerIE(InfoExtractor):
    """Redirects embedded-player URLs (`player.html?aid=…`) to the BiliBili video extractor."""

    _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)'
    _TEST = {
        'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Hand the `aid` taken from the player URL over to BiliBiliIE."""
        video_id = self._match_id(url)
        target_url = 'http://www.bilibili.tv/video/av%s/' % video_id
        return self.url_result(
            target_url, ie=BiliBiliIE.ie_key(), video_id=video_id)
16f7e6be
AG
939
940
class BiliIntlBaseIE(InfoExtractor):
    """Shared API, subtitle, format and login helpers for the bilibili.tv / biliintl.com extractors."""

    _API_URL = 'https://api.bilibili.tv/intl/gateway'
    _NETRC_MACHINE = 'biliintl'

    def _call_api(self, endpoint, *args, **kwargs):
        """Download `_API_URL + endpoint` and return the 'data' member of the JSON reply.

        Positional and keyword arguments are forwarded to `_download_json`.
        A non-zero 'code' in the reply is mapped as follows:
        10004004 / 10004005 / 10023006 -> `raise_login_required()`,
        10004001 -> `raise_geo_restricted()`, anything else -> ExtractorError
        when the caller passed a truthy `fatal`, otherwise a warning.

        Returns None when the reply has no 'data' (or the download failed
        non-fatally).
        """
        # A non-fatal download can hand back a falsy value (see the
        # `if not sub_data` check in `_get_subtitles`); normalise it so the
        # lookups below cannot raise AttributeError.
        response = self._download_json(self._API_URL + endpoint, *args, **kwargs) or {}
        code = response.get('code')
        if code:
            if code in (10004004, 10004005, 10023006):
                self.raise_login_required()
            elif code == 10004001:
                self.raise_geo_restricted()
            else:
                errmsg = kwargs.get('errnote', 'Unable to download JSON metadata')
                message = response.get('message')
                # Skip the server message when it merely repeats the numeric code.
                if message and str(code) != message:
                    errmsg = f'{errmsg}: {self.IE_NAME} said: {message}'
                # NOTE: `fatal` has to be passed explicitly to get an exception here.
                if kwargs.get('fatal'):
                    raise ExtractorError(errmsg)
                self.report_warning(errmsg)
        return response.get('data')

    def json2srt(self, json):
        """Convert a subtitle JSON document (cues under 'body') into SRT text.

        Cues with empty content or without 'from'/'to' are skipped. A cue
        starting at 0 is kept: only a missing timestamp disqualifies a cue,
        not a falsy one.
        """
        cues = traverse_obj(json, (
            'body', lambda _, l: l['content'] and l['from'] is not None and l['to'] is not None))
        return '\n\n'.join(
            f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}'
            for i, line in enumerate(cues))

    def _get_subtitles(self, *, ep_id=None, aid=None):
        """Return a subtitles dict (language key -> list of SRT entries) for an episode or a video.

        Exactly one of `ep_id` / `aid` is expected; `filter_dict` is applied
        to the query so the unused one is presumably left out of the request.
        All downloads are non-fatal, so failures yield fewer (or no) subtitles.
        """
        video_id = ep_id or aid
        sub_json = self._call_api(
            '/web/v2/subtitle', video_id, fatal=False,
            note='Downloading subtitles list', errnote='Unable to download subtitles list',
            query=filter_dict({
                'platform': 'web',
                's_locale': 'en_US',
                'episode_id': ep_id,
                'aid': aid,
            })) or {}

        subtitles = {}
        for sub in sub_json.get('subtitles') or []:
            sub_url = sub.get('url')
            if not sub_url:
                continue
            lang = sub.get('lang')
            # The parentheses matter: without them the conditional applies to
            # the whole `%` expression and the note becomes '' when there is
            # no language name.
            sub_data = self._download_json(
                sub_url, video_id, errnote='Unable to download subtitles', fatal=False,
                note='Downloading subtitles%s' % (f' for {lang}' if lang else ''))
            if not sub_data:
                continue
            subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
                'ext': 'srt',
                'data': self.json2srt(sub_data)
            })
        return subtitles

    def _get_formats(self, *, ep_id=None, aid=None):
        """Return the list of video-only and audio-only formats for an episode or a video.

        An empty list is returned when the API reply carries no 'playurl'
        data (e.g. after a non-fatal API error), instead of failing on a
        None subscript.
        """
        video_json = self._call_api(
            '/web/playurl', ep_id or aid, note='Downloading video formats',
            errnote='Unable to download video formats', query=filter_dict({
                'platform': 'web',
                'ep_id': ep_id,
                'aid': aid,
            }))
        playurl = (video_json or {}).get('playurl') or {}

        formats = []
        for vid in playurl.get('video') or []:
            video_res = vid.get('video_resource') or {}
            video_info = vid.get('stream_info') or {}
            if not video_res.get('url'):
                continue
            formats.append({
                'url': video_res['url'],
                'ext': 'mp4',
                'format_note': video_info.get('desc_words'),
                'width': video_res.get('width'),
                'height': video_res.get('height'),
                'vbr': video_res.get('bandwidth'),
                'acodec': 'none',
                'vcodec': video_res.get('codecs'),
                'filesize': video_res.get('size'),
            })
        for aud in playurl.get('audio_resource') or []:
            if not aud.get('url'):
                continue
            formats.append({
                'url': aud['url'],
                'ext': 'mp4',
                'abr': aud.get('bandwidth'),
                'acodec': aud.get('codecs'),
                'vcodec': 'none',
                'filesize': aud.get('size'),
            })

        return formats

    def _parse_video_metadata(self, video_data):
        """Map an episode/video data dict to title, thumbnail and episode number.

        The episode number is parsed from a 'title_display' of the form
        'E<number>' or 'E<number> - …'; it is None when that pattern is absent.
        """
        display_title = video_data.get('title_display')
        return {
            'title': display_title or video_data.get('title'),
            'thumbnail': video_data.get('cover'),
            'episode_number': int_or_none(self._search_regex(
                r'^E(\d+)(?:$| - )', display_title or '', 'episode number', default=None)),
        }

    def _perform_login(self, username, password):
        """Log in with username/password via the passport.bilibili.tv web login.

        The password is prefixed with the server-provided hash, encrypted
        with the server-provided RSA key (PKCS#1 v1.5) and sent base64-encoded.

        Raises ExtractorError when the Cryptodome dependency is unavailable
        or when the login reply carries a non-zero 'code'.
        """
        if not Cryptodome.RSA:
            raise ExtractorError('pycryptodomex not found. Please install', expected=True)

        key_data = self._download_json(
            'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None,
            note='Downloading login key', errnote='Unable to download login key')['data']

        public_key = Cryptodome.RSA.importKey(key_data['key'])
        password_hash = Cryptodome.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
        login_post = self._download_json(
            'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
                'username': username,
                'password': base64.b64encode(password_hash).decode('ascii'),
                'keep_me': 'true',
                's_locale': 'en_US',
                'isTrusted': 'true'
            }), note='Logging in', errnote='Unable to log in')
        if login_post.get('code'):
            if login_post.get('message'):
                raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True)
            raise ExtractorError('Unable to log in')
1067
16f7e6be
AG
1068
class BiliIntlIE(BiliIntlBaseIE):
    """Extractor for single episodes (`/play/<season>/<episode>`) and videos (`/video/<aid>`) on bilibili.tv / biliintl.com."""

    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'
    _TESTS = [{
        # Bstation page
        'url': 'https://www.bilibili.tv/en/play/34613/341736',
        'info_dict': {
            'id': '341736',
            'ext': 'mp4',
            'title': 'E2 - The First Night',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 2,
            'upload_date': '20201009',
            'episode': 'Episode 2',
            'timestamp': 1602259500,
            'description': 'md5:297b5a17155eb645e14a14b385ab547e',
            'chapters': [{
                'start_time': 0,
                'end_time': 76.242,
                'title': '<Untitled Chapter 1>'
            }, {
                'start_time': 76.242,
                'end_time': 161.161,
                'title': 'Intro'
            }, {
                'start_time': 1325.742,
                'end_time': 1403.903,
                'title': 'Outro'
            }],
        }
    }, {
        # Non-Bstation page
        'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
        'info_dict': {
            'id': '11005006',
            'ext': 'mp4',
            'title': 'E3 - Who?',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 3,
            'description': 'md5:e1a775e71a35c43f141484715470ad09',
            'episode': 'Episode 3',
            'upload_date': '20211219',
            'timestamp': 1639928700,
            'chapters': [{
                'start_time': 0,
                'end_time': 88.0,
                'title': '<Untitled Chapter 1>'
            }, {
                'start_time': 88.0,
                'end_time': 156.0,
                'title': 'Intro'
            }, {
                'start_time': 1173.0,
                'end_time': 1259.535,
                'title': 'Outro'
            }],
        }
    }, {
        # Subtitle with empty content
        'url': 'https://www.bilibili.tv/en/play/1005144/10131790',
        'info_dict': {
            'id': '10131790',
            'ext': 'mp4',
            'title': 'E140 - Two Heartbeats: Kabuto\'s Trap',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 140,
        },
        'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
    }, {
        'url': 'https://www.bilibili.tv/en/video/2041863208',
        'info_dict': {
            'id': '2041863208',
            'ext': 'mp4',
            'timestamp': 1670874843,
            'description': 'Scheduled for April 2023.\nStudio: ufotable',
            'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
            'upload_date': '20221212',
            'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
        },
    }, {
        # episode comment extraction
        'url': 'https://www.bilibili.tv/en/play/34580/340317',
        'info_dict': {
            'id': '340317',
            'ext': 'mp4',
            'timestamp': 1604057820,
            'upload_date': '20201030',
            'episode_number': 5,
            'title': 'E5 - My Own Steel',
            'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2',
            'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode': 'Episode 5',
            'comment_count': int,
            'chapters': [{
                'start_time': 0,
                'end_time': 61.0,
                'title': '<Untitled Chapter 1>'
            }, {
                'start_time': 61.0,
                'end_time': 134.0,
                'title': 'Intro'
            }, {
                'start_time': 1290.0,
                'end_time': 1379.0,
                'title': 'Outro'
            }],
        },
        'params': {
            'getcomments': True
        }
    }, {
        # user generated content comment extraction
        'url': 'https://www.bilibili.tv/en/video/2045730385',
        'info_dict': {
            'id': '2045730385',
            'ext': 'mp4',
            'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
            'timestamp': 1667891924,
            'upload_date': '20221108',
            'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
            'comment_count': int,
            'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
        },
        'params': {
            'getcomments': True
        }
    }, {
        # episode id without intro and outro
        'url': 'https://www.bilibili.tv/en/play/1048837/11246489',
        'info_dict': {
            'id': '11246489',
            'ext': 'mp4',
            'title': 'E1 - Operation \'Strix\' <Owl>',
            'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17',
            'timestamp': 1649516400,
            'thumbnail': 'https://pic.bstarstatic.com/ogv/62cb1de23ada17fb70fbe7bdd6ff29c29da02a64.png',
            'episode': 'Episode 1',
            'episode_number': 1,
            'upload_date': '20220409',
        },
    }, {
        'url': 'https://www.biliintl.com/en/play/34613/341736',
        'only_matching': True,
    }, {
        # User-generated content (as opposed to a series licensed from a studio)
        'url': 'https://bilibili.tv/en/video/2019955076',
        'only_matching': True,
    }, {
        # No language in URL
        'url': 'https://www.bilibili.tv/video/2019955076',
        'only_matching': True,
    }, {
        # Uppercase language in URL
        'url': 'https://www.bilibili.tv/EN/video/2019955076',
        'only_matching': True,
    }]

    @staticmethod
    def _make_url(video_id, series_id=None):
        """Build the canonical English page URL for an episode (with `series_id`) or a video.

        Declared as a staticmethod so it works both as `BiliIntlIE._make_url(...)`
        and when called on an instance.
        """
        if series_id:
            return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}'
        return f'https://www.bilibili.tv/en/video/{video_id}'

    def _extract_video_metadata(self, url, video_id, season_id):
        """Collect title/thumbnail/episode metadata for a video.

        Metadata smuggled into the URL (with a 'title') is returned as is.
        Otherwise the webpage's embedded state is read; if that yields nothing
        and a `season_id` is known, the season's episode list is searched for
        `video_id`. JSON-LD and OpenGraph tags fill any remaining gaps.
        """
        url, smuggled_data = unsmuggle_url(url, {})
        if smuggled_data.get('title'):
            return smuggled_data

        webpage = self._download_webpage(url, video_id)
        # Bstation layout
        initial_data = (
            self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={})
            or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None))
        video_data = traverse_obj(
            initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) or {}

        if season_id and not video_data:
            # Non-Bstation layout, read through episode list
            season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
            video_data = traverse_obj(season_json, (
                'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id
            ), expected_type=dict, get_all=False)

        # XXX: webpage metadata may not be accurate; it is only a fallback so
        # that extraction does not crash when video_data was not found
        return merge_dicts(
            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
                'title': self._html_search_meta('og:title', webpage),
                'description': self._html_search_meta('og:description', webpage)
            })

    def _get_comments_reply(self, root_id, next_id=0, display_id=None):
        """Yield every reply to the root comment `root_id`, following the API cursor.

        Pagination is a loop rather than recursion, and it stops when the API
        reports the end, returns an empty page, or gives no next cursor, so a
        malformed reply cannot raise or recurse without bound.
        """
        while True:
            data = traverse_obj(self._download_json(
                'https://api.bilibili.tv/reply/web/detail', display_id,
                note=f'Downloading reply comment of {root_id} - {next_id}',
                query={
                    'platform': 'web',
                    'ps': 20,  # comment's reply per page (default: 3)
                    'root': root_id,
                    'next': next_id,
                }), 'data', expected_type=dict) or {}

            replies = traverse_obj(data, ('replies', ...), expected_type=dict)
            for reply in replies:
                yield {
                    'author': traverse_obj(reply, ('member', 'name')),
                    'author_id': traverse_obj(reply, ('member', 'mid')),
                    'author_thumbnail': traverse_obj(reply, ('member', 'face')),
                    'text': traverse_obj(reply, ('content', 'message')),
                    'id': reply.get('rpid'),
                    'like_count': int_or_none(reply.get('like_count')),
                    'parent': reply.get('parent'),
                    'timestamp': unified_timestamp(reply.get('ctime_text'))
                }

            next_id = traverse_obj(data, ('cursor', 'next'))
            if not replies or next_id is None or traverse_obj(data, ('cursor', 'is_end')):
                return

    def _get_comments(self, video_id, ep_id):
        """Yield root comments (each followed by its replies) for an episode or a video.

        Paging stops when the API reports the end or returns an empty page;
        the latter also covers replies without any cursor, which would
        otherwise loop forever.
        """
        for page_num in itertools.count(0):
            data = traverse_obj(self._download_json(
                'https://api.bilibili.tv/reply/web/root', video_id,
                note=f'Downloading comment page {page_num + 1}',
                query={
                    'platform': 'web',
                    'pn': page_num,  # page number
                    'ps': 20,  # comment per page (default: 20)
                    'oid': video_id,
                    'type': 3 if ep_id else 1,  # 1: user generated content, 3: series content
                    'sort_type': 1,  # 1: best, 2: recent
                }), 'data', expected_type=dict) or {}

            replies = traverse_obj(data, ('replies', ...), expected_type=dict)
            for reply in replies:
                yield {
                    'author': traverse_obj(reply, ('member', 'name')),
                    'author_id': traverse_obj(reply, ('member', 'mid')),
                    'author_thumbnail': traverse_obj(reply, ('member', 'face')),
                    'text': traverse_obj(reply, ('content', 'message')),
                    'id': reply.get('rpid'),
                    'like_count': int_or_none(reply.get('like_count')),
                    'timestamp': unified_timestamp(reply.get('ctime_text')),
                    'author_is_uploader': bool(traverse_obj(reply, ('member', 'type'))),
                }
                if reply.get('count'):
                    yield from self._get_comments_reply(reply.get('rpid'), display_id=video_id)

            if not replies or traverse_obj(data, ('cursor', 'is_end')):
                break

    def _real_extract(self, url):
        """Extract one episode or video: metadata, formats, subtitles, chapters and comments."""
        season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
        video_id = ep_id or aid
        chapters = None

        if ep_id:
            intro_ending_json = self._call_api(
                f'/web/v2/ogv/play/episode?episode_id={ep_id}&platform=web',
                video_id, fatal=False) or {}
            skip = intro_ending_json.get('skip')
            if skip:
                # FIXME: start and end times seem to be off by a few seconds even though
                # this matches the logic in ogv.*.js
                # ref: https://p.bstarstatic.com/fe-static/bstar-web-new/assets/ogv.2b147442.js
                # The API values are divided by 1000 to obtain the chapter times.
                chapters = [{
                    'start_time': float_or_none(traverse_obj(skip, 'opening_start_time'), 1000),
                    'end_time': float_or_none(traverse_obj(skip, 'opening_end_time'), 1000),
                    'title': 'Intro'
                }, {
                    'start_time': float_or_none(traverse_obj(skip, 'ending_start_time'), 1000),
                    'end_time': float_or_none(traverse_obj(skip, 'ending_end_time'), 1000),
                    'title': 'Outro'
                }]

        return {
            'id': video_id,
            **self._extract_video_metadata(url, video_id, season_id),
            'formats': self._get_formats(ep_id=ep_id, aid=aid),
            'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
            'chapters': chapters,
            '__post_extractor': self.extract_comments(video_id, ep_id)
        }
16f7e6be
AG
1345
1346
class BiliIntlSeriesIE(BiliIntlBaseIE):
    """Extractor for whole series (`/play/<id>` or `/media/<id>`), returned as a playlist of episodes."""

    IE_NAME = 'biliIntl:series'
    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(?:play|media)/(?P<id>\d+)/?(?:[?#]|$)'
    _TESTS = [{
        'url': 'https://www.bilibili.tv/en/play/34613',
        'playlist_mincount': 15,
        'info_dict': {
            'id': '34613',
            'title': 'TONIKAWA: Over the Moon For You',
            'description': 'md5:297b5a17155eb645e14a14b385ab547e',
            'categories': ['Slice of life', 'Comedy', 'Romance'],
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'view_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.bilibili.tv/en/media/1048837',
        'info_dict': {
            'id': '1048837',
            'title': 'SPY×FAMILY',
            'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17',
            'categories': ['Adventure', 'Action', 'Comedy'],
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.jpg$',
            'view_count': int,
        },
        'playlist_mincount': 25,
    }, {
        'url': 'https://www.biliintl.com/en/play/34613',
        'only_matching': True,
    }, {
        'url': 'https://www.biliintl.com/EN/play/34613',
        'only_matching': True,
    }]

    def _entries(self, series_id):
        """Yield one url_result per episode of the series.

        The episode metadata already present in the listing is smuggled into
        each URL, which lets BiliIntlIE skip the webpage download.
        """
        series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
        for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict):
            episode_id = str(episode['episode_id'])
            episode_url = smuggle_url(
                BiliIntlIE._make_url(episode_id, series_id),
                self._parse_video_metadata(episode))
            yield self.url_result(episode_url, BiliIntlIE, episode_id)

    def _real_extract(self, url):
        """Return the series playlist with title, description, categories, thumbnail and view count."""
        series_id = self._match_id(url)
        # `_call_api` returns None when the reply has no 'data' (e.g. after an
        # API error that was only reported as a warning), so guard before `.get`.
        season_data = self._call_api(
            f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id) or {}
        series_info = season_data.get('season') or {}
        return self.playlist_result(
            self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
            categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),
            thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))
b4f53662
H
1399
1400
class BiliLiveIE(InfoExtractor):
    """Extractor for live.bilibili.com rooms."""

    # The dots in the host name are escaped so that only the literal
    # 'live.bilibili.com' matches.
    _VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://live.bilibili.com/196',
        'info_dict': {
            'id': '33989',
            'description': "周六杂谈回,其他时候随机游戏。 | \n录播:@下播型泛式录播组。 | \n直播通知群(全员禁言):666906670,902092584,59971⑧481 (功能一样,别多加)",
            'ext': 'flv',
            'title': "太空狼人杀联动,不被爆杀就算赢",
            'thumbnail': "https://i0.hdslb.com/bfs/live/new_room_cover/e607bc1529057ef4b332e1026e62cf46984c314d.jpg",
            'timestamp': 1650802769,
        },
        'skip': 'not live'
    }, {
        'url': 'https://live.bilibili.com/196?broadcast_type=0&is_room_feed=1?spm_id_from=333.999.space_home.strengthen_live_card.click',
        'only_matching': True
    }, {
        'url': 'https://live.bilibili.com/blanc/196',
        'only_matching': True
    }]

    # Quality number ('qn') -> format id and display note
    _FORMATS = {
        80: {'format_id': 'low', 'format_note': '流畅'},
        150: {'format_id': 'high_res', 'format_note': '高清'},
        250: {'format_id': 'ultra_high_res', 'format_note': '超清'},
        400: {'format_id': 'blue_ray', 'format_note': '蓝光'},
        10000: {'format_id': 'source', 'format_note': '原画'},
        20000: {'format_id': '4K', 'format_note': '4K'},
        30000: {'format_id': 'dolby', 'format_note': '杜比'},
    }

    # Ranks a qn by its position in _FORMATS
    _quality = staticmethod(qualities(list(_FORMATS)))

    def _call_api(self, path, room_id, query):
        """Query api.live.bilibili.com and return the reply's 'data' (an empty dict when absent).

        Raises ExtractorError when the reply's 'code' is not 0.
        """
        api_result = self._download_json(f'https://api.live.bilibili.com/{path}', room_id, query=query)
        if api_result.get('code') != 0:
            raise ExtractorError(api_result.get('message') or 'Unable to download JSON metadata')
        return api_result.get('data') or {}

    def _parse_formats(self, qn, fmt):
        """Yield one format per URL of every codec in `fmt` whose current quality equals `qn`.

        Codecs whose 'current_qn' differs are skipped: the API reported a
        different quality than the one that was asked for. A codec without
        'url_info' simply contributes no formats.
        """
        for codec in fmt.get('codec') or []:
            if codec.get('current_qn') != qn:
                continue
            for url_info in codec.get('url_info') or []:
                yield {
                    'url': f'{url_info["host"]}{codec["base_url"]}{url_info["extra"]}',
                    'ext': fmt.get('format_name'),
                    'vcodec': codec.get('codec_name'),
                    'quality': self._quality(qn),
                    **self._FORMATS[qn],
                }

    def _real_extract(self, url):
        """Extract the live stream of a room; raises an expected ExtractorError when it is offline."""
        room_id = self._match_id(url)
        room_data = self._call_api('room/v1/Room/get_info', room_id, {'id': room_id})
        if room_data.get('live_status') == 0:
            raise ExtractorError('Streamer is not live', expected=True)

        formats = []
        # The play info is requested once per known quality number.
        for qn in self._FORMATS:
            stream_data = self._call_api('xlive/web-room/v2/index/getRoomPlayInfo', room_id, {
                'room_id': room_id,
                'qn': qn,
                'codec': '0,1',
                'format': '0,2',
                'mask': '0',
                'no_playurl': '0',
                'platform': 'web',
                'protocol': '0,1',
            })
            for fmt in traverse_obj(stream_data, ('playurl_info', 'playurl', 'stream', ..., 'format', ...)) or []:
                formats.extend(self._parse_formats(qn, fmt))

        return {
            'id': room_id,
            'title': room_data.get('title'),
            'description': room_data.get('description'),
            'thumbnail': room_data.get('user_cover'),
            # Taken from the reply to the last play-info request above
            'timestamp': stream_data.get('live_time'),
            'formats': formats,
            'is_live': True,
            'http_headers': {
                'Referer': url,
            },
        }