]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bilibili.py
[hls] Fix unapplied byte_range for EXT-X-MAP fragment
[yt-dlp.git] / yt_dlp / extractor / bilibili.py
CommitLineData
cfcf60ea 1import base64
04b32c8f 2import hashlib
6efb0711 3import itertools
c34f505b 4import functools
520e7533 5import re
c34f505b 6import math
28746fbd 7
06167fbb 8from .common import InfoExtractor, SearchInfoExtractor
bd8f48c7
YCH
9from ..compat import (
10 compat_parse_qs,
11 compat_urlparse,
c34f505b 12 compat_urllib_parse_urlparse
bd8f48c7 13)
28746fbd 14from ..utils import (
bd8f48c7 15 ExtractorError,
f5f15c99 16 filter_dict,
6461f2b7
YCH
17 int_or_none,
18 float_or_none,
f8580bf0 19 mimetype2ext,
bd8f48c7 20 parse_iso8601,
e88d44c6 21 traverse_obj,
c62ecf0d 22 parse_count,
bd8f48c7 23 smuggle_url,
efc947fb 24 srt_subtitles_timecode,
4bc15a68 25 str_or_none,
bd8f48c7 26 strip_jsonp,
04b32c8f 27 unified_timestamp,
bd8f48c7 28 unsmuggle_url,
1f85029d 29 urlencode_postdata,
c62ecf0d 30 url_or_none,
c34f505b 31 OnDemandPagedList
28746fbd
PH
32)
33
34
class BiliBiliIE(InfoExtractor):
    """Extractor for bilibili.com / bilibili.tv video pages (av/BV ids and bangumi episodes)."""

    _VALID_URL = r'''(?x)
                    https?://
                        (?:(?:www|bangumi)\.)?
                        bilibili\.(?:tv|com)/
                        (?:
                            (?:
                                video/[aA][vV]|
                                anime/(?P<anime_id>\d+)/play\#
                            )(?P<id>\d+)|
                            (s/)?video/[bB][vV](?P<id_bv>[^/?#&]+)
                        )
                        (?:/?\?p=(?P<page>\d+))?
                    '''

    _TESTS = [{
        'url': 'http://www.bilibili.com/video/av1074402/',
        'md5': '7ac275ec84a99a6552c5d229659a0fe1',
        'info_dict': {
            'id': '1074402_part1',
            'ext': 'mp4',
            'title': '【金坷垃】金泡沫',
            'uploader_id': '156160',
            'uploader': '菊子桑',
            'upload_date': '20140420',
            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
            'timestamp': 1398012678,
            'tags': ['顶上去报复社会', '该来的总会来的', '金克拉是检验歌曲的唯一标准', '坷垃教主', '金坷垃', '邓紫棋', '治愈系坷垃'],
            'bv_id': 'BV11x411K7CN',
            'cid': '1554319',
            'thumbnail': 'http://i2.hdslb.com/bfs/archive/c79a8cf0347cd7a897c53a2f756e96aead128e8c.jpg',
            'duration': 308.36,
        },
    }, {
        # Tested in BiliBiliBangumiIE
        'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
        'only_matching': True,
    }, {
        # bilibili.tv
        'url': 'http://www.bilibili.tv/video/av1074402/',
        'only_matching': True,
    }, {
        'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
        'md5': '3f721ad1e75030cc06faf73587cfec57',
        'info_dict': {
            'id': '100643_part1',
            'ext': 'mp4',
            'title': 'CHAOS;CHILD',
            'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
        },
        'skip': 'Geo-restricted to China',
    }, {
        'url': 'http://www.bilibili.com/video/av8903802/',
        'info_dict': {
            'id': '8903802_part1',
            'ext': 'mp4',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'upload_date': '20170301',
            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
            'timestamp': 1488382634,
            'uploader_id': '65880958',
            'uploader': '阿滴英文',
            'thumbnail': 'http://i2.hdslb.com/bfs/archive/49267ce20bc246be6304bf369a3ded0256854c23.jpg',
            'cid': '14694589',
            'duration': 554.117,
            'bv_id': 'BV13x41117TL',
            'tags': ['人文', '英语', '文化', '公开课', '阿滴英文'],
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # new BV video id format
        'url': 'https://www.bilibili.com/video/BV1JE411F741',
        'only_matching': True,
    }, {
        # Anthology
        'url': 'https://www.bilibili.com/video/BV1bK411W797',
        'info_dict': {
            'id': 'BV1bK411W797',
            'title': '物语中的人物是如何吐槽自己的OP的'
        },
        'playlist_count': 17,
    }, {
        # Correct matching of single and double quotes in title
        'url': 'https://www.bilibili.com/video/BV1NY411E7Rx/',
        'info_dict': {
            'id': '255513412_part1',
            'ext': 'mp4',
            'title': 'Vid"eo" Te\'st',
            'cid': '570602418',
            'thumbnail': 'http://i2.hdslb.com/bfs/archive/0c0de5a90b6d5b991b8dcc6cde0afbf71d564791.jpg',
            'upload_date': '20220408',
            'timestamp': 1649436552,
            'description': 'Vid"eo" Te\'st',
            'uploader_id': '1630758804',
            'bv_id': 'BV1NY411E7Rx',
            'duration': 60.394,
            'uploader': 'bili_31244483705',
            'tags': ['VLOG'],
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Key pair used to sign requests to the interface.bilibili.com playurl API
    _APP_KEY = 'iVGUTjsxvpLeuDCf'
    _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'

    def _report_error(self, result):
        """Raise an ExtractorError built from the failed API response ``result``.

        Prefers the response's ``message``, then its ``code``; always raises.
        """
        if 'message' in result:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
        elif 'code' in result:
            raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
        else:
            raise ExtractorError('Can\'t extract Bangumi episode ID')

    def _real_extract(self, url):
        """Extract a single video (or return an anthology playlist) for ``url``."""
        url, smuggled_data = unsmuggle_url(url, {})

        mobj = self._match_valid_url(url)
        video_id = mobj.group('id_bv') or mobj.group('id')

        av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None)
        video_id = av_id

        info = {}
        anime_id = mobj.group('anime_id')
        page_id = mobj.group('page')
        webpage = self._download_webpage(url, video_id)

        # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
        # If the video has no page argument, check to see if it's an anthology
        if page_id is None:
            if not self.get_param('noplaylist'):
                r = self._extract_anthology_entries(bv_id, video_id, webpage)
                if r is not None:
                    self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id)
                    return r
            else:
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

        if 'anime/' not in url:
            cid = None
            if page_id is not None:
                # Only look for the page-specific cid when a page was actually requested
                cid = self._search_regex(
                    r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid',
                    default=None)
            cid = cid or self._search_regex(
                r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
                default=None
            ) or compat_parse_qs(self._search_regex(
                [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
                 r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
                 r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
                webpage, 'player parameters'))['cid'][0]
        else:
            if 'no_bangumi_tip' not in smuggled_data:
                self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run yt-dlp with %s' % (
                    video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
            headers = {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': url
            }
            headers.update(self.geo_verification_headers())

            js = self._download_json(
                'http://bangumi.bilibili.com/web_api/get_source', video_id,
                data=urlencode_postdata({'episode_id': video_id}),
                headers=headers)
            if 'result' not in js:
                self._report_error(js)
            cid = js['result']['cid']

        headers = {
            'Accept': 'application/json',
            'Referer': url
        }
        headers.update(self.geo_verification_headers())

        # _parse_json(fatal=False) may yield nothing on malformed data, hence the `or {}`
        video_info = self._parse_json(
            self._search_regex(r'window.__playinfo__\s*=\s*({.+?})</script>', webpage, 'video info', default=None) or '{}',
            video_id, fatal=False) or {}
        video_info = video_info.get('data') or {}

        durl = traverse_obj(video_info, ('dash', 'video'))
        audios = traverse_obj(video_info, ('dash', 'audio')) or []

        RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
        for num, rendition in enumerate(RENDITIONS, start=1):
            payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
            sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
            if not video_info:
                video_info = self._download_json(
                    'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
                    video_id, note='Downloading video info page',
                    headers=headers, fatal=num == len(RENDITIONS))
                if not video_info:
                    continue

            if not durl and 'durl' not in video_info:
                if num < len(RENDITIONS):
                    # Drop the unusable response so that the next rendition is
                    # actually requested instead of re-checking the same data
                    video_info = {}
                    continue
                self._report_error(video_info)

            formats = []
            for stream in durl or video_info['durl']:
                formats.append({
                    'url': stream.get('baseUrl') or stream.get('base_url') or stream.get('url'),
                    'ext': mimetype2ext(stream.get('mimeType') or stream.get('mime_type')),
                    'fps': int_or_none(stream.get('frameRate') or stream.get('frame_rate')),
                    'width': int_or_none(stream.get('width')),
                    'height': int_or_none(stream.get('height')),
                    'vcodec': stream.get('codecs'),
                    'acodec': 'none' if audios else None,
                    'tbr': float_or_none(stream.get('bandwidth'), scale=1000),
                    'filesize': int_or_none(stream.get('size')),
                })
                for backup_url in traverse_obj(stream, 'backup_url', expected_type=list) or []:
                    formats.append({
                        'url': backup_url,
                        'quality': -2 if 'hd.mp4' in backup_url else -3,
                    })

            for audio in audios:
                formats.append({
                    'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'),
                    'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')),
                    'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')),
                    'width': int_or_none(audio.get('width')),
                    'height': int_or_none(audio.get('height')),
                    'acodec': audio.get('codecs'),
                    'vcodec': 'none',
                    'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
                    'filesize': int_or_none(audio.get('size'))
                })
                for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []:
                    formats.append({
                        'url': backup_url,
                        # backup URLs have lower priorities
                        'quality': -3,
                    })

            # 'id' and 'duration' are set unconditionally further below, so
            # only the format-related fields are recorded here
            info.update({
                'formats': formats,
                'http_headers': {
                    'Referer': url,
                },
            })
            break

        self._sort_formats(formats)

        title = self._html_search_regex((
            r'<h1[^>]+title=(["])(?P<content>[^"]+)',
            r'<h1[^>]+title=([\'])(?P<content>[^\']+)',
            r'(?s)<h1[^>]*>(?P<content>.+?)</h1>',
            self._meta_regex('title')
        ), webpage, 'title', group='content', fatal=False)

        # Get part title for anthologies
        if page_id is not None:
            # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video.
            part_info = traverse_obj(self._download_json(
                f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
                video_id, note='Extracting videos in anthology'), 'data', expected_type=list) or []
            # A single-part video keeps the page title; otherwise prefer the part's own name
            if len(part_info) != 1:
                title = traverse_obj(part_info, (int(page_id) - 1, 'part')) or title

        description = self._html_search_meta('description', webpage)
        timestamp = unified_timestamp(self._html_search_regex(
            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
            default=None) or self._html_search_meta(
            'uploadDate', webpage, 'timestamp', default=None))
        thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)

        # TODO 'view_count' requires deobfuscating Javascript
        info.update({
            'id': f'{video_id}_part{page_id or 1}',
            'cid': cid,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'thumbnail': thumbnail,
            'duration': float_or_none(video_info.get('timelength'), scale=1000),
        })

        uploader_mobj = re.search(
            r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<',
            webpage)
        if uploader_mobj:
            info.update({
                'uploader': uploader_mobj.group('name').strip(),
                'uploader_id': uploader_mobj.group('id'),
            })

        if not info.get('uploader'):
            info['uploader'] = self._html_search_meta(
                'author', webpage, 'uploader', default=None)

        top_level_info = {
            'tags': traverse_obj(self._download_json(
                f'https://api.bilibili.com/x/tag/archive/tags?aid={video_id}',
                video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')),
        }

        # Danmaku comments are exposed as raw XML. Converting them to ASS would require
        # https://github.com/m13253/danmaku2ass which is licenced under GPL3
        # (see https://github.com/animelover1984/youtube-dl)
        info['subtitles'] = {
            'danmaku': [{
                'ext': 'xml',
                'url': f'https://comment.bilibili.com/{cid}.xml',
            }]
        }

        top_level_info['__post_extractor'] = self.extract_comments(video_id)

        # Keys in `info` (including 'id') take precedence over the defaults listed first
        return {
            'id': str(video_id),
            'bv_id': bv_id,
            'title': title,
            'description': description,
            **info, **top_level_info
        }

    def _extract_anthology_entries(self, bv_id, video_id, webpage):
        """Return a playlist of the video's parts, or None if the pagelist API reports none."""
        title = self._html_search_regex(
            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>',
             r'<title>(?P<title>.+?)</title>'), webpage, 'title',
            group='title')
        json_data = self._download_json(
            f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
            video_id, note='Extracting videos in anthology')

        if json_data['data']:
            return self.playlist_from_matches(
                json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(),
                getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page']))

    def _get_video_id_set(self, id, is_bv):
        """Resolve ``id`` through the view API and return the ``(aid, bvid)`` pair.

        ``is_bv`` selects whether ``id`` is sent as ``bvid`` or as ``aid``.
        Raises ExtractorError when the API answers with a non-zero code.
        """
        query = {'bvid': id} if is_bv else {'aid': id}
        response = self._download_json(
            "http://api.bilibili.cn/x/web-interface/view",
            id, query=query,
            note='Grabbing original ID via API')

        if response['code'] == -400:
            raise ExtractorError('Video ID does not exist', expected=True, video_id=id)
        elif response['code'] != 0:
            raise ExtractorError(f'Unknown error occurred during API check (code {response["code"]})',
                                 expected=True, video_id=id)
        return response['data']['aid'], response['data']['bvid']

    def _get_comments(self, video_id, commentPageNumber=0):
        """Yield comment dicts page by page until a page has no replies.

        ``commentPageNumber`` is accepted but not used by this implementation.
        """
        for idx in itertools.count(1):
            replies = traverse_obj(
                self._download_json(
                    f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
                    video_id, note=f'Extracting comments from page {idx}', fatal=False),
                ('data', 'replies'))
            if not replies:
                return
            for children in map(self._get_all_children, replies):
                yield from children

    def _get_all_children(self, reply):
        """Yield ``reply`` as a comment dict, followed recursively by its nested replies."""
        yield {
            'author': traverse_obj(reply, ('member', 'uname')),
            'author_id': traverse_obj(reply, ('member', 'mid')),
            'id': reply.get('rpid'),
            'text': traverse_obj(reply, ('content', 'message')),
            'timestamp': reply.get('ctime'),
            'parent': reply.get('parent') or 'root',
        }
        for children in map(self._get_all_children, reply.get('replies') or []):
            yield from children
06167fbb 435
bd8f48c7
YCH
436
class BiliBiliBangumiIE(InfoExtractor):
    """Playlist extractor for bangumi.bilibili.com anime (season) pages."""

    _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'

    IE_NAME = 'bangumi.bilibili.com'
    IE_DESC = 'BiliBili番剧'

    _TESTS = [{
        'url': 'http://bangumi.bilibili.com/anime/1869',
        'info_dict': {
            'id': '1869',
            'title': '混沌武士',
            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
        },
        'playlist_count': 26,
    }, {
        'url': 'http://bangumi.bilibili.com/anime/1869',
        'info_dict': {
            'id': '1869',
            'title': '混沌武士',
            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
        },
        'playlist': [{
            'md5': '91da8621454dd58316851c27c68b0c13',
            'info_dict': {
                'id': '40062',
                'ext': 'mp4',
                'title': '混沌武士',
                'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...',
                'timestamp': 1414538739,
                'upload_date': '20141028',
                'episode': '疾风怒涛 Tempestuous Temperaments',
                'episode_number': 1,
            },
        }],
        'params': {
            'playlist_items': '1',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Defer to BiliBiliIE for URLs it matches (e.g. single episode pages)."""
        return False if BiliBiliIE.suitable(url) else super().suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one url_transparent entry per episode of the season."""
        bangumi_id = self._match_id(url)

        # Sometimes this API returns a JSONP response
        season_info = self._download_json(
            'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
            bangumi_id, transform_source=strip_jsonp)['result']

        entries = [{
            '_type': 'url_transparent',
            'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
            'ie_key': BiliBiliIE.ie_key(),
            'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
            'episode': episode.get('index_title'),
            'episode_number': int_or_none(episode.get('index')),
        } for episode in season_info['episodes']]

        # Episodes whose index is not numeric have episode_number None; comparing
        # None with int raises TypeError, so sort those after the numbered ones
        entries.sort(key=lambda entry: (
            entry['episode_number'] is None, entry['episode_number'] or 0))

        return self.playlist_result(
            entries, bangumi_id,
            season_info.get('bangumi_title'), season_info.get('evaluate'))
4bc15a68
RA
502
503
class BilibiliChannelIE(InfoExtractor):
    """Playlist extractor for the uploads of a space.bilibili.com user."""

    # The dot after "space" is escaped so that only the real host name matches
    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)'
    _API_URL = "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%d&jsonp=jsonp"
    _TESTS = [{
        'url': 'https://space.bilibili.com/3985676/video',
        'info_dict': {},
        'playlist_mincount': 112,
    }]

    def _entries(self, list_id):
        """Yield a url_result per video, paging until a page is empty or the reported count is reached."""
        count, max_count = 0, None

        for page_num in itertools.count(1):
            data = self._download_json(
                self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data']

            # Keep the first total that the API reports
            max_count = max_count or traverse_obj(data, ('page', 'count'))

            entries = traverse_obj(data, ('list', 'vlist'))
            if not entries:
                return
            for entry in entries:
                yield self.url_result(
                    'https://www.bilibili.com/video/%s' % entry['bvid'],
                    BiliBiliIE.ie_key(), entry['bvid'])

            count += len(entries)
            if max_count and count >= max_count:
                return

    def _real_extract(self, url):
        """Return the (lazily evaluated) playlist for the channel id found in ``url``."""
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id), list_id)
06167fbb 537
538
class BilibiliCategoryIE(InfoExtractor):
    """Playlist extractor for bilibili.com/v/<category>/<subcategory> listings."""

    IE_NAME = 'Bilibili category extractor'
    _MAX_RESULTS = 1000000
    _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
    _TESTS = [{
        'url': 'https://www.bilibili.com/v/kichiku/mad',
        'info_dict': {
            'id': 'kichiku: mad',
            'title': 'kichiku: mad'
        },
        'playlist_mincount': 45,
        'params': {
            'playlistend': 45
        }
    }]

    def _fetch_page(self, api_url, num_pages, query, page_num):
        """Yield a url_result for every video on page ``page_num`` of ``api_url``.

        Raises ExtractorError when the response carries no video list.
        """
        parsed_json = self._download_json(
            api_url, query, query={'Search_key': query, 'pn': page_num},
            note='Extracting results from page %s of %s' % (page_num, num_pages))

        video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list)
        if not video_list:
            raise ExtractorError('Failed to retrieve video list for page %d' % page_num)

        for video in video_list:
            yield self.url_result(
                'https://www.bilibili.com/video/%s' % video['bvid'], 'BiliBili', video['bvid'])

    def _entries(self, category, subcategory, query):
        """Return an OnDemandPagedList over the videos of the given subcategory.

        Raises ExtractorError for unsupported (sub)categories or when the
        paging information cannot be read from the first API response.
        """
        # map of categories : subcategories : RIDs
        rid_map = {
            'kichiku': {
                'mad': 26,
                'manual_vocaloid': 126,
                'guide': 22,
                'theatre': 216,
                'course': 127
            },
        }

        if category not in rid_map:
            raise ExtractorError(
                f'The category {category} isn\'t supported. Supported categories: {list(rid_map.keys())}')
        if subcategory not in rid_map[category]:
            raise ExtractorError(
                f'The subcategory {subcategory} isn\'t supported for this category. Supported subcategories: {list(rid_map[category].keys())}')
        rid_value = rid_map[category][subcategory]

        api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
        page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
        # traverse_obj yields None when the paging dict is absent; fall back to an
        # empty dict so that the explicit error below is raised instead of AttributeError
        page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict) or {}
        count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
        if count is None or not size:
            raise ExtractorError('Failed to calculate either page count or size')

        num_pages = math.ceil(count / size)

        return OnDemandPagedList(functools.partial(
            self._fetch_page, api_url, num_pages, query), size)

    def _real_extract(self, url):
        """Build the playlist; both its id and its title are '<category>: <subcategory>'."""
        u = compat_urllib_parse_urlparse(url)
        category, subcategory = u.path.split('/')[2:4]
        query = '%s: %s' % (category, subcategory)

        return self.playlist_result(self._entries(category, subcategory, query), query, query)
606
607
class BiliBiliSearchIE(SearchInfoExtractor):
    """Search extractor backed by the bilibili web search API."""

    IE_DESC = 'Bilibili video search'
    _MAX_RESULTS = 100000
    _SEARCH_KEY = 'bilisearch'

    def _search_results(self, query):
        """Yield a url_result per search hit, newest first, until a page comes back empty."""
        for page_num in itertools.count(1):
            videos = self._download_json(
                'https://api.bilibili.com/x/web-interface/search/type', query,
                note=f'Extracting results from page {page_num}', query={
                    'Search_key': query,
                    'keyword': query,
                    'page': page_num,
                    'context': '',
                    'order': 'pubdate',
                    'duration': 0,
                    'tids_2': '',
                    '__refresh__': 'true',
                    'search_type': 'video',
                    'tids': 0,
                    'highlight': 1,
                })['data'].get('result')
            # Without this check the generator would keep requesting pages
            # forever once the results are exhausted
            if not videos:
                return
            for video in videos:
                yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
06167fbb 632
633
4bc15a68
RA
class BilibiliAudioBaseIE(InfoExtractor):
    """Shared helper for extractors that use the bilibili audio web API."""

    def _call_api(self, path, sid, query=None):
        """Request ``path`` from the audio web API and return the response's ``data`` member.

        When ``query`` is empty or omitted, ``{'sid': sid}`` is sent instead.
        """
        params = query if query else {'sid': sid}
        response = self._download_json(
            'https://www.bilibili.com/audio/music-service-c/web/' + path,
            sid, query=params)
        return response['data']
641
642
class BilibiliAudioIE(BilibiliAudioBaseIE):
    """Extractor for single bilibili audio tracks (``/audio/au<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
    _TEST = {
        'url': 'https://www.bilibili.com/audio/au1003142',
        'md5': 'fec4987014ec94ef9e666d4d158ad03b',
        'info_dict': {
            'id': '1003142',
            'ext': 'm4a',
            'title': '【tsukimi】YELLOW / 神山羊',
            'artist': 'tsukimi',
            'comment_count': int,
            'description': 'YELLOW的mp3版!',
            'duration': 183,
            'subtitles': {
                'origin': [{
                    'ext': 'lrc',
                }],
            },
            'thumbnail': r're:^https?://.+\.jpg',
            'timestamp': 1564836614,
            'upload_date': '20190803',
            'uploader': 'tsukimi-つきみぐー',
            'view_count': int,
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the track: one audio-only format plus song metadata."""
        au_id = self._match_id(url)

        # The stream location comes from the 'url' endpoint; only the first CDN is used
        stream = self._call_api('url', au_id)
        audio_format = {
            'url': stream['cdns'][0],
            'filesize': int_or_none(stream.get('size')),
            'vcodec': 'none'
        }

        song = self._call_api('song/info', au_id)
        title = song['title']
        stats = song.get('statistic') or {}

        # Lyrics, when present, are exposed as the 'origin' subtitle track
        lyric_url = song.get('lyric')
        subtitles = {'origin': [{'url': lyric_url}]} if lyric_url else None

        return {
            'id': au_id,
            'title': title,
            'formats': [audio_format],
            'artist': song.get('author'),
            'comment_count': int_or_none(stats.get('comment')),
            'description': song.get('intro'),
            'duration': int_or_none(song.get('duration')),
            'subtitles': subtitles,
            'thumbnail': song.get('cover'),
            'timestamp': int_or_none(song.get('passtime')),
            'uploader': song.get('uname'),
            'view_count': int_or_none(stats.get('play')),
        }
706
707
class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
    """Playlist extractor for bilibili audio menus (``/audio/am<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
    _TEST = {
        'url': 'https://www.bilibili.com/audio/am10624',
        'info_dict': {
            'id': '10624',
            'title': '每日新曲推荐(每日11:00更新)',
            'description': '每天11:00更新,为你推送最新音乐',
        },
        'playlist_count': 19,
    }

    def _real_extract(self, url):
        """Return a playlist of the menu's songs, tagged with the album title when one is known."""
        am_id = self._match_id(url)

        songs = self._call_api(
            'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']

        # Songs without a usable id are skipped
        song_ids = (str_or_none(song.get('id')) for song in songs)
        entries = [
            self.url_result(
                'https://www.bilibili.com/audio/au' + sid,
                BilibiliAudioIE.ie_key(), sid)
            for sid in song_ids if sid
        ]
        if not entries:
            return self.playlist_result(entries, am_id)

        album_data = self._call_api('menu/info', am_id) or {}
        album_title = album_data.get('title')
        if not album_title:
            return self.playlist_result(entries, am_id)

        for entry in entries:
            entry['album'] = album_title
        return self.playlist_result(
            entries, am_id, album_title, album_data.get('intro'))
63dce309
S
745
746
class BiliBiliPlayerIE(InfoExtractor):
    """Redirects embedded player URLs (``player.html?aid=...``) to BiliBiliIE."""

    _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)'
    _TEST = {
        'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Hand the ``aid`` from the player URL over to BiliBiliIE as an av video URL."""
        aid = self._match_id(url)
        target = 'http://www.bilibili.tv/video/av%s/' % aid
        return self.url_result(target, ie=BiliBiliIE.ie_key(), video_id=aid)
16f7e6be
AG
759
760
class BiliIntlBaseIE(InfoExtractor):
    """Common API, subtitle, format and login helpers for the bilibili.tv (intl) extractors."""

    _API_URL = 'https://api.bilibili.tv/intl/gateway'
    _NETRC_MACHINE = 'biliintl'

    def _call_api(self, endpoint, *args, **kwargs):
        """Call ``endpoint`` of the intl gateway and return the response's ``data`` member.

        Extra arguments are passed on to ``_download_json``. A non-zero ``code``
        triggers ``raise_login_required`` / ``raise_geo_restricted`` for the
        codes listed below; any other code raises ExtractorError when
        ``fatal`` was passed as true and is reported as a warning otherwise.
        """
        # A non-fatal download may not return a dict; treat that as an empty response
        json = self._download_json(self._API_URL + endpoint, *args, **kwargs) or {}
        if json.get('code'):
            if json['code'] in (10004004, 10004005, 10023006):
                self.raise_login_required()
            elif json['code'] == 10004001:
                self.raise_geo_restricted()
            else:
                if json.get('message') and str(json['code']) != json['message']:
                    errmsg = f'{kwargs.get("errnote", "Unable to download JSON metadata")}: {self.IE_NAME} said: {json["message"]}'
                else:
                    errmsg = kwargs.get('errnote', 'Unable to download JSON metadata')
                if kwargs.get('fatal'):
                    raise ExtractorError(errmsg)
                else:
                    self.report_warning(errmsg)
        return json.get('data')

    def json2srt(self, json):
        """Convert a subtitle JSON document (``body`` list of from/to/content) to SRT text.

        Lines without content are dropped; numbering follows the position in ``body``.
        """
        data = '\n\n'.join(
            f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}'
            for i, line in enumerate(json['body']) if line.get('content'))
        return data

    def _get_subtitles(self, *, ep_id=None, aid=None):
        """Download all subtitles for the episode/video and return them as SRT, keyed by language."""
        # _call_api returns None when the API gives no data; fall back to an empty dict
        sub_json = self._call_api(
            '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list',
            errnote='Unable to download subtitles list', query=filter_dict({
                'platform': 'web',
                'episode_id': ep_id,
                'aid': aid,
            })) or {}
        subtitles = {}
        for sub in sub_json.get('subtitles') or []:
            sub_url = sub.get('url')
            if not sub_url:
                continue
            # Parenthesised: `%` binds tighter than the conditional expression, which
            # used to turn the whole note into '' whenever the language was missing
            sub_data = self._download_json(
                sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,
                note='Downloading subtitles%s' % (f' for {sub["lang"]}' if sub.get('lang') else ''))
            if not sub_data:
                continue
            subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
                'ext': 'srt',
                'data': self.json2srt(sub_data)
            })
        return subtitles

    def _get_formats(self, *, ep_id=None, aid=None):
        """Return the sorted list of video-only and audio-only formats from the playurl API."""
        video_json = self._call_api(
            '/web/playurl', ep_id or aid, note='Downloading video formats',
            errnote='Unable to download video formats', query=filter_dict({
                'platform': 'web',
                'ep_id': ep_id,
                'aid': aid,
            }))
        # Guard against a missing response or missing 'playurl' so that an empty
        # format list is produced instead of a TypeError/KeyError
        video_json = (video_json or {}).get('playurl') or {}
        formats = []
        for vid in video_json.get('video') or []:
            video_res = vid.get('video_resource') or {}
            video_info = vid.get('stream_info') or {}
            if not video_res.get('url'):
                continue
            formats.append({
                'url': video_res['url'],
                'ext': 'mp4',
                'format_note': video_info.get('desc_words'),
                'width': video_res.get('width'),
                'height': video_res.get('height'),
                'vbr': video_res.get('bandwidth'),
                'acodec': 'none',
                'vcodec': video_res.get('codecs'),
                'filesize': video_res.get('size'),
            })
        for aud in video_json.get('audio_resource') or []:
            if not aud.get('url'):
                continue
            formats.append({
                'url': aud['url'],
                'ext': 'mp4',
                'abr': aud.get('bandwidth'),
                'acodec': aud.get('codecs'),
                'vcodec': 'none',
                'filesize': aud.get('size'),
            })

        self._sort_formats(formats)
        return formats

    def _extract_video_info(self, video_data, *, ep_id=None, aid=None):
        """Build the info dict for one episode/video from ``video_data`` plus formats and subtitles."""
        return {
            'id': ep_id or aid,
            'title': video_data.get('title_display') or video_data.get('title'),
            'thumbnail': video_data.get('cover'),
            # The episode number is taken from display titles of the form "E<n>" or "E<n> - ..."
            'episode_number': int_or_none(self._search_regex(
                r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
            'formats': self._get_formats(ep_id=ep_id, aid=aid),
            'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid),
            'extractor_key': BiliIntlIE.ie_key(),
        }

    def _perform_login(self, username, password):
        """Log in with ``username``/``password``.

        The password is prefixed with the server-provided hash, encrypted with
        the server-provided RSA key (PKCS#1 v1.5) and sent base64-encoded.
        Raises ExtractorError when no Crypto library is importable or when the
        login response carries a non-zero code.
        """
        try:
            from Cryptodome.PublicKey import RSA
            from Cryptodome.Cipher import PKCS1_v1_5
        except ImportError:
            try:
                from Crypto.PublicKey import RSA
                from Crypto.Cipher import PKCS1_v1_5
            except ImportError:
                raise ExtractorError('pycryptodomex not found. Please install', expected=True)

        key_data = self._download_json(
            'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None,
            note='Downloading login key', errnote='Unable to download login key')['data']

        public_key = RSA.importKey(key_data['key'])
        password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
        login_post = self._download_json(
            'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
                'username': username,
                'password': base64.b64encode(password_hash).decode('ascii'),
                'keep_me': 'true',
                's_locale': 'en_US',
                'isTrusted': 'true'
            }), note='Logging in', errnote='Unable to log in')
        if login_post.get('code'):
            if login_post.get('message'):
                raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True)
            else:
                raise ExtractorError('Unable to log in')
896
16f7e6be
AG
897
class BiliIntlIE(BiliIntlBaseIE):
    """Extractor for single videos on bilibili.tv / biliintl.com.

    Handles episode pages (``play/<season_id>/<ep_id>``) and standalone
    video pages (``video/<aid>``), with an optional two-letter language
    prefix in the path.
    """
    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'
    _TESTS = [{
        # Bstation page
        'url': 'https://www.bilibili.tv/en/play/34613/341736',
        'info_dict': {
            'id': '341736',
            'ext': 'mp4',
            'title': 'E2 - The First Night',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 2,
        }
    }, {
        # Non-Bstation page
        'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
        'info_dict': {
            'id': '11005006',
            'ext': 'mp4',
            'title': 'E3 - Who?',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 3,
        }
    }, {
        # Subtitle with empty content
        'url': 'https://www.bilibili.tv/en/play/1005144/10131790',
        'info_dict': {
            'id': '10131790',
            'ext': 'mp4',
            'title': 'E140 - Two Heartbeats: Kabuto\'s Trap',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 140,
        },
        'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
    }, {
        'url': 'https://www.biliintl.com/en/play/34613/341736',
        'only_matching': True,
    }, {
        # User-generated content (as opposed to a series licensed from a studio)
        'url': 'https://bilibili.tv/en/video/2019955076',
        'only_matching': True,
    }, {
        # No language in URL
        'url': 'https://www.bilibili.tv/video/2019955076',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract one video, preferring metadata embedded in the webpage.

        Metadata is read from the page's ``window.__INITIAL_DATA__`` /
        ``window.__INITIAL_STATE__`` blob when present; for episode URLs
        without it, the season's episode list is searched for ``ep_id``.
        """
        season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
        video_id = ep_id or aid
        webpage = self._download_webpage(url, video_id)
        # Bstation layout
        initial_data = self._parse_json(self._search_regex(
            r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage,
            'preload state', default='{}'), video_id, fatal=False) or {}
        # Licensed episodes are looked up under OgvVideo.epDetail and, failing
        # that, UgcVideo.videoData is tried.
        video_data = (
            traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
            or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {})

        if season_id and not video_data:
            # Non-Bstation layout, read through episode list
            season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
            video_data = traverse_obj(
                season_json,
                ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id),
                expected_type=dict, get_all=False)

        # The episode-list lookup yields None when no entry matches ep_id;
        # fall back to an empty mapping so metadata lookups do not raise
        # AttributeError and extraction can still proceed by id.
        return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid)
16f7e6be
AG
963
964
class BiliIntlSeriesIE(BiliIntlBaseIE):
    """Playlist extractor for a whole series (season) on bilibili.tv / biliintl.com."""
    # A trailing slash, query string or fragment after the series id is
    # accepted; episode URLs (play/<season>/<episode>) still do not match.
    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)/?(?:[?#]|$)'
    _TESTS = [{
        'url': 'https://www.bilibili.tv/en/play/34613',
        'playlist_mincount': 15,
        'info_dict': {
            'id': '34613',
            'title': 'Fly Me to the Moon',
            'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627',
            'categories': ['Romance', 'Comedy', 'Slice of life'],
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'view_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.biliintl.com/en/play/34613',
        'only_matching': True,
    }, {
        # Trailing slash after the series id
        'url': 'https://www.bilibili.tv/en/play/34613/',
        'only_matching': True,
    }]

    def _entries(self, series_id):
        """Yield an info dict for every episode listed for *series_id*."""
        series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
        for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
            episode_id = str_or_none(episode.get('episode_id'))
            if not episode_id:
                # Without an id there is nothing to request formats for;
                # str(None) would otherwise be used as the literal id 'None'.
                continue
            yield self._extract_video_info(episode, ep_id=episode_id)

    def _real_extract(self, url):
        """Return a playlist of the series' episodes with series-level metadata."""
        series_id = self._match_id(url)
        series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {}
        return self.playlist_result(
            self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
            categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),
            thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))