]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/abematv.py
[extractors/rtvc] Add extractors (#6578)
[yt-dlp.git] / yt_dlp / extractor / abematv.py
CommitLineData
f8271158 1import base64
2import binascii
bc83b4b0 3import functools
3e9b66d7
LNO
4import hashlib
5import hmac
f8271158 6import io
7import json
3e9b66d7
LNO
8import re
9import struct
f8271158 10import time
14f25df2 11import urllib.parse
ac668111 12import urllib.request
f9934b96 13import urllib.response
14import uuid
3e9b66d7 15
3e9b66d7
LNO
16from .common import InfoExtractor
17from ..aes import aes_ecb_decrypt
3e9b66d7
LNO
18from ..utils import (
19 ExtractorError,
f8271158 20 bytes_to_intlist,
7b2c3f47 21 decode_base_n,
3e9b66d7 22 int_or_none,
f8271158 23 intlist_to_bytes,
bc83b4b0 24 OnDemandPagedList,
3e9b66d7
LNO
25 request_to_url,
26 time_seconds,
3e9b66d7 27 traverse_obj,
f8271158 28 update_url_query,
3e9b66d7
LNO
29)
30
3e9b66d7
LNO
# NOTE: the network handler related code is a temporary measure until the network stack overhaul PRs are merged (#2861/#2862)
32
e5a998f3 33
def add_opener(ydl, handler):
    ''' Register a handler on the opener used for URL requests (e.g. by _download_webpage) '''
    # How OpenerDirector stores and dispatches to handlers:
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
    opener = ydl._opener
    assert isinstance(opener, urllib.request.OpenerDirector)
    opener.add_handler(handler)
3e9b66d7
LNO
40
41
def remove_opener(ydl, handler):
    '''
    Remove handler(s) for opening URLs
    @param ydl      Object whose `_opener` attribute is the urllib.request.OpenerDirector to modify.
    @param handler  Either handler object itself, a handler type, or a tuple of handler types.
                    Specifying type(s) removes every handler for which isinstance returns True.
    '''
    # OpenerDirector.add_handler registers a handler in several lookup tables
    # in addition to the plain `handlers` list; all of them have to be cleaned.
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
    opener = ydl._opener
    assert isinstance(opener, urllib.request.OpenerDirector)
    if isinstance(handler, (type, tuple)):
        find_cp = lambda x: isinstance(x, handler)
    else:
        find_cp = lambda x: x is handler

    def prune(table):
        # Filter each list in place so the opener keeps using the same list objects
        for registered in table.values():
            registered[:] = [x for x in registered if not find_cp(x)]

    # Walk the tables themselves instead of guessing their keys from the method
    # names of `handler`: this also works for a tuple of types and for subclasses,
    # and it never inserts new (empty) entries into the opener.
    for table in (opener.handle_open, opener.process_request, opener.process_response):
        prune(table)
    # handle_error is nested one level deeper: protocol -> error kind -> handlers
    for per_protocol in opener.handle_error.values():
        prune(per_protocol)

    # Detach the removed handlers from the opener before dropping them
    for x in opener.handlers:
        if find_cp(x):
            x.add_parent(None)
    opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)]
99
100
class AbemaLicenseHandler(urllib.request.BaseHandler):
    ''' urllib handler that answers 'abematv-license://<ticket>' requests with the decrypted video key '''

    handler_order = 499
    # alphabet used to decode the 'k' field of the license response
    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
    # hex-encoded HMAC key used to derive the key that decrypts the video key
    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'

    def __init__(self, ie: 'AbemaTVIE'):
        self.ie = ie
        # The scheme to be handled is really 'abematv-license://', so the opener
        # looks for a method literally named 'abematv-license_open'. That is not a
        # valid identifier, hence abematv_license_open is defined as a placeholder
        # and exposed under the hyphenated name here.
        # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
        setattr(self, 'abematv-license_open', self.abematv_license_open)

    def _get_videokey_from_ticket(self, ticket):
        ''' Exchange a license ticket for the decrypted video key (bytes) '''
        verbose = self.ie.get_param('verbose', False)
        media_token = self.ie._get_media_token(to_show=verbose)

        payload = json.dumps({
            'kv': 'a',
            'lt': ticket
        }).encode('utf-8')
        license_response = self.ie._download_json(
            'https://license.abema.io/abematv-hls', None,
            note='Requesting playback license' if verbose else False,
            query={'t': media_token}, data=payload,
            headers={
                'Content-Type': 'application/json',
            })

        # 'k' decodes to one big integer; repack it as 16 big-endian bytes
        packed_key = decode_base_n(license_response['k'], table=self.STRTABLE)
        high, low = packed_key >> 64, packed_key & 0xffffffffffffffff
        encvideokey = bytes_to_intlist(struct.pack('>QQ', high, low))

        # the decryption key is HMAC-SHA256(HKEY, content id + device id)
        hmac_key = binascii.unhexlify(self.HKEY)
        hmac_message = (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8')
        digest = hmac.new(hmac_key, hmac_message, digestmod=hashlib.sha256).digest()
        enckey = bytes_to_intlist(digest)

        return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))

    def abematv_license_open(self, url):
        ''' Serve the video key for the ticket found in the host part of the requested URL '''
        license_url = request_to_url(url)
        ticket = urllib.parse.urlparse(license_url).netloc
        videokey = self._get_videokey_from_ticket(ticket)
        response_headers = {
            'Content-Length': len(videokey),
        }
        return urllib.response.addinfourl(
            io.BytesIO(videokey), headers=response_headers, url=license_url, code=200)
146
147
class AbemaTVBaseIE(InfoExtractor):
    ''' Shared token handling and API helpers for the AbemaTV extractors '''

    # These are always assigned on AbemaTVBaseIE itself (not on the instance),
    # so that all extractors deriving from this class share them.
    _USERTOKEN = None
    _DEVICE_ID = None
    _MEDIATOKEN = None

    _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'

    @classmethod
    def _generate_aks(cls, deviceid):
        '''
        Generate the 'applicationKeySecret' sent along with a device ID
        @param deviceid  Device ID as str
        @returns         URL-safe base64 str without padding
        '''
        deviceid = deviceid.encode('utf-8')
        # add 1 hour and then drop minute and secs
        ts_1hour = int((time_seconds() // 3600 + 1) * 3600)
        time_struct = time.gmtime(ts_1hour)
        ts_1hour_str = str(ts_1hour).encode('utf-8')

        # running HMAC-SHA256 digest; every mix_* step below replaces it
        tmp = None

        def mix_once(nonce):
            nonlocal tmp
            h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256)
            h.update(nonce)
            tmp = h.digest()

        def mix_tmp(count):
            # feed the current digest back into itself `count` times
            for _ in range(count):
                mix_once(tmp)

        def mix_twist(nonce):
            mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce)

        mix_once(cls._SECRETKEY)
        mix_tmp(time_struct.tm_mon)
        mix_twist(deviceid)
        mix_tmp(time_struct.tm_mday % 5)
        mix_twist(ts_1hour_str)
        mix_tmp(time_struct.tm_hour % 5)

        return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')

    def _get_device_token(self):
        '''
        Return the user token, obtaining one if there is none yet.
        A token cached for the configured username is tried first; otherwise
        a new device ID is registered and the license handler is installed.
        '''
        if self._USERTOKEN:
            return self._USERTOKEN

        username, _ = self._get_login_info()
        AbemaTVBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username)
        if AbemaTVBaseIE._USERTOKEN:
            # try authentication with locally stored token
            try:
                self._get_media_token(True)
                # Callers put the return value into the Authorization header,
                # so the token has to be returned on this path as well.
                # NOTE(review): _DEVICE_ID is not assigned and the license handler
                # is not installed on this path - confirm whether that is intended
                return self._USERTOKEN
            except ExtractorError as e:
                self.report_warning(f'Failed to login with cached user token; obtaining a fresh one ({e})')

        AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
        aks = self._generate_aks(self._DEVICE_ID)
        user_data = self._download_json(
            'https://api.abema.io/v1/users', None, note='Authorizing',
            data=json.dumps({
                'deviceId': self._DEVICE_ID,
                'applicationKeySecret': aks,
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        AbemaTVBaseIE._USERTOKEN = user_data['token']

        # don't allow adding it 2 times or more, though it's guarded
        remove_opener(self._downloader, AbemaLicenseHandler)
        add_opener(self._downloader, AbemaLicenseHandler(self))

        return self._USERTOKEN

    def _get_media_token(self, invalidate=False, to_show=True):
        '''
        Return the media token, fetching it when necessary
        @param invalidate  Fetch a new token even if one is already stored
        @param to_show     Whether to print the progress note for the request
        '''
        if not invalidate and self._MEDIATOKEN:
            return self._MEDIATOKEN

        AbemaTVBaseIE._MEDIATOKEN = self._download_json(
            'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
            query={
                'osName': 'android',
                'osVersion': '6.0.1',
                'osLang': 'ja_JP',
                'osTimezone': 'Asia/Tokyo',
                'appId': 'tv.abema',
                'appVersion': '3.27.1'
            }, headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })['token']

        return self._MEDIATOKEN

    def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'):
        ''' Download JSON from https://api.abema.io/<endpoint> authorized with the user token '''
        return self._download_json(
            f'https://api.abema.io/{endpoint}', video_id, query=query or {},
            note=note,
            headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })

    def _extract_breadcrumb_list(self, webpage, video_id):
        ''' Return the item names of the first non-empty BreadcrumbList JSON-LD in the page, or [] '''
        for jld in re.finditer(
                r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                webpage):
            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
            if traverse_obj(jsonld, '@type') != 'BreadcrumbList':
                continue
            items = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
            if items:
                return items
        return []
260
261
class AbemaTVIE(AbemaTVBaseIE):
    ''' Extractor for AbemaTV episodes, timeshift slots and on-air channels '''

    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
    _NETRC_MACHINE = 'abematv'
    _TESTS = [{
        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
        'info_dict': {
            'id': '194-25_s2_p1',
            'title': '第1話 「チーズケーキ」 「モーニング再び」',
            'series': '異世界食堂2',
            'series_number': 2,
            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
            'episode_number': 1,
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
        'info_dict': {
            'id': 'E8tvAnMJ7a9a5d',
            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series': 'ゆるキャン△ SEASON2',
            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series_number': 2,
            'episode_number': 1,
            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
        'info_dict': {
            # the id is always the one matched from the URL
            'id': '87-877_s1282_p31047',
            'title': '第5話『光射す』',
            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
            'thumbnail': r're:https://hayabusa\.io/.+',
            'series': '相棒',
            'episode': '第5話『光射す』',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/now-on-air/abema-anime',
        'info_dict': {
            'id': 'abema-anime',
            # this varies
            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
            'is_live': True,
        },
        'skip': 'Not supported until yt-dlp implements native live downloader OR AbemaTV can start a local HTTP server',
    }]
    _TIMETABLE = None

    def _perform_login(self, username, password):
        ''' Log in with an e-mail address or user ID, and cache the resulting user token '''
        self._get_device_token()
        if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token():
            self.write_debug('Skipping logging in')
            return

        if '@' in username:  # don't strictly check if it's email address or not
            ep, method = 'user/email', 'email'
        else:
            ep, method = 'oneTimePassword', 'userId'

        login_response = self._download_json(
            f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
            data=json.dumps({
                method: username,
                'password': password
            }).encode('utf-8'), headers={
                'Authorization': f'bearer {self._get_device_token()}',
                'Origin': 'https://abema.tv',
                'Referer': 'https://abema.tv/',
                'Content-Type': 'application/json',
            })

        AbemaTVBaseIE._USERTOKEN = login_response['token']
        # the media token depends on the user token, so it must be fetched again
        self._get_media_token(True)
        self.cache.store(self._NETRC_MACHINE, username, AbemaTVBaseIE._USERTOKEN)

    def _real_extract(self, url):
        ''' Build the info dict (metadata scraped from the page, formats from the HLS manifest) '''
        # starting download using infojson from this extractor is undefined behavior,
        # and will never be fixed in the future; you must trigger downloads by directly specifying URL.
        # (unless there's a way to hook before downloading by extractor)
        video_id, video_type = self._match_valid_url(url).group('id', 'type')
        headers = {
            'Authorization': 'Bearer ' + self._get_device_token(),
        }
        # 'video/episode' -> 'episode', 'channels/<name>/slots' -> 'slots'
        video_type = video_type.split('/')[-1]

        webpage = self._download_webpage(url, video_id)
        canonical_url = self._search_regex(
            r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
            default=url)
        info = self._search_json_ld(webpage, video_id, default={})

        # title: try the page markup, then JSON-LD, then (live only) the timetable
        title = self._search_regex(
            r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
        if not title:
            jsonld = None
            for jld in re.finditer(
                    r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                    webpage):
                jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
                if jsonld:
                    break
            if jsonld:
                title = jsonld.get('caption')
        if not title and video_type == 'now-on-air':
            if not self._TIMETABLE:
                # cache the timetable because it goes to 5MiB in size (!!)
                self._TIMETABLE = self._download_json(
                    'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
                    headers=headers)
            now = time_seconds(hours=9)
            for slot in self._TIMETABLE.get('slots', []):
                if slot.get('channelId') != video_id:
                    continue
                if slot['startAt'] <= now and now < slot['endAt']:
                    title = slot['title']
                    break

        # read breadcrumb on top of page
        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
        if breadcrumb:
            # breadcrumb list translates to: (e.g. 1st test for this IE)
            # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
            # hence this works
            if len(breadcrumb) >= 2:
                # a single-entry breadcrumb has no series component
                info['series'] = breadcrumb[-2]
            info['episode'] = breadcrumb[-1]
            if not title:
                title = info['episode']

        description = self._html_search_regex(
            (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
             r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',),
            webpage, 'description', default=None, group=1)
        if not description:
            og_desc = self._html_search_meta(
                ('description', 'og:description', 'twitter:description'), webpage)
            if og_desc:
                # NOTE(review): the pattern has no end anchor, so the lazy group can stop
                # after one character and the optional suffix group usually does not take
                # part in the match; this substitution then leaves og_desc unchanged.
                # Confirm the intended output before anchoring it with `$`.
                description = re.sub(r'''(?sx)
                    ^(.+?)(?:
                        アニメの動画を無料で見るならABEMA!| # anime
                        等、.+ # applies for most of categories
                    )?
                ''', r'\1', og_desc)

        # canonical URL may contain series and episode number
        mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
        if mobj:
            seri = int_or_none(mobj.group(1), default=float('inf'))
            epis = int_or_none(mobj.group(2), default=float('inf'))
            info['series_number'] = seri if seri < 100 else None
            # some anime like Detective Conan (though not available in AbemaTV)
            # has more than 1000 episodes (1026 as of 2021/11/15)
            info['episode_number'] = epis if epis < 2000 else None

        is_live, m3u8_url = False, None
        if video_type == 'now-on-air':
            is_live = True
            channel_url = 'https://api.abema.io/v1/channels'
            if video_id == 'news-global':
                channel_url = update_url_query(channel_url, {'division': '1'})
            onair_channels = self._download_json(channel_url, video_id)
            for ch in onair_channels['channels']:
                if video_id == ch['id']:
                    m3u8_url = ch['playback']['hls']
                    break
            else:
                raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
        elif video_type == 'episode':
            api_response = self._download_json(
                f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
            if 3 not in ondemand_types:
                # cannot acquire decryption key for these streams
                self.report_warning('This is a premium-only stream')

            m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
        elif video_type == 'slots':
            api_response = self._download_json(
                f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
                self.report_warning('This is a premium-only stream')

            m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
        else:
            # _VALID_URL only admits the three types handled above
            raise ExtractorError('Unreachable')

        if is_live:
            self.report_warning("This is a livestream; yt-dlp doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV")
            self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4', live=is_live)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'is_live': is_live,
        })
        return info
467
468
class AbemaTVTitleIE(AbemaTVBaseIE):
    ''' Playlist extractor for AbemaTV series ("title") pages '''

    _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
    # number of programs requested per API call
    _PAGE_SIZE = 25

    _TESTS = [{
        'url': 'https://abema.tv/video/title/90-1597',
        'info_dict': {
            'id': '90-1597',
            'title': 'シャッフルアイランド',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://abema.tv/video/title/193-132',
        'info_dict': {
            'id': '193-132',
            'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://abema.tv/video/title/25-102',
        'info_dict': {
            'id': '25-102',
            'title': 'ソードアート・オンライン アリシゼーション',
        },
        'playlist_mincount': 24,
    }]

    def _fetch_page(self, playlist_id, series_version, page):
        ''' Yield a url result for each program ID listed on the given 0-based page '''
        page_query = {
            'seriesVersion': series_version,
            'offset': str(page * self._PAGE_SIZE),
            'order': 'seq',
            'limit': str(self._PAGE_SIZE),
        }
        response = self._call_api(
            f'v1/video/series/{playlist_id}/programs', playlist_id,
            note=f'Downloading page {page + 1}', query=page_query)
        for program_id in traverse_obj(response, ('programs', ..., 'id')):
            yield self.url_result(f'https://abema.tv/video/episode/{program_id}')

    def _entries(self, playlist_id, series_version):
        ''' Wrap _fetch_page in an OnDemandPagedList so that pages are requested on demand '''
        page_getter = functools.partial(self._fetch_page, playlist_id, series_version)
        return OnDemandPagedList(page_getter, self._PAGE_SIZE)

    def _real_extract(self, url):
        ''' Return a playlist of all episodes of the series in the URL '''
        playlist_id = self._match_id(url)
        series_info = self._call_api(f'v1/video/series/{playlist_id}', playlist_id)
        entries = self._entries(playlist_id, series_info['version'])

        return self.playlist_result(
            entries, playlist_id=playlist_id,
            playlist_title=series_info.get('title'),
            playlist_description=series_info.get('content'))
522 playlist_description=series_info.get('content'))