]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/abematv.py
[cleanup] Fix infodict returned fields (#8906)
[yt-dlp.git] / yt_dlp / extractor / abematv.py
CommitLineData
f8271158 1import base64
2import binascii
bc83b4b0 3import functools
3e9b66d7
LNO
4import hashlib
5import hmac
f8271158 6import io
7import json
3e9b66d7
LNO
8import re
9import struct
f8271158 10import time
14f25df2 11import urllib.parse
ac668111 12import urllib.request
f9934b96 13import urllib.response
14import uuid
497bbbbd 15from ..utils.networking import clean_proxies
3e9b66d7
LNO
16from .common import InfoExtractor
17from ..aes import aes_ecb_decrypt
3e9b66d7
LNO
18from ..utils import (
19 ExtractorError,
f8271158 20 bytes_to_intlist,
7b2c3f47 21 decode_base_n,
3e9b66d7 22 int_or_none,
f8271158 23 intlist_to_bytes,
bc83b4b0 24 OnDemandPagedList,
3e9b66d7 25 time_seconds,
3e9b66d7 26 traverse_obj,
f8271158 27 update_url_query,
3e9b66d7
LNO
28)
29
3e9b66d7 30
def add_opener(ydl, handler):  # FIXME: Create proper API in .networking
    """Register *handler* on the opener used by ydl's 'Urllib' request handler.

    Does nothing when 'abematv-license' is already listed in that request
    handler's supported URL schemes; otherwise adds the handler to its opener
    and appends the scheme to the list.
    """
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
    urllib_rh = ydl._request_director.handlers['Urllib']
    if 'abematv-license' in urllib_rh._SUPPORTED_URL_SCHEMES:
        return

    # work on copies so that clean_proxies() cannot touch ydl's own settings
    http_headers = ydl.params['http_headers'].copy()
    proxy_map = ydl.proxies.copy()
    clean_proxies(proxy_map, http_headers)

    director = urllib_rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxy_map)
    assert isinstance(director, urllib.request.OpenerDirector)
    director.add_handler(handler)
    urllib_rh._SUPPORTED_URL_SCHEMES = tuple(urllib_rh._SUPPORTED_URL_SCHEMES) + ('abematv-license',)
3e9b66d7
LNO
45
46
class AbemaLicenseHandler(urllib.request.BaseHandler):
    """urllib handler that answers 'abematv-license://<ticket>' requests with the decrypted video key."""

    handler_order = 499
    # alphabet handed to decode_base_n() when decoding the license response's 'k' field
    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
    # hex-encoded HMAC key used to derive the AES key that wraps the video key
    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'

    def __init__(self, ie: 'AbemaTVIE'):
        self.ie = ie
        # the protocol that this should really handle is 'abematv-license://',
        # which is not a valid Python identifier; abematv_license_open is the
        # spellable placeholder and is exposed here under the real name
        # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
        setattr(self, 'abematv-license_open', self.abematv_license_open)

    def _get_videokey_from_ticket(self, ticket):
        """Exchange a license *ticket* for the decrypted video key (bytes)."""
        verbose = self.ie.get_param('verbose', False)
        media_token = self.ie._get_media_token(to_show=verbose)

        payload = json.dumps({
            'kv': 'a',
            'lt': ticket,
        }).encode('utf-8')
        license_response = self.ie._download_json(
            'https://license.abema.io/abematv-hls', None,
            note='Requesting playback license' if verbose else False,
            query={'t': media_token},
            data=payload,
            headers={
                'Content-Type': 'application/json',
            })

        # 'k' decodes to one big integer; repack it as two big-endian 64-bit words
        key_number = decode_base_n(license_response['k'], table=self.STRTABLE)
        high_word = key_number >> 64
        low_word = key_number & 0xffffffffffffffff
        encrypted_key = bytes_to_intlist(struct.pack('>QQ', high_word, low_word))

        # the AES key is HMAC-SHA256(HKEY, response 'cid' + device ID)
        mac = hmac.new(
            binascii.unhexlify(self.HKEY),
            (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'),
            digestmod=hashlib.sha256)
        aes_key = bytes_to_intlist(mac.digest())

        return intlist_to_bytes(aes_ecb_decrypt(encrypted_key, aes_key))

    def abematv_license_open(self, url):
        """Handle an 'abematv-license' URL (str or Request); the ticket is read from its netloc."""
        if isinstance(url, urllib.request.Request):
            url = url.get_full_url()
        ticket = urllib.parse.urlparse(url).netloc
        video_key = self._get_videokey_from_ticket(ticket)
        response_headers = {
            'Content-Length': str(len(video_key)),
        }
        return urllib.response.addinfourl(
            io.BytesIO(video_key), headers=response_headers, url=url, code=200)
92
93
class AbemaTVBaseIE(InfoExtractor):
    """Authentication and API helpers shared by the AbemaTV extractors.

    The user token, device ID and media token are assigned on AbemaTVBaseIE
    itself (not on the instance), so they are shared by all subclasses.
    """
    _NETRC_MACHINE = 'abematv'

    _USERTOKEN = None
    _DEVICE_ID = None
    _MEDIATOKEN = None

    _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'

    @classmethod
    def _generate_aks(cls, deviceid):
        """Derive the 'applicationKeySecret' string for *deviceid*.

        The value depends on the device ID and on the start of the next full
        hour (UTC); it is an unpadded URL-safe base64 string.
        """
        deviceid = deviceid.encode('utf-8')
        # round down to the current hour, then add one hour
        ts_1hour = int((time_seconds() // 3600 + 1) * 3600)
        time_struct = time.gmtime(ts_1hour)
        ts_1hour_str = str(ts_1hour).encode('utf-8')

        tmp = None

        def mix_once(nonce):
            # tmp <- HMAC-SHA256(_SECRETKEY, nonce)
            nonlocal tmp
            h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256)
            h.update(nonce)
            tmp = h.digest()

        def mix_tmp(count):
            # re-hash the running digest *count* times
            for _ in range(count):
                mix_once(tmp)

        def mix_twist(nonce):
            # hash the unpadded base64 of the running digest followed by *nonce*
            mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce)

        mix_once(cls._SECRETKEY)
        mix_tmp(time_struct.tm_mon)
        mix_twist(deviceid)
        mix_tmp(time_struct.tm_mday % 5)
        mix_twist(ts_1hour_str)
        mix_tmp(time_struct.tm_hour % 5)

        return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')

    def _get_device_token(self):
        """Return the user token, restoring or creating one on first use.

        Order of attempts:
          1. the token already stored on the class;
          2. a token cached for the configured username, accepted only if a
             media token can be fetched with it;
          3. a fresh token requested for a newly generated device ID.
        """
        if self._USERTOKEN:
            return self._USERTOKEN

        add_opener(self._downloader, AbemaLicenseHandler(self))

        username, _ = self._get_login_info()
        auth_cache = username and self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19')
        AbemaTVBaseIE._USERTOKEN = auth_cache and auth_cache.get('usertoken')
        if AbemaTVBaseIE._USERTOKEN:
            # try authentication with locally stored token
            try:
                AbemaTVBaseIE._DEVICE_ID = auth_cache.get('device_id')
                self._get_media_token(True)
                # return the validated token, as every other path of this method does
                # (a bare return here would hand None to callers building auth headers)
                return self._USERTOKEN
            except ExtractorError as e:
                self.report_warning(f'Failed to login with cached user token; obtaining a fresh one ({e})')

        AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
        aks = self._generate_aks(self._DEVICE_ID)
        user_data = self._download_json(
            'https://api.abema.io/v1/users', None, note='Authorizing',
            data=json.dumps({
                'deviceId': self._DEVICE_ID,
                'applicationKeySecret': aks,
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        AbemaTVBaseIE._USERTOKEN = user_data['token']

        return self._USERTOKEN

    def _get_media_token(self, invalidate=False, to_show=True):
        """Return the media token, fetching it when missing or when *invalidate* is true.

        *to_show* controls whether the download note is printed.
        """
        if not invalidate and self._MEDIATOKEN:
            return self._MEDIATOKEN

        AbemaTVBaseIE._MEDIATOKEN = self._download_json(
            'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
            query={
                'osName': 'android',
                'osVersion': '6.0.1',
                'osLang': 'ja_JP',
                'osTimezone': 'Asia/Tokyo',
                'appId': 'tv.abema',
                'appVersion': '3.27.1'
            }, headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })['token']

        return self._MEDIATOKEN

    def _perform_login(self, username, password):
        """Log in with *username*/*password* unless a cached login is still usable.

        On success the new user token and the device ID are written to the cache
        under *username*.
        """
        self._get_device_token()
        if self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19') and self._get_media_token():
            self.write_debug('Skipping logging in')
            return

        if '@' in username:  # don't strictly check if it's email address or not
            ep, method = 'user/email', 'email'
        else:
            ep, method = 'oneTimePassword', 'userId'

        login_response = self._download_json(
            f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
            data=json.dumps({
                method: username,
                'password': password
            }).encode('utf-8'), headers={
                'Authorization': f'bearer {self._get_device_token()}',
                'Origin': 'https://abema.tv',
                'Referer': 'https://abema.tv/',
                'Content-Type': 'application/json',
            })

        AbemaTVBaseIE._USERTOKEN = login_response['token']
        # the media token obtained before logging in belongs to the old user token
        self._get_media_token(True)
        auth_cache = {
            'device_id': AbemaTVBaseIE._DEVICE_ID,
            'usertoken': AbemaTVBaseIE._USERTOKEN,
        }
        self.cache.store(self._NETRC_MACHINE, username, auth_cache)

    def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'):
        """Download JSON from https://api.abema.io/<endpoint> using the user token as bearer auth."""
        return self._download_json(
            f'https://api.abema.io/{endpoint}', video_id, query=query or {},
            note=note,
            headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })

    def _extract_breadcrumb_list(self, webpage, video_id):
        """Return the item names of the first non-empty BreadcrumbList JSON-LD found, or []."""
        for jld in re.finditer(
                r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                webpage):
            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
            if traverse_obj(jsonld, '@type') != 'BreadcrumbList':
                continue
            items = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
            if items:
                return items
        return []
239
240
class AbemaTVIE(AbemaTVBaseIE):
    """Extractor for single AbemaTV items: on-demand episodes, timeshift slots and live channels."""
    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
    _TESTS = [{
        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
        'info_dict': {
            'id': '194-25_s2_p1',
            'title': '第1話 「チーズケーキ」 「モーニング再び」',
            'series': '異世界食堂2',
            'season': 'シーズン2',
            'season_number': 2,
            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
            'episode_number': 1,
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
        'info_dict': {
            'id': 'E8tvAnMJ7a9a5d',
            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series': 'ゆるキャン△ SEASON2',
            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'season_number': 2,
            'episode_number': 1,
            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
        'info_dict': {
            # the extractor always returns the id taken from the URL
            'id': '87-877_s1282_p31047',
            'title': '第5話『光射す』',
            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
            'thumbnail': r're:https://hayabusa\.io/.+',
            'series': '相棒',
            'episode': '第5話『光射す』',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/now-on-air/abema-anime',
        'info_dict': {
            'id': 'abema-anime',
            # this varies
            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
            'is_live': True,
        },
        'skip': 'Not supported until yt-dlp implements native live downloader OR AbemaTV can start a local HTTP server',
    }]
    # timetable JSON, kept after the first download (see _real_extract)
    _TIMETABLE = None

    def _real_extract(self, url):
        """Build the info dict for an episode, slot or live channel URL.

        Metadata is collected from the webpage (JSON-LD, breadcrumb, markup),
        the canonical URL and, per URL type, the AbemaTV API; formats come from
        the HLS playlist chosen for that type.

        Raises ExtractorError when a live channel cannot be found in the
        channel list.
        """
        # starting download using infojson from this extractor is undefined behavior,
        # and will never be fixed in the future; you must trigger downloads by directly specifying URL.
        # (unless there's a way to hook before downloading by extractor)
        video_id, video_type = self._match_valid_url(url).group('id', 'type')
        headers = {
            'Authorization': 'Bearer ' + self._get_device_token(),
        }
        # 'video/episode' -> 'episode', 'channels/<name>/slots' -> 'slots'
        video_type = video_type.split('/')[-1]

        webpage = self._download_webpage(url, video_id)
        canonical_url = self._search_regex(
            r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
            default=url)
        info = self._search_json_ld(webpage, video_id, default={})

        title = self._search_regex(
            r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
        if not title:
            # fall back to the 'caption' of the first parseable JSON-LD next to the thumbnail
            jsonld = None
            for jld in re.finditer(
                    r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                    webpage):
                jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
                if jsonld:
                    break
            if jsonld:
                title = jsonld.get('caption')
        if not title and video_type == 'now-on-air':
            if not self._TIMETABLE:
                # cache the timetable because it goes to 5MiB in size (!!)
                self._TIMETABLE = self._download_json(
                    'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
                    headers=headers)
            now = time_seconds(hours=9)
            for slot in self._TIMETABLE.get('slots', []):
                if slot.get('channelId') != video_id:
                    continue
                if slot['startAt'] <= now < slot['endAt']:
                    title = slot['title']
                    break

        # read breadcrumb on top of page
        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
        # both the series ([-2]) and the episode ([-1]) entries are needed;
        # a shorter list would raise IndexError on breadcrumb[-2]
        if len(breadcrumb) >= 2:
            # breadcrumb list translates to: (e.g. 1st test for this IE)
            # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
            # hence this works
            info['series'] = breadcrumb[-2]
            info['episode'] = breadcrumb[-1]
            if not title:
                title = info['episode']

        description = self._html_search_regex(
            (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
             r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',),
            webpage, 'description', default=None, group=1)
        if not description:
            og_desc = self._html_search_meta(
                ('description', 'og:description', 'twitter:description'), webpage)
            if og_desc:
                # NOTE(review): the pattern has no trailing `$`, so the lazy `(.+?)` can stop
                # after one character with the optional group left empty; this substitution
                # then appears to leave og_desc unchanged. Confirm the intended trimming
                # before anchoring it, since that would change the returned description.
                description = re.sub(r'''(?sx)
                    ^(.+?)(?:
                        アニメの動画を無料で見るならABEMA!| # anime
                        等、.+ # applies for most of categories
                    )?
                ''', r'\1', og_desc)

        # canonical URL may contain season and episode number
        mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
        if mobj:
            seri = int_or_none(mobj.group(1), default=float('inf'))
            epis = int_or_none(mobj.group(2), default=float('inf'))
            # implausibly large numbers are dropped (cf. the 's1282_p31047' test URL)
            info['season_number'] = seri if seri < 100 else None
            # some anime like Detective Conan (though not available in AbemaTV)
            # has more than 1000 episodes (1026 as of 2021/11/15)
            info['episode_number'] = epis if epis < 2000 else None

        is_live, m3u8_url = False, None
        if video_type == 'now-on-air':
            is_live = True
            channel_url = 'https://api.abema.io/v1/channels'
            if video_id == 'news-global':
                channel_url = update_url_query(channel_url, {'division': '1'})
            onair_channels = self._download_json(channel_url, video_id)
            for ch in onair_channels['channels']:
                if video_id == ch['id']:
                    m3u8_url = ch['playback']['hls']
                    break
            else:
                raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
        elif video_type == 'episode':
            api_response = self._download_json(
                f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
            if 3 not in ondemand_types:
                # cannot acquire decryption key for these streams
                self.report_warning('This is a premium-only stream')
            # API metadata takes precedence over what was scraped from the page
            info.update(traverse_obj(api_response, {
                'series': ('series', 'title'),
                'season': ('season', 'name'),
                'season_number': ('season', 'sequence'),
                'episode_number': ('episode', 'number'),
            }))
            if not title:
                title = traverse_obj(api_response, ('episode', 'title'))
            if not description:
                description = traverse_obj(api_response, ('episode', 'content'))

            m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
        elif video_type == 'slots':
            api_response = self._download_json(
                f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
                self.report_warning('This is a premium-only stream')

            m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
        else:
            # _VALID_URL only admits the three types handled above
            raise ExtractorError('Unreachable')

        if is_live:
            self.report_warning("This is a livestream; yt-dlp doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV")
            self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4', live=is_live)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'is_live': is_live,
        })
        return info
429
430
class AbemaTVTitleIE(AbemaTVBaseIE):
    """Playlist extractor for AbemaTV series ('video/title') pages."""
    _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
    # number of programs requested per API page
    _PAGE_SIZE = 25

    _TESTS = [{
        'url': 'https://abema.tv/video/title/90-1597',
        'info_dict': {
            'id': '90-1597',
            'title': 'シャッフルアイランド',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://abema.tv/video/title/193-132',
        'info_dict': {
            'id': '193-132',
            'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://abema.tv/video/title/25-102',
        'info_dict': {
            'id': '25-102',
            'title': 'ソードアート・オンライン アリシゼーション',
        },
        'playlist_mincount': 24,
    }]

    def _fetch_page(self, playlist_id, series_version, page):
        """Yield one url_result per program id on zero-based *page* of the series."""
        page_query = {
            'seriesVersion': series_version,
            'offset': str(page * self._PAGE_SIZE),
            'order': 'seq',
            'limit': str(self._PAGE_SIZE),
        }
        programs = self._call_api(
            f'v1/video/series/{playlist_id}/programs', playlist_id,
            note=f'Downloading page {page + 1}', query=page_query)
        for program_id in traverse_obj(programs, ('programs', ..., 'id')):
            yield self.url_result(f'https://abema.tv/video/episode/{program_id}')

    def _entries(self, playlist_id, series_version):
        """Return a lazily paged list of the series' episodes."""
        page_getter = functools.partial(self._fetch_page, playlist_id, series_version)
        return OnDemandPagedList(page_getter, self._PAGE_SIZE)

    def _real_extract(self, url):
        """Return a playlist result for the series referenced by *url*."""
        playlist_id = self._match_id(url)
        series_info = self._call_api(f'v1/video/series/{playlist_id}', playlist_id)

        entries = self._entries(playlist_id, series_info['version'])
        return self.playlist_result(
            entries, playlist_id=playlist_id,
            playlist_title=series_info.get('title'),
            playlist_description=series_info.get('content'))