]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/abematv.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / abematv.py
CommitLineData
f8271158 1import base64
2import binascii
bc83b4b0 3import functools
3e9b66d7
LNO
4import hashlib
5import hmac
f8271158 6import io
7import json
3e9b66d7
LNO
8import re
9import struct
f8271158 10import time
14f25df2 11import urllib.parse
ac668111 12import urllib.request
f9934b96 13import urllib.response
14import uuid
e897bd82 15
3e9b66d7
LNO
16from .common import InfoExtractor
17from ..aes import aes_ecb_decrypt
3e9b66d7
LNO
18from ..utils import (
19 ExtractorError,
e897bd82 20 OnDemandPagedList,
f8271158 21 bytes_to_intlist,
7b2c3f47 22 decode_base_n,
3e9b66d7 23 int_or_none,
f8271158 24 intlist_to_bytes,
3e9b66d7 25 time_seconds,
3e9b66d7 26 traverse_obj,
f8271158 27 update_url_query,
3e9b66d7 28)
e897bd82 29from ..utils.networking import clean_proxies
3e9b66d7 30
3e9b66d7 31
def add_opener(ydl, handler):  # FIXME: Create proper API in .networking
    """Attach *handler* to the opener behind the 'Urllib' request handler.

    The handler is registered at most once: if the 'abematv-license' scheme
    is already listed in the request handler's supported schemes, nothing
    is done. Otherwise the handler is added to the opener instance and the
    scheme is appended to the supported-scheme tuple.
    """
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
    license_scheme = 'abematv-license'
    urllib_rh = ydl._request_director.handlers['Urllib']
    if license_scheme not in urllib_rh._SUPPORTED_URL_SCHEMES:
        # Work on copies so that clean_proxies() receives private objects
        request_headers = ydl.params['http_headers'].copy()
        proxy_map = ydl.proxies.copy()
        clean_proxies(proxy_map, request_headers)

        director = urllib_rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxy_map)
        assert isinstance(director, urllib.request.OpenerDirector)
        director.add_handler(handler)

        # Advertise the custom scheme so later calls hit the early exit above
        urllib_rh._SUPPORTED_URL_SCHEMES = tuple(urllib_rh._SUPPORTED_URL_SCHEMES) + (license_scheme,)
3e9b66d7
LNO
46
47
class AbemaLicenseHandler(urllib.request.BaseHandler):
    """urllib handler answering 'abematv-license://<ticket>' requests.

    The ticket is taken from the network-location part of the URL and
    exchanged, through the owning extractor, for the video key bytes that
    are returned as the response body.
    """

    handler_order = 499
    # Alphabet handed to decode_base_n() when decoding the license key
    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
    # Hex-encoded HMAC key (unhexlified before use)
    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'

    def __init__(self, ie: 'AbemaTVIE'):
        # the protocol that this should really handle is 'abematv-license://'
        # abematv_license_open is just a placeholder for development purposes
        # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
        placeholder = getattr(self, 'abematv_license_open', None)
        setattr(self, 'abematv-license_open', placeholder)
        self.ie = ie

    def _get_videokey_from_ticket(self, ticket):
        """Request a playback license for *ticket* and return the decrypted key bytes."""
        verbose = self.ie.get_param('verbose', False)
        media_token = self.ie._get_media_token(to_show=verbose)

        request_body = json.dumps({
            'kv': 'a',
            'lt': ticket,
        }).encode('utf-8')
        license_response = self.ie._download_json(
            'https://license.abema.io/abematv-hls', None,
            note='Requesting playback license' if verbose else False,
            query={'t': media_token},
            data=request_body,
            headers={'Content-Type': 'application/json'})

        # The decoded value is packed as two big-endian unsigned 64-bit halves
        decoded = decode_base_n(license_response['k'], table=self.STRTABLE)
        upper_half = decoded >> 64
        lower_half = decoded & 0xffffffffffffffff
        encvideokey = bytes_to_intlist(struct.pack('>QQ', upper_half, lower_half))

        # HMAC-SHA256 over content id + device id; its digest goes to aes_ecb_decrypt
        hmac_key = binascii.unhexlify(self.HKEY)
        hmac_message = (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8')
        mac = hmac.new(hmac_key, hmac_message, digestmod=hashlib.sha256)
        enckey = bytes_to_intlist(mac.digest())

        return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))

    def abematv_license_open(self, url):
        """Return a 200 response whose body is the video key for the ticket in *url*.

        *url* may be a plain string or a urllib.request.Request.
        """
        if isinstance(url, urllib.request.Request):
            url = url.get_full_url()
        ticket = urllib.parse.urlparse(url).netloc
        key_bytes = self._get_videokey_from_ticket(ticket)
        response_headers = {
            'Content-Length': str(len(key_bytes)),
        }
        return urllib.response.addinfourl(
            io.BytesIO(key_bytes), headers=response_headers, url=url, code=200)
93
94
class AbemaTVBaseIE(InfoExtractor):
    """Shared authentication and API helpers for the AbemaTV extractors.

    Tokens and the device ID are stored on AbemaTVBaseIE itself (not on the
    instance), so every subclass instance sees the same values.
    """

    _NETRC_MACHINE = 'abematv'

    # User token sent as the "bearer" credential in API requests
    _USERTOKEN = None
    # Device ID registered together with the user token
    _DEVICE_ID = None
    # Media token returned by the v1/media/token endpoint
    _MEDIATOKEN = None

    # HMAC key used by _generate_aks()
    _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'

    @classmethod
    def _generate_aks(cls, deviceid):
        """Derive the 'applicationKeySecret' string for *deviceid*.

        The value is an unpadded URL-safe base64 digest produced by repeated
        HMAC-SHA256 rounds keyed with _SECRETKEY; the number of rounds depends
        on the month, day and hour (UTC) of the next full hour.
        """
        deviceid = deviceid.encode('utf-8')
        # add 1 hour and then drop minute and secs
        ts_1hour = int((time_seconds() // 3600 + 1) * 3600)
        time_struct = time.gmtime(ts_1hour)
        ts_1hour_str = str(ts_1hour).encode('utf-8')

        tmp = None

        def mix_once(nonce):
            # One HMAC-SHA256 round; the digest becomes the new running state
            nonlocal tmp
            h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256)
            h.update(nonce)
            tmp = h.digest()

        def mix_tmp(count):
            # Feed the running state back into itself `count` times
            for _ in range(count):
                mix_once(tmp)

        def mix_twist(nonce):
            # Mix the base64 form of the running state with an extra nonce
            mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce)

        mix_once(cls._SECRETKEY)
        mix_tmp(time_struct.tm_mon)
        mix_twist(deviceid)
        mix_tmp(time_struct.tm_mday % 5)
        mix_twist(ts_1hour_str)
        mix_tmp(time_struct.tm_hour % 5)

        return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')

    def _get_device_token(self):
        """Return the user token, obtaining one on first use.

        Order of attempts: the token already stored on the class, a token
        cached for the configured username (validated by fetching a fresh
        media token), and finally a newly registered device.
        """
        if self._USERTOKEN:
            return self._USERTOKEN

        add_opener(self._downloader, AbemaLicenseHandler(self))

        username, _ = self._get_login_info()
        auth_cache = username and self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19')
        AbemaTVBaseIE._USERTOKEN = auth_cache and auth_cache.get('usertoken')
        if AbemaTVBaseIE._USERTOKEN:
            # try authentication with locally stored token
            try:
                AbemaTVBaseIE._DEVICE_ID = auth_cache.get('device_id')
                self._get_media_token(True)
                # Callers interpolate the result into the Authorization header,
                # so the validated token must be returned rather than None
                return self._USERTOKEN
            except ExtractorError as e:
                self.report_warning(f'Failed to login with cached user token; obtaining a fresh one ({e})')

        AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
        aks = self._generate_aks(self._DEVICE_ID)
        user_data = self._download_json(
            'https://api.abema.io/v1/users', None, note='Authorizing',
            data=json.dumps({
                'deviceId': self._DEVICE_ID,
                'applicationKeySecret': aks,
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        AbemaTVBaseIE._USERTOKEN = user_data['token']

        return self._USERTOKEN

    def _get_media_token(self, invalidate=False, to_show=True):
        """Return the media token, fetching it when missing or when *invalidate* is true.

        *to_show* controls whether the download note is displayed.
        """
        if not invalidate and self._MEDIATOKEN:
            return self._MEDIATOKEN

        AbemaTVBaseIE._MEDIATOKEN = self._download_json(
            'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
            query={
                'osName': 'android',
                'osVersion': '6.0.1',
                'osLang': 'ja_JP',
                'osTimezone': 'Asia/Tokyo',
                'appId': 'tv.abema',
                'appVersion': '3.27.1',
            }, headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })['token']

        return self._MEDIATOKEN

    def _perform_login(self, username, password):
        """Log in with *username*/*password* and cache the resulting credentials.

        Logging in is skipped when credentials are cached for *username* and
        a media token is available.
        """
        self._get_device_token()
        if self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19') and self._get_media_token():
            self.write_debug('Skipping logging in')
            return

        if '@' in username:  # don't strictly check if it's email address or not
            ep, method = 'user/email', 'email'
        else:
            ep, method = 'oneTimePassword', 'userId'

        login_response = self._download_json(
            f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
            data=json.dumps({
                method: username,
                'password': password,
            }).encode('utf-8'), headers={
                'Authorization': f'bearer {self._get_device_token()}',
                'Origin': 'https://abema.tv',
                'Referer': 'https://abema.tv/',
                'Content-Type': 'application/json',
            })

        AbemaTVBaseIE._USERTOKEN = login_response['token']
        # The media token was issued for the previous user token; refresh it
        self._get_media_token(True)
        auth_cache = {
            'device_id': AbemaTVBaseIE._DEVICE_ID,
            'usertoken': AbemaTVBaseIE._USERTOKEN,
        }
        self.cache.store(self._NETRC_MACHINE, username, auth_cache)

    def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'):
        """Download JSON from https://api.abema.io/<endpoint> with the bearer token attached."""
        return self._download_json(
            f'https://api.abema.io/{endpoint}', video_id, query=query or {},
            note=note,
            headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })

    def _extract_breadcrumb_list(self, webpage, video_id):
        """Return the item names of the first non-empty BreadcrumbList JSON-LD block.

        Returns an empty list when no such block is found in *webpage*.
        """
        for jld in re.finditer(
                r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                webpage):
            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
            if traverse_obj(jsonld, '@type') != 'BreadcrumbList':
                continue
            items = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
            if items:
                return items
        return []
240
241
class AbemaTVIE(AbemaTVBaseIE):
    """Extractor for single AbemaTV items: on-air channels, episodes and slots."""

    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
    _TESTS = [{
        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
        'info_dict': {
            'id': '194-25_s2_p1',
            'title': '第1話 「チーズケーキ」 「モーニング再び」',
            'series': '異世界食堂2',
            'season': 'シーズン2',
            'season_number': 2,
            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
            'episode_number': 1,
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
        'info_dict': {
            'id': 'E8tvAnMJ7a9a5d',
            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series': 'ゆるキャン△ SEASON2',
            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'season_number': 2,
            'episode_number': 1,
            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
        'info_dict': {
            # _real_extract() reports the id group of the URL
            'id': '87-877_s1282_p31047',
            'title': '第5話『光射す』',
            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
            'thumbnail': r're:https://hayabusa\.io/.+',
            'series': '相棒',
            'episode': '第5話『光射す』',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/now-on-air/abema-anime',
        'info_dict': {
            'id': 'abema-anime',
            # this varies
            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
            'is_live': True,
        },
        'skip': 'Not supported until yt-dlp implements native live downloader OR AbemaTV can start a local HTTP server',
    }]
    # Timetable response, kept after the first download
    _TIMETABLE = None

    def _real_extract(self, url):
        """Build the info dict (metadata and HLS formats) for *url*.

        Raises ExtractorError when an on-air channel cannot be found in the
        channel list.
        """
        # starting download using infojson from this extractor is undefined behavior,
        # and never be fixed in the future; you must trigger downloads by directly specifying URL.
        # (unless there's a way to hook before downloading by extractor)
        video_id, video_type = self._match_valid_url(url).group('id', 'type')
        headers = {
            'Authorization': 'Bearer ' + self._get_device_token(),
        }
        # 'video/episode' -> 'episode', 'channels/<name>/slots' -> 'slots'
        video_type = video_type.split('/')[-1]

        webpage = self._download_webpage(url, video_id)
        canonical_url = self._search_regex(
            r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
            default=url)
        info = self._search_json_ld(webpage, video_id, default={})

        title = self._search_regex(
            r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
        if not title:
            # Fall back to the caption of the first parsable thumbnail JSON-LD block
            jsonld = None
            for jld in re.finditer(
                    r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                    webpage):
                jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
                if jsonld:
                    break
            if jsonld:
                title = jsonld.get('caption')
        if not title and video_type == 'now-on-air':
            if not self._TIMETABLE:
                # cache the timetable because it goes to 5MiB in size (!!)
                self._TIMETABLE = self._download_json(
                    'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
                    headers=headers)
            now = time_seconds(hours=9)
            for slot in self._TIMETABLE.get('slots', []):
                if slot.get('channelId') != video_id:
                    continue
                if slot['startAt'] <= now < slot['endAt']:
                    title = slot['title']
                    break

        # read breadcrumb on top of page
        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
        if breadcrumb:
            # breadcrumb list translates to: (e.g. 1st test for this IE)
            # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
            # hence this works
            if len(breadcrumb) >= 2:
                # a single-entry breadcrumb has no series element to read
                info['series'] = breadcrumb[-2]
            info['episode'] = breadcrumb[-1]
            if not title:
                title = info['episode']

        description = self._html_search_regex(
            (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
             r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',),
            webpage, 'description', default=None, group=1)
        if not description:
            og_desc = self._html_search_meta(
                ('description', 'og:description', 'twitter:description'), webpage)
            if og_desc:
                # NOTE(review): the pattern has no trailing `$`, so the lazy group
                # can match a single character and the substitution appears to
                # leave og_desc unchanged in most cases - confirm the intent
                description = re.sub(r'''(?sx)
                    ^(.+?)(?:
                        アニメの動画を無料で見るならABEMA!| # anime
                        等、.+ # applies for most of categories
                    )?
                ''', r'\1', og_desc)

        # canonical URL may contain season and episode number
        mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
        if mobj:
            seri = int_or_none(mobj.group(1), default=float('inf'))
            epis = int_or_none(mobj.group(2), default=float('inf'))
            info['season_number'] = seri if seri < 100 else None
            # some anime like Detective Conan (though not available in AbemaTV)
            # has more than 1000 episodes (1026 as of 2021/11/15)
            info['episode_number'] = epis if epis < 2000 else None

        is_live, m3u8_url = False, None
        if video_type == 'now-on-air':
            is_live = True
            channel_url = 'https://api.abema.io/v1/channels'
            if video_id == 'news-global':
                channel_url = update_url_query(channel_url, {'division': '1'})
            onair_channels = self._download_json(channel_url, video_id)
            for ch in onair_channels['channels']:
                if video_id == ch['id']:
                    m3u8_url = ch['playback']['hls']
                    break
            else:
                raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
        elif video_type == 'episode':
            api_response = self._download_json(
                f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
            if 3 not in ondemand_types:
                # cannot acquire decryption key for these streams
                self.report_warning('This is a premium-only stream')
            info.update(traverse_obj(api_response, {
                'series': ('series', 'title'),
                'season': ('season', 'name'),
                'season_number': ('season', 'sequence'),
                'episode_number': ('episode', 'number'),
            }))
            if not title:
                title = traverse_obj(api_response, ('episode', 'title'))
            if not description:
                description = traverse_obj(api_response, ('episode', 'content'))

            m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
        elif video_type == 'slots':
            api_response = self._download_json(
                f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
                self.report_warning('This is a premium-only stream')

            m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
        else:
            raise ExtractorError('Unreachable')

        if is_live:
            self.report_warning("This is a livestream; yt-dlp doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV")
            self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4', live=is_live)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'is_live': is_live,
        })
        return info
430
431
class AbemaTVTitleIE(AbemaTVBaseIE):
    """Playlist extractor for AbemaTV series pages (/video/title/<id>)."""

    _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
    # Number of programs requested per API page
    _PAGE_SIZE = 25

    _TESTS = [{
        'url': 'https://abema.tv/video/title/90-1597',
        'info_dict': {
            'id': '90-1597',
            'title': 'シャッフルアイランド',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://abema.tv/video/title/193-132',
        'info_dict': {
            'id': '193-132',
            'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://abema.tv/video/title/25-102',
        'info_dict': {
            'id': '25-102',
            'title': 'ソードアート・オンライン アリシゼーション',
        },
        'playlist_mincount': 24,
    }]

    def _fetch_page(self, playlist_id, series_version, page):
        """Yield a url_result for every program id on zero-based page *page*."""
        page_query = {
            'seriesVersion': series_version,
            'offset': str(page * self._PAGE_SIZE),
            'order': 'seq',
            'limit': str(self._PAGE_SIZE),
        }
        listing = self._call_api(
            f'v1/video/series/{playlist_id}/programs', playlist_id,
            note=f'Downloading page {page + 1}', query=page_query)
        for program_id in traverse_obj(listing, ('programs', ..., 'id')):
            yield self.url_result(f'https://abema.tv/video/episode/{program_id}')

    def _entries(self, playlist_id, series_version):
        """Return the paged list of episode entries for the series."""
        page_fetcher = functools.partial(self._fetch_page, playlist_id, series_version)
        return OnDemandPagedList(page_fetcher, self._PAGE_SIZE)

    def _real_extract(self, url):
        """Return a playlist result covering every program of the series in *url*."""
        playlist_id = self._match_id(url)
        series_info = self._call_api(f'v1/video/series/{playlist_id}', playlist_id)
        entries = self._entries(playlist_id, series_info['version'])

        return self.playlist_result(
            entries, playlist_id=playlist_id,
            playlist_title=series_info.get('title'),
            playlist_description=series_info.get('content'))