]> jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/extractor/abematv.py
[ie/abematv] Temporary fix for protocol handler
[yt-dlp.git] / yt_dlp / extractor / abematv.py
... / ...
CommitLineData
1import base64
2import binascii
3import functools
4import hashlib
5import hmac
6import io
7import json
8import re
9import struct
10import time
11import urllib.parse
12import urllib.request
13import urllib.response
14import uuid
15
16from .common import InfoExtractor
17from ..aes import aes_ecb_decrypt
18from ..utils import (
19 ExtractorError,
20 bytes_to_intlist,
21 decode_base_n,
22 int_or_none,
23 intlist_to_bytes,
24 OnDemandPagedList,
25 time_seconds,
26 traverse_obj,
27 update_url_query,
28)
29
30
def add_opener(ydl, handler):  # FIXME: Create proper API in .networking
    """Register ``handler`` on the urllib opener that ``ydl`` uses for requests.

    The handler is attached to the opener obtained from the 'Urllib' request
    handler, and the 'abematv-license' scheme is appended to that request
    handler's supported schemes. If the scheme is already listed, nothing is
    done, so calling this repeatedly is harmless.
    """
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
    license_scheme = 'abematv-license'
    urllib_rh = ydl._request_director.handlers['Urllib']
    if license_scheme not in urllib_rh._SUPPORTED_URL_SCHEMES:
        director = urllib_rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies)
        assert isinstance(director, urllib.request.OpenerDirector)
        director.add_handler(handler)
        # Rebind (rather than mutate) so the attribute stays a tuple
        urllib_rh._SUPPORTED_URL_SCHEMES = (*urllib_rh._SUPPORTED_URL_SCHEMES, license_scheme)
42
43
class AbemaLicenseHandler(urllib.request.BaseHandler):
    """urllib handler that answers ``abematv-license://<ticket>`` requests.

    The response body is the decrypted video key for the ticket carried in
    the URL's network location.
    """
    handler_order = 499
    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'

    def __init__(self, ie: 'AbemaTVIE'):
        # The scheme this handler really serves is 'abematv-license://'.
        # A hyphen cannot appear in a method name, so the real implementation
        # lives in abematv_license_open and is aliased under the hyphenated
        # attribute name here.
        # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
        setattr(self, 'abematv-license_open', self.abematv_license_open)
        self.ie = ie

    def _get_videokey_from_ticket(self, ticket):
        """Exchange a license ``ticket`` for the decrypted video key (bytes)."""
        verbose = self.ie.get_param('verbose', False)
        media_token = self.ie._get_media_token(to_show=verbose)

        payload = json.dumps({
            'kv': 'a',
            'lt': ticket,
        }).encode('utf-8')
        license_response = self.ie._download_json(
            'https://license.abema.io/abematv-hls', None,
            note='Requesting playback license' if verbose else False,
            query={'t': media_token},
            data=payload,
            headers={'Content-Type': 'application/json'})

        # 'k' is a number encoded with STRTABLE; repack it as two
        # big-endian 64-bit words to obtain the encrypted key bytes
        packed = decode_base_n(license_response['k'], table=self.STRTABLE)
        encvideokey = bytes_to_intlist(struct.pack('>QQ', packed >> 64, packed & 0xffffffffffffffff))

        # The AES key is HMAC-SHA256(HKEY, cid + device id)
        mac_key = binascii.unhexlify(self.HKEY)
        mac_message = (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8')
        enckey = bytes_to_intlist(hmac.new(mac_key, mac_message, digestmod=hashlib.sha256).digest())

        return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))

    def abematv_license_open(self, url):
        """Return a 200 response whose body is the video key for ``url``.

        ``url`` may be a URL string or a ``urllib.request.Request``.
        """
        if isinstance(url, urllib.request.Request):
            url = url.get_full_url()
        ticket = urllib.parse.urlparse(url).netloc
        video_key = self._get_videokey_from_ticket(ticket)
        response_headers = {'Content-Length': str(len(video_key))}
        return urllib.response.addinfourl(
            io.BytesIO(video_key), headers=response_headers, url=url, code=200)
89
90
class AbemaTVBaseIE(InfoExtractor):
    """Shared authentication and API helpers for the AbemaTV extractors."""

    # Declared here because _get_device_token() reads self._NETRC_MACHINE for
    # every subclass, including those that do not define it themselves.
    _NETRC_MACHINE = 'abematv'

    # These are always assigned on AbemaTVBaseIE itself (not on the instance)
    # so that every extractor instance shares the same tokens.
    _USERTOKEN = None
    _DEVICE_ID = None
    _MEDIATOKEN = None

    _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'

    @classmethod
    def _generate_aks(cls, deviceid):
        """Derive the ``applicationKeySecret`` string for ``deviceid``.

        The value is a chain of HMAC-SHA256 rounds keyed with ``_SECRETKEY``;
        the number of rounds depends on the month, day and hour of the next
        full hour (UTC), and the device id and that timestamp are mixed in.
        Returns the final digest as unpadded URL-safe base64 text.
        """
        deviceid = deviceid.encode('utf-8')
        # add 1 hour and then drop minute and secs
        ts_1hour = int((time_seconds() // 3600 + 1) * 3600)
        time_struct = time.gmtime(ts_1hour)
        ts_1hour_str = str(ts_1hour).encode('utf-8')

        tmp = None

        def mix_once(nonce):
            # one HMAC round over ``nonce``; the digest becomes the new state
            nonlocal tmp
            h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256)
            h.update(nonce)
            tmp = h.digest()

        def mix_tmp(count):
            # feed the current state back into itself ``count`` times
            for _ in range(count):
                mix_once(tmp)

        def mix_twist(nonce):
            # mix the base64 form of the state concatenated with ``nonce``
            mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce)

        mix_once(cls._SECRETKEY)
        mix_tmp(time_struct.tm_mon)
        mix_twist(deviceid)
        mix_tmp(time_struct.tm_mday % 5)
        mix_twist(ts_1hour_str)
        mix_tmp(time_struct.tm_hour % 5)

        return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')

    def _get_device_token(self):
        """Return the user token, obtaining one first if necessary.

        Order of preference: the token already held on the class, a cached
        token for the configured username (validated by fetching a media
        token with it), and finally a token for a freshly registered device.
        The license URL handler is registered on the downloader as part of
        the first successful lookup.
        """
        if self._USERTOKEN:
            return self._USERTOKEN

        # Register before any early return so that the cached-token path also
        # gets the handler; add_opener() does nothing if already registered.
        add_opener(self._downloader, AbemaLicenseHandler(self))

        username, _ = self._get_login_info()
        AbemaTVBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username)
        if AbemaTVBaseIE._USERTOKEN:
            # try authentication with locally stored token
            try:
                self._get_media_token(True)
            except ExtractorError as e:
                self.report_warning(f'Failed to login with cached user token; obtaining a fresh one ({e})')
            else:
                # NOTE(review): _DEVICE_ID is not restored on this path, yet
                # AbemaLicenseHandler concatenates it into the HMAC message.
                # The cache only holds the token, so fixing this needs the
                # device id to be cached alongside it.
                return self._USERTOKEN

        AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
        aks = self._generate_aks(self._DEVICE_ID)
        user_data = self._download_json(
            'https://api.abema.io/v1/users', None, note='Authorizing',
            data=json.dumps({
                'deviceId': self._DEVICE_ID,
                'applicationKeySecret': aks,
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        AbemaTVBaseIE._USERTOKEN = user_data['token']

        return self._USERTOKEN

    def _get_media_token(self, invalidate=False, to_show=True):
        """Return the media token, fetching a new one when needed.

        ``invalidate`` forces a fresh request even if a token is held;
        ``to_show`` controls whether the download note is printed.
        """
        if not invalidate and self._MEDIATOKEN:
            return self._MEDIATOKEN

        AbemaTVBaseIE._MEDIATOKEN = self._download_json(
            'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
            query={
                'osName': 'android',
                'osVersion': '6.0.1',
                'osLang': 'ja_JP',
                'osTimezone': 'Asia/Tokyo',
                'appId': 'tv.abema',
                'appVersion': '3.27.1',
            }, headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })['token']

        return self._MEDIATOKEN

    def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'):
        """GET ``https://api.abema.io/<endpoint>`` as JSON with the user token."""
        return self._download_json(
            f'https://api.abema.io/{endpoint}', video_id, query=query or {},
            note=note,
            headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })

    def _extract_breadcrumb_list(self, webpage, video_id):
        """Return the names from the first non-empty BreadcrumbList JSON-LD.

        Only JSON-LD scripts that directly follow a closing ``</span></li></ul>``
        are considered. Returns an empty list when none qualifies.
        """
        for jld in re.finditer(
                r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                webpage):
            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
            if traverse_obj(jsonld, '@type') != 'BreadcrumbList':
                continue
            items = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
            if items:
                return items
        return []
200
201
class AbemaTVIE(AbemaTVBaseIE):
    """Extractor for AbemaTV episodes, time-shifted slots and on-air channels."""
    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
    _NETRC_MACHINE = 'abematv'
    _TESTS = [{
        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
        'info_dict': {
            'id': '194-25_s2_p1',
            'title': '第1話 「チーズケーキ」 「モーニング再び」',
            'series': '異世界食堂2',
            'series_number': 2,
            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
            'episode_number': 1,
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
        'info_dict': {
            'id': 'E8tvAnMJ7a9a5d',
            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series': 'ゆるキャン△ SEASON2',
            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series_number': 2,
            'episode_number': 1,
            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
        'info_dict': {
            # the extractor always reports the id taken from the URL
            'id': '87-877_s1282_p31047',
            'title': '第5話『光射す』',
            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
            'thumbnail': r're:https://hayabusa\.io/.+',
            'series': '相棒',
            'episode': '第5話『光射す』',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/now-on-air/abema-anime',
        'info_dict': {
            'id': 'abema-anime',
            # this varies
            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
            'is_live': True,
        },
        'skip': 'Not supported until yt-dlp implements native live downloader OR AbemaTV can start a local HTTP server',
    }]
    _TIMETABLE = None

    def _perform_login(self, username, password):
        """Log in with ``username``/``password`` and cache the resulting token.

        Login is skipped when a cached token exists for ``username`` and a
        media token is available. A username containing '@' is sent to the
        email endpoint, anything else to the one-time-password endpoint.
        """
        self._get_device_token()
        if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token():
            self.write_debug('Skipping logging in')
            return

        if '@' in username:  # don't strictly check if it's email address or not
            ep, method = 'user/email', 'email'
        else:
            ep, method = 'oneTimePassword', 'userId'

        login_response = self._download_json(
            f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
            data=json.dumps({
                method: username,
                'password': password,
            }).encode('utf-8'), headers={
                'Authorization': f'bearer {self._get_device_token()}',
                'Origin': 'https://abema.tv',
                'Referer': 'https://abema.tv/',
                'Content-Type': 'application/json',
            })

        AbemaTVBaseIE._USERTOKEN = login_response['token']
        # the old media token belongs to the previous user token
        self._get_media_token(True)
        self.cache.store(self._NETRC_MACHINE, username, AbemaTVBaseIE._USERTOKEN)

    def _real_extract(self, url):
        """Build the info dict (metadata plus HLS formats) for ``url``.

        Metadata is gathered from the web page (JSON-LD, breadcrumb, meta
        tags) and, depending on the URL type, from the timetable, channel,
        program or slot API.

        Raises ExtractorError when an on-air channel cannot be found.
        """
        # starting download using infojson from this extractor is undefined behavior,
        # and never be fixed in the future; you must trigger downloads by directly specifying URL.
        # (unless there's a way to hook before downloading by extractor)
        video_id, video_type = self._match_valid_url(url).group('id', 'type')
        headers = {
            'Authorization': 'Bearer ' + self._get_device_token(),
        }
        # 'video/episode' -> 'episode', 'channels/<name>/slots' -> 'slots'
        video_type = video_type.split('/')[-1]

        webpage = self._download_webpage(url, video_id)
        canonical_url = self._search_regex(
            r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
            default=url)
        info = self._search_json_ld(webpage, video_id, default={})

        title = self._search_regex(
            r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
        if not title:
            # fall back to the caption of the first parsable thumbnail JSON-LD
            jsonld = None
            for jld in re.finditer(
                    r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                    webpage):
                jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
                if jsonld:
                    break
            if jsonld:
                title = jsonld.get('caption')
        if not title and video_type == 'now-on-air':
            if not self._TIMETABLE:
                # cache the timetable because it goes to 5MiB in size (!!)
                self._TIMETABLE = self._download_json(
                    'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
                    headers=headers)
            now = time_seconds(hours=9)
            for slot in self._TIMETABLE.get('slots', []):
                if slot.get('channelId') != video_id:
                    continue
                if slot['startAt'] <= now and now < slot['endAt']:
                    title = slot['title']
                    break

        # read breadcrumb on top of page
        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
        if breadcrumb:
            # breadcrumb list translates to: (e.g. 1st test for this IE)
            # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
            # hence the last entry is the episode and the one before it the series
            info['episode'] = breadcrumb[-1]
            if len(breadcrumb) >= 2:
                # a single-entry breadcrumb has no series entry to read
                info['series'] = breadcrumb[-2]
            if not title:
                title = info['episode']

        description = self._html_search_regex(
            (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
             r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div'),
            webpage, 'description', default=None, group=1)
        if not description:
            og_desc = self._html_search_meta(
                ('description', 'og:description', 'twitter:description'), webpage)
            if og_desc:
                # strip the promotional tail that the site appends
                description = re.sub(r'''(?sx)
                    ^(.+?)(?:
                        アニメの動画を無料で見るならABEMA!| # anime
                        等、.+ # applies for most of categories
                    )?
                ''', r'\1', og_desc)

        # canonical URL may contain series and episode number
        mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
        if mobj:
            seri = int_or_none(mobj.group(1), default=float('inf'))
            epis = int_or_none(mobj.group(2), default=float('inf'))
            info['series_number'] = seri if seri < 100 else None
            # some anime like Detective Conan (though not available in AbemaTV)
            # has more than 1000 episodes (1026 as of 2021/11/15)
            info['episode_number'] = epis if epis < 2000 else None

        is_live, m3u8_url = False, None
        if video_type == 'now-on-air':
            is_live = True
            channel_url = 'https://api.abema.io/v1/channels'
            if video_id == 'news-global':
                channel_url = update_url_query(channel_url, {'division': '1'})
            onair_channels = self._download_json(channel_url, video_id)
            for ch in onair_channels['channels']:
                if video_id == ch['id']:
                    m3u8_url = ch['playback']['hls']
                    break
            else:
                raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
        elif video_type == 'episode':
            api_response = self._download_json(
                f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
            if 3 not in ondemand_types:
                # cannot acquire decryption key for these streams
                self.report_warning('This is a premium-only stream')
            info.update(traverse_obj(api_response, {
                'series': ('series', 'title'),
                'season': ('season', 'title'),
                'season_number': ('season', 'sequence'),
                'episode_number': ('episode', 'number'),
            }))
            if not title:
                title = traverse_obj(api_response, ('episode', 'title'))
            if not description:
                description = traverse_obj(api_response, ('episode', 'content'))

            m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
        elif video_type == 'slots':
            api_response = self._download_json(
                f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
                self.report_warning('This is a premium-only stream')

            m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
        else:
            # _VALID_URL only admits the three types handled above
            raise ExtractorError('Unreachable')

        if is_live:
            self.report_warning("This is a livestream; yt-dlp doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV")
            self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4', live=is_live)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'is_live': is_live,
        })
        return info
417
418
class AbemaTVTitleIE(AbemaTVBaseIE):
    """Playlist extractor for AbemaTV series ("title") pages."""
    _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
    _PAGE_SIZE = 25

    _TESTS = [{
        'url': 'https://abema.tv/video/title/90-1597',
        'info_dict': {
            'id': '90-1597',
            'title': 'シャッフルアイランド',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://abema.tv/video/title/193-132',
        'info_dict': {
            'id': '193-132',
            'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://abema.tv/video/title/25-102',
        'info_dict': {
            'id': '25-102',
            'title': 'ソードアート・オンライン アリシゼーション',
        },
        'playlist_mincount': 24,
    }]

    def _fetch_page(self, playlist_id, series_version, page):
        """Yield a URL result for each program on zero-based ``page``."""
        page_size = self._PAGE_SIZE
        response = self._call_api(
            f'v1/video/series/{playlist_id}/programs', playlist_id,
            note=f'Downloading page {page + 1}',
            query={
                'seriesVersion': series_version,
                'offset': str(page * page_size),
                'order': 'seq',
                'limit': str(page_size),
            })
        for program_id in traverse_obj(response, ('programs', ..., 'id')):
            yield self.url_result(f'https://abema.tv/video/episode/{program_id}')

    def _entries(self, playlist_id, series_version):
        """Return a paged list that fetches the series' programs on demand."""
        fetch_page = functools.partial(self._fetch_page, playlist_id, series_version)
        return OnDemandPagedList(fetch_page, self._PAGE_SIZE)

    def _real_extract(self, url):
        """Return a playlist result covering every program of the series."""
        playlist_id = self._match_id(url)
        series_info = self._call_api(f'v1/video/series/{playlist_id}', playlist_id)

        entries = self._entries(playlist_id, series_info['version'])
        return self.playlist_result(
            entries,
            playlist_id=playlist_id,
            playlist_title=series_info.get('title'),
            playlist_description=series_info.get('content'))