]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/abematv.py
[cleanup] Misc cleanup (#2173)
[yt-dlp.git] / yt_dlp / extractor / abematv.py
CommitLineData
f8271158 1import base64
2import binascii
3e9b66d7
LNO
3import hashlib
4import hmac
f8271158 5import io
6import json
3e9b66d7
LNO
7import re
8import struct
f8271158 9import time
f9934b96 10import urllib.response
11import uuid
3e9b66d7 12
3e9b66d7
LNO
13from .common import InfoExtractor
14from ..aes import aes_ecb_decrypt
f8271158 15from ..compat import compat_urllib_parse_urlparse, compat_urllib_request
3e9b66d7
LNO
16from ..utils import (
17 ExtractorError,
f8271158 18 bytes_to_intlist,
3e9b66d7
LNO
19 decode_base,
20 int_or_none,
f8271158 21 intlist_to_bytes,
3e9b66d7
LNO
22 request_to_url,
23 time_seconds,
3e9b66d7 24 traverse_obj,
f8271158 25 update_url_query,
3e9b66d7
LNO
26 urljoin,
27)
28
3e9b66d7
LNO
# NOTE: the network-handler code below is a temporary measure until the network stack overhaul PRs (#2861/#2862) are merged
30
e5a998f3 31
def add_opener(ydl, handler):
    '''
    Register a urllib handler on the opener used by ydl, so that URL-opening
    helpers such as _download_webpage can go through it.

    @param ydl      Object whose _opener attribute is an OpenerDirector.
    @param handler  Handler instance to register.
    '''
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
    opener = ydl._opener
    assert isinstance(opener, compat_urllib_request.OpenerDirector)
    opener.add_handler(handler)
3e9b66d7
LNO
38
39
def remove_opener(ydl, handler):
    '''
    Remove handler(s) for opening URLs.

    Every registry that OpenerDirector.add_handler() may have filed a handler
    under is scanned directly, rather than being guessed from the method names
    of `handler`. This also unregisters handlers whose protocol methods were
    attached per instance (and are therefore invisible on the type), and it
    works when a tuple of types is given.

    @param ydl      Object whose _opener attribute is the OpenerDirector to modify.
    @param handler  Either handler object itself or handler type (or tuple of types).
                    Specifying handler type will remove all handlers for which
                    isinstance returns True.
    '''
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
    opener = ydl._opener
    assert isinstance(opener, compat_urllib_request.OpenerDirector)

    if isinstance(handler, (type, tuple)):
        find_cp = lambda x: isinstance(x, handler)
    else:
        find_cp = lambda x: x is handler

    # handle_open / process_request / process_response map protocol -> [handlers];
    # handle_error maps protocol -> {kind -> [handlers]}, so its values are
    # mappings of the same shape as the other three.
    chains = [opener.handle_open, opener.process_request, opener.process_response]
    chains.extend(opener.handle_error.values())

    for chain in chains:
        for handlers in chain.values():
            # filter in place: the opener keeps references to these lists
            handlers[:] = [x for x in handlers if not find_cp(x)]

    # detach the removed handlers from the opener before dropping them
    for x in opener.handlers:
        if find_cp(x):
            x.add_parent(None)
    opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)]
97
98
class AbemaLicenseHandler(compat_urllib_request.BaseHandler):
    '''
    urllib handler that answers 'abematv-license://<ticket>' URLs with the
    decrypted video key belonging to that ticket.
    '''
    handler_order = 499
    # alphabet handed to decode_base() when decoding the 'k' field of the license response
    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
    # hex-encoded HMAC key used to derive the key that decrypts the video key
    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'

    def __init__(self, ie: 'AbemaTVIE'):
        '''
        @param ie  Extractor instance used for network requests, the media token and the device ID.
        '''
        self.ie = ie
        # The protocol that this should really handle is 'abematv-license://'.
        # abematv_license_open is just a placeholder for development purposes,
        # so the same bound method is also exposed under the hyphenated name.
        # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
        setattr(self, 'abematv-license_open', self.abematv_license_open)

    def _get_videokey_from_ticket(self, ticket):
        '''
        Exchange a license ticket for the decrypted video key.

        @param ticket  Ticket string taken from the license URL.
        @return        The decrypted key as bytes.
        '''
        verbose = self.ie._downloader.params.get('verbose', False)
        media_token = self.ie._get_media_token(to_show=verbose)

        payload = json.dumps({
            'kv': 'a',
            'lt': ticket
        }).encode('utf-8')
        license_response = self.ie._download_json(
            'https://license.abema.io/abematv-hls', None,
            note='Requesting playback license' if verbose else False,
            query={'t': media_token},
            data=payload,
            headers={
                'Content-Type': 'application/json',
            })

        # 'k' decodes to one integer; pack it as two big-endian 64-bit words
        packed = decode_base(license_response['k'], self.STRTABLE)
        high, low = packed >> 64, packed & 0xffffffffffffffff
        encvideokey = bytes_to_intlist(struct.pack('>QQ', high, low))

        # the AES key is HMAC-SHA256(HKEY, cid + device id)
        mac = hmac.new(
            binascii.unhexlify(self.HKEY),
            (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'),
            digestmod=hashlib.sha256)
        enckey = bytes_to_intlist(mac.digest())

        return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))

    def abematv_license_open(self, url):
        '''
        Serve a license URL: the netloc part of the URL is the ticket, and the
        response body is the decrypted video key.
        '''
        url = request_to_url(url)
        ticket = compat_urllib_parse_urlparse(url).netloc
        key = self._get_videokey_from_ticket(ticket)
        response_headers = {
            'Content-Length': len(key),
        }
        return urllib.response.addinfourl(
            io.BytesIO(key), headers=response_headers, url=url, code=200)
144
145
class AbemaTVBaseIE(InfoExtractor):
    '''Shared helpers for the AbemaTV extractors.'''

    def _extract_breadcrumb_list(self, webpage, video_id):
        '''
        Return the item names of the first usable BreadcrumbList JSON-LD block
        found in webpage, or an empty list when there is none.
        '''
        pattern = r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
        for match in re.finditer(pattern, webpage):
            data = self._parse_json(match.group('json_ld'), video_id, fatal=False)
            # skip blocks that failed to parse or describe something else
            if not data or data.get('@type') != 'BreadcrumbList':
                continue
            names = traverse_obj(data, ('itemListElement', ..., 'name'))
            if names:
                return names
        return []
159
160
class AbemaTVIE(AbemaTVBaseIE):
    '''Extractor for AbemaTV episodes, time-shifted slots and on-air channels.'''
    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
    _NETRC_MACHINE = 'abematv'
    _TESTS = [{
        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
        'info_dict': {
            'id': '194-25_s2_p1',
            'title': '第1話 「チーズケーキ」 「モーニング再び」',
            'series': '異世界食堂2',
            'series_number': 2,
            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
            'episode_number': 1,
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
        'info_dict': {
            'id': 'E8tvAnMJ7a9a5d',
            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series': 'ゆるキャン△ SEASON2',
            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series_number': 2,
            'episode_number': 1,
            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
        'info_dict': {
            # the extractor always reports the ID taken from the URL
            'id': '87-877_s1282_p31047',
            'title': '第5話『光射す』',
            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
            'thumbnail': r're:https://hayabusa\.io/.+',
            'series': '相棒',
            'episode': '第5話『光射す』',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/now-on-air/abema-anime',
        'info_dict': {
            'id': 'abema-anime',
            # this varies
            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
            'is_live': True,
        },
        'skip': 'Not supported until yt-dlp implements native live downloader OR AbemaTV can start a local HTTP server',
    }]
    # per-session state, filled in lazily by the methods below
    _USERTOKEN = None
    _DEVICE_ID = None
    _TIMETABLE = None
    _MEDIATOKEN = None

    _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'

    def _generate_aks(self, deviceid):
        '''
        Compute the 'applicationKeySecret' sent when registering a device.

        The value is a chain of HMAC-SHA256 rounds keyed with _SECRETKEY that
        mixes in the device ID and an hour-aligned timestamp.

        @param deviceid  Device ID string.
        @return          URL-safe base64 string without '=' padding.
        '''
        deviceid = deviceid.encode('utf-8')
        # add 1 hour and then drop minute and secs
        ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600)
        time_struct = time.gmtime(ts_1hour)
        ts_1hour_str = str(ts_1hour).encode('utf-8')

        # running digest shared by the helpers below
        tmp = None

        def mix_once(nonce):
            # tmp = HMAC-SHA256(_SECRETKEY, nonce)
            nonlocal tmp
            h = hmac.new(self._SECRETKEY, digestmod=hashlib.sha256)
            h.update(nonce)
            tmp = h.digest()

        def mix_tmp(count):
            # feed the digest back into itself `count` times
            for _ in range(count):
                mix_once(tmp)

        def mix_twist(nonce):
            # mix the base64 form of the digest together with extra data
            mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce)

        mix_once(self._SECRETKEY)
        mix_tmp(time_struct.tm_mon)
        mix_twist(deviceid)
        mix_tmp(time_struct.tm_mday % 5)
        mix_twist(ts_1hour_str)
        mix_tmp(time_struct.tm_hour % 5)

        return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')

    def _get_device_token(self):
        '''
        Return the user token, registering a fresh random device first when no
        token is cached yet. Registration also (re)installs the license
        handler on the downloader's opener.
        '''
        if self._USERTOKEN:
            return self._USERTOKEN

        self._DEVICE_ID = str(uuid.uuid4())
        aks = self._generate_aks(self._DEVICE_ID)
        user_data = self._download_json(
            'https://api.abema.io/v1/users', None, note='Authorizing',
            data=json.dumps({
                'deviceId': self._DEVICE_ID,
                'applicationKeySecret': aks,
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        self._USERTOKEN = user_data['token']

        # don't allow adding it 2 times or more, though it's guarded
        remove_opener(self._downloader, AbemaLicenseHandler)
        add_opener(self._downloader, AbemaLicenseHandler(self))

        return self._USERTOKEN

    def _get_media_token(self, invalidate=False, to_show=True):
        '''
        Return the media token, fetching it when none is cached.

        @param invalidate  When True, ignore the cached token and fetch a new one.
        @param to_show     When False, the download note is suppressed.
        '''
        if not invalidate and self._MEDIATOKEN:
            return self._MEDIATOKEN

        self._MEDIATOKEN = self._download_json(
            'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
            query={
                'osName': 'android',
                'osVersion': '6.0.1',
                'osLang': 'ja_JP',
                'osTimezone': 'Asia/Tokyo',
                'appId': 'tv.abema',
                'appVersion': '3.27.1'
            }, headers={
                'Authorization': 'bearer ' + self._get_device_token()
            })['token']

        return self._MEDIATOKEN

    def _perform_login(self, username, password):
        '''
        Log in and replace the cached user token with the one of the account.

        A username containing '@' is sent to the e-mail endpoint, anything
        else to the one-time-password endpoint as a user ID.
        '''
        if '@' in username:  # don't strictly check if it's email address or not
            ep, method = 'user/email', 'email'
        else:
            ep, method = 'oneTimePassword', 'userId'

        login_response = self._download_json(
            f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
            data=json.dumps({
                method: username,
                'password': password
            }).encode('utf-8'), headers={
                'Authorization': 'bearer ' + self._get_device_token(),
                'Origin': 'https://abema.tv',
                'Referer': 'https://abema.tv/',
                'Content-Type': 'application/json',
            })

        self._USERTOKEN = login_response['token']
        # the media token fetched for the old user token must not be reused
        self._get_media_token(True)

    def _real_extract(self, url):
        '''
        Build the info dict (metadata and HLS formats) for an episode, a slot
        or an on-air channel URL.

        Raises ExtractorError when the requested on-air channel is not listed.
        '''
        # starting download using infojson from this extractor is undefined behavior,
        # and never be fixed in the future; you must trigger downloads by directly specifying URL.
        # (unless there's a way to hook before downloading by extractor)
        video_id, video_type = self._match_valid_url(url).group('id', 'type')
        headers = {
            'Authorization': 'Bearer ' + self._get_device_token(),
        }
        # 'video/episode' -> 'episode', 'channels/<name>/slots' -> 'slots'
        video_type = video_type.split('/')[-1]

        webpage = self._download_webpage(url, video_id)
        canonical_url = self._search_regex(
            r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
            default=url)
        info = self._search_json_ld(webpage, video_id, default={})

        title = self._search_regex(
            r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
        if not title:
            # fall back to the caption of the thumbnail's JSON-LD block
            jsonld = None
            for jld in re.finditer(
                    r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                    webpage):
                jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
                if jsonld:
                    break
            if jsonld:
                title = jsonld.get('caption')
        if not title and video_type == 'now-on-air':
            if not self._TIMETABLE:
                # cache the timetable because it goes to 5MiB in size (!!)
                self._TIMETABLE = self._download_json(
                    'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
                    headers=headers)
            now = time_seconds(hours=9)
            for slot in self._TIMETABLE.get('slots', []):
                if slot.get('channelId') != video_id:
                    continue
                if slot['startAt'] <= now and now < slot['endAt']:
                    title = slot['title']
                    break

        # read breadcrumb on top of page
        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
        if breadcrumb:
            # breadcrumb list translates to: (example is 1st test for this IE)
            # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
            # hence this works
            if len(breadcrumb) >= 2:
                # a single-entry breadcrumb has no series element to read
                info['series'] = breadcrumb[-2]
            info['episode'] = breadcrumb[-1]
            if not title:
                title = info['episode']

        description = self._html_search_regex(
            (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
             r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',),
            webpage, 'description', default=None, group=1)
        if not description:
            og_desc = self._html_search_meta(
                ('description', 'og:description', 'twitter:description'), webpage)
            if og_desc:
                # Strip the promotional suffix. The pattern must be anchored at
                # the end: without `$` the lazy group matches a single character,
                # the optional suffix group is skipped and nothing is stripped.
                description = re.sub(r'''(?sx)
                    ^(.+?)(?:
                        アニメの動画を無料で見るならABEMA!| # anime
                        等、.+ # applies for most of categories
                    )?$
                ''', r'\1', og_desc)

        # canonical URL may contain series and episode number
        mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
        if mobj:
            seri = int_or_none(mobj.group(1), default=float('inf'))
            epis = int_or_none(mobj.group(2), default=float('inf'))
            info['series_number'] = seri if seri < 100 else None
            # some anime like Detective Conan (though not available in AbemaTV)
            # has more than 1000 episodes (1026 as of 2021/11/15)
            info['episode_number'] = epis if epis < 2000 else None

        is_live, m3u8_url = False, None
        if video_type == 'now-on-air':
            is_live = True
            channel_url = 'https://api.abema.io/v1/channels'
            if video_id == 'news-global':
                channel_url = update_url_query(channel_url, {'division': '1'})
            onair_channels = self._download_json(channel_url, video_id)
            for ch in onair_channels['channels']:
                if video_id == ch['id']:
                    m3u8_url = ch['playback']['hls']
                    break
            else:
                raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
        elif video_type == 'episode':
            api_response = self._download_json(
                f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'), default=[])
            if 3 not in ondemand_types:
                # cannot acquire decryption key for these streams
                self.report_warning('This is a premium-only stream')

            m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
        elif video_type == 'slots':
            api_response = self._download_json(
                f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
                self.report_warning('This is a premium-only stream')

            m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
        else:
            # _VALID_URL only admits the three types handled above
            raise ExtractorError('Unreachable')

        if is_live:
            self.report_warning("This is a livestream; yt-dlp doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV")
            self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4', live=is_live)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'is_live': is_live,
        })
        return info
440
441
class AbemaTVTitleIE(AbemaTVBaseIE):
    '''Playlist extractor for AbemaTV title pages, yielding one entry per listed episode link.'''
    _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'

    _TESTS = [{
        'url': 'https://abema.tv/video/title/90-1597',
        'info_dict': {
            'id': '90-1597',
            'title': 'シャッフルアイランド',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://abema.tv/video/title/193-132',
        'info_dict': {
            'id': '193-132',
            'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
        },
        'playlist_mincount': 16,
    }]

    def _real_extract(self, url):
        '''Return a playlist of the episode URLs linked from the title page.'''
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # the last breadcrumb entry, when present, is used as the playlist title
        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
        playlist_title = breadcrumb[-1] if breadcrumb else None

        entries = []
        for link in re.finditer(r'<li\s*class=".+?EpisodeList.+?"><a\s*href="(/[^"]+?)"', webpage):
            episode_url = urljoin('https://abema.tv/', link.group(1))
            entries.append(self.url_result(episode_url))

        return self.playlist_result(entries, playlist_title=playlist_title, playlist_id=video_id)