]> jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/extractor/abematv.py
[ie/motherless] Support uploader playlists (#8994)
[yt-dlp.git] / yt_dlp / extractor / abematv.py
... / ...
CommitLineData
1import base64
2import binascii
3import functools
4import hashlib
5import hmac
6import io
7import json
8import re
9import struct
10import time
11import urllib.parse
12import urllib.request
13import urllib.response
14import uuid
15from ..utils.networking import clean_proxies
16from .common import InfoExtractor
17from ..aes import aes_ecb_decrypt
18from ..utils import (
19 ExtractorError,
20 bytes_to_intlist,
21 decode_base_n,
22 int_or_none,
23 intlist_to_bytes,
24 OnDemandPagedList,
25 time_seconds,
26 traverse_obj,
27 update_url_query,
28)
29
30
def add_opener(ydl, handler):  # FIXME: Create proper API in .networking
    """Add a handler for opening URLs, like _download_webpage

    Looks up the 'Urllib' request handler of *ydl*, obtains its opener
    instance (built with ydl's cookiejar and cleaned proxies) and attaches
    *handler* to it. Afterwards 'abematv-license' is appended to the request
    handler's supported URL schemes. If that scheme is already listed,
    nothing is done.
    """
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
    license_scheme = 'abematv-license'
    urllib_rh = ydl._request_director.handlers['Urllib']
    if license_scheme in urllib_rh._SUPPORTED_URL_SCHEMES:
        # already registered
        return

    request_headers = ydl.params['http_headers'].copy()
    request_proxies = ydl.proxies.copy()
    clean_proxies(request_proxies, request_headers)

    director = urllib_rh._get_instance(cookiejar=ydl.cookiejar, proxies=request_proxies)
    assert isinstance(director, urllib.request.OpenerDirector)
    director.add_handler(handler)

    urllib_rh._SUPPORTED_URL_SCHEMES = (*urllib_rh._SUPPORTED_URL_SCHEMES, license_scheme)
45
46
class AbemaLicenseHandler(urllib.request.BaseHandler):
    """urllib handler serving 'abematv-license://<ticket>' URLs.

    The response body is the video key obtained from the license endpoint
    and decrypted locally.
    """
    handler_order = 499
    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'

    def __init__(self, ie: 'AbemaTVIE'):
        self.ie = ie
        # the protocol that this should really handle is 'abematv-license://'
        # abematv_license_open is just a placeholder for development purposes
        # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
        setattr(self, 'abematv-license_open', self.abematv_license_open)

    def _get_videokey_from_ticket(self, ticket):
        """Request the playback license for *ticket* and return the decrypted key as bytes."""
        verbose = self.ie.get_param('verbose', False)
        media_token = self.ie._get_media_token(to_show=verbose)

        progress_note = 'Requesting playback license' if verbose else False
        request_body = json.dumps({
            'kv': 'a',
            'lt': ticket
        }).encode('utf-8')
        license_response = self.ie._download_json(
            'https://license.abema.io/abematv-hls', None, note=progress_note,
            query={'t': media_token},
            data=request_body,
            headers={
                'Content-Type': 'application/json',
            })

        # decode 'k' with STRTABLE and pack the resulting integer as two
        # big-endian unsigned 64-bit words (high word first)
        key_number = decode_base_n(license_response['k'], table=self.STRTABLE)
        high_word = key_number >> 64
        low_word = key_number & 0xffffffffffffffff
        encrypted_key = bytes_to_intlist(struct.pack('>QQ', high_word, low_word))

        # decryption key: HMAC-SHA256 over (cid + device id), keyed with the unhexlified HKEY
        mac = hmac.new(
            binascii.unhexlify(self.HKEY),
            (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'),
            digestmod=hashlib.sha256)
        decryption_key = bytes_to_intlist(mac.digest())

        return intlist_to_bytes(aes_ecb_decrypt(encrypted_key, decryption_key))

    def abematv_license_open(self, url):
        """Answer a license URL (str or Request); its netloc is used as the ticket."""
        if isinstance(url, urllib.request.Request):
            url = url.get_full_url()
        ticket = urllib.parse.urlparse(url).netloc
        video_key = self._get_videokey_from_ticket(ticket)
        response_headers = {
            'Content-Length': str(len(video_key)),
        }
        return urllib.response.addinfourl(
            io.BytesIO(video_key), headers=response_headers, url=url, code=200)
92
93
class AbemaTVBaseIE(InfoExtractor):
    """Shared authentication and API helpers for the AbemaTV extractors.

    Tokens and the device id are stored on AbemaTVBaseIE itself (class
    attributes), so they are shared by every instance and subclass.
    """
    _NETRC_MACHINE = 'abematv'

    # set by _get_device_token() / _perform_login()
    _USERTOKEN = None
    # set by _get_device_token()
    _DEVICE_ID = None
    # set by _get_media_token()
    _MEDIATOKEN = None

    _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'

    @classmethod
    def _generate_aks(cls, deviceid):
        """Compute the 'applicationKeySecret' string for *deviceid*.

        The value is a chain of HMAC-SHA256 rounds keyed with _SECRETKEY.
        The number of rounds depends on the month, day and hour (UTC) of the
        next full hour, and the device id and that timestamp are mixed in.
        Returns unpadded URL-safe base64 text.
        """
        deviceid = deviceid.encode('utf-8')
        # add 1 hour and then drop minute and secs
        ts_1hour = int((time_seconds() // 3600 + 1) * 3600)
        time_struct = time.gmtime(ts_1hour)
        ts_1hour_str = str(ts_1hour).encode('utf-8')

        tmp = None

        def mix_once(nonce):
            # tmp = HMAC-SHA256(_SECRETKEY, nonce)
            nonlocal tmp
            h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256)
            h.update(nonce)
            tmp = h.digest()

        def mix_tmp(count):
            # feed the current digest back into the HMAC *count* times
            for _ in range(count):
                mix_once(tmp)

        def mix_twist(nonce):
            # HMAC over (unpadded urlsafe-base64 of the current digest + nonce)
            mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce)

        mix_once(cls._SECRETKEY)
        mix_tmp(time_struct.tm_mon)
        mix_twist(deviceid)
        mix_tmp(time_struct.tm_mday % 5)
        mix_twist(ts_1hour_str)
        mix_tmp(time_struct.tm_hour % 5)

        return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')

    def _get_device_token(self):
        """Return the user token, obtaining one if none is stored yet.

        On the first call the license URL handler is registered. If a
        username is configured and a cached token exists for it, that token
        is validated by fetching a media token. Otherwise (or when validation
        fails) a new random device id is generated and registered with the
        API to obtain a fresh token.
        """
        if self._USERTOKEN:
            return self._USERTOKEN

        add_opener(self._downloader, AbemaLicenseHandler(self))

        username, _ = self._get_login_info()
        auth_cache = username and self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19')
        AbemaTVBaseIE._USERTOKEN = auth_cache and auth_cache.get('usertoken')
        if AbemaTVBaseIE._USERTOKEN:
            # try authentication with locally stored token
            try:
                AbemaTVBaseIE._DEVICE_ID = auth_cache.get('device_id')
                self._get_media_token(True)
            except ExtractorError as e:
                self.report_warning(f'Failed to login with cached user token; obtaining a fresh one ({e})')
            else:
                # return the validated token (a bare `return` here would hand None to callers)
                return self._USERTOKEN

        AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
        aks = self._generate_aks(self._DEVICE_ID)
        user_data = self._download_json(
            'https://api.abema.io/v1/users', None, note='Authorizing',
            data=json.dumps({
                'deviceId': self._DEVICE_ID,
                'applicationKeySecret': aks,
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        AbemaTVBaseIE._USERTOKEN = user_data['token']

        return self._USERTOKEN

    def _get_media_token(self, invalidate=False, to_show=True):
        """Return the media token, fetching it when missing or when *invalidate* is true.

        *to_show* controls whether the download note is displayed.
        """
        if not invalidate and self._MEDIATOKEN:
            return self._MEDIATOKEN

        AbemaTVBaseIE._MEDIATOKEN = self._download_json(
            'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
            query={
                'osName': 'android',
                'osVersion': '6.0.1',
                'osLang': 'ja_JP',
                'osTimezone': 'Asia/Tokyo',
                'appId': 'tv.abema',
                'appVersion': '3.27.1'
            }, headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })['token']

        return self._MEDIATOKEN

    def _perform_login(self, username, password):
        """Log in with *username*/*password* and cache the resulting token.

        The login request is skipped when a cache entry exists for *username*
        and a media token is available.
        """
        self._get_device_token()
        if self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19') and self._get_media_token():
            self.write_debug('Skipping logging in')
            return

        if '@' in username:  # don't strictly check if it's email address or not
            ep, method = 'user/email', 'email'
        else:
            ep, method = 'oneTimePassword', 'userId'

        login_response = self._download_json(
            f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
            data=json.dumps({
                method: username,
                'password': password
            }).encode('utf-8'), headers={
                'Authorization': f'bearer {self._get_device_token()}',
                'Origin': 'https://abema.tv',
                'Referer': 'https://abema.tv/',
                'Content-Type': 'application/json',
            })

        AbemaTVBaseIE._USERTOKEN = login_response['token']
        # refresh the media token so it is issued for the logged-in user token
        self._get_media_token(True)
        auth_cache = {
            'device_id': AbemaTVBaseIE._DEVICE_ID,
            'usertoken': AbemaTVBaseIE._USERTOKEN,
        }
        self.cache.store(self._NETRC_MACHINE, username, auth_cache)

    def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'):
        """Download JSON from https://api.abema.io/<endpoint> with the bearer token attached."""
        return self._download_json(
            f'https://api.abema.io/{endpoint}', video_id, query=query or {},
            note=note,
            headers={
                'Authorization': f'bearer {self._get_device_token()}',
            })

    def _extract_breadcrumb_list(self, webpage, video_id):
        """Return the item names of the first non-empty BreadcrumbList JSON-LD in *webpage*.

        Returns an empty list when no such JSON-LD block is found.
        """
        for jld in re.finditer(
                r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                webpage):
            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
            if traverse_obj(jsonld, '@type') != 'BreadcrumbList':
                continue
            items = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
            if items:
                return items
        return []
239
240
class AbemaTVIE(AbemaTVBaseIE):
    """Extractor for AbemaTV episodes, time-shifted slots and on-air channels."""
    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
    _TESTS = [{
        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
        'info_dict': {
            'id': '194-25_s2_p1',
            'title': '第1話 「チーズケーキ」 「モーニング再び」',
            'series': '異世界食堂2',
            'season': 'シーズン2',
            'season_number': 2,
            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
            'episode_number': 1,
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
        'info_dict': {
            'id': 'E8tvAnMJ7a9a5d',
            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series': 'ゆるキャン△ SEASON2',
            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
            'series_number': 2,
            'episode_number': 1,
            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
        'info_dict': {
            # the extractor always reports the id taken from the URL
            'id': '87-877_s1282_p31047',
            'title': '第5話『光射す』',
            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
            'thumbnail': r're:https://hayabusa\.io/.+',
            'series': '相棒',
            'episode': '第5話『光射す』',
        },
        'skip': 'expired',
    }, {
        'url': 'https://abema.tv/now-on-air/abema-anime',
        'info_dict': {
            'id': 'abema-anime',
            # this varies
            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
            'is_live': True,
        },
        'skip': 'Not supported until yt-dlp implements native live downloader OR AbemaTV can start a local HTTP server',
    }]
    # timetable JSON, stored after the first download
    _TIMETABLE = None

    def _real_extract(self, url):
        """Extract metadata and HLS formats for an episode, slot or on-air channel URL.

        Title and description come from the web page first (HTML, JSON-LD,
        breadcrumb, meta tags), falling back to the timetable or program API.
        Raises ExtractorError when an on-air channel cannot be found.
        """
        # starting download using infojson from this extractor is undefined behavior,
        # and never be fixed in the future; you must trigger downloads by directly specifying URL.
        # (unless there's a way to hook before downloading by extractor)
        video_id, video_type = self._match_valid_url(url).group('id', 'type')
        headers = {
            'Authorization': 'Bearer ' + self._get_device_token(),
        }
        # keep only the last path component: 'now-on-air', 'episode' or 'slots'
        video_type = video_type.split('/')[-1]

        webpage = self._download_webpage(url, video_id)
        canonical_url = self._search_regex(
            r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
            default=url)
        info = self._search_json_ld(webpage, video_id, default={})

        title = self._search_regex(
            r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
        if not title:
            # fall back to the caption of the first parsable JSON-LD next to the thumbnail
            jsonld = None
            for jld in re.finditer(
                    r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
                    webpage):
                jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
                if jsonld:
                    break
            if jsonld:
                title = jsonld.get('caption')
        if not title and video_type == 'now-on-air':
            if not self._TIMETABLE:
                # cache the timetable because it goes to 5MiB in size (!!)
                self._TIMETABLE = self._download_json(
                    'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
                    headers=headers)
            now = time_seconds(hours=9)
            # use the title of the slot currently airing on this channel
            for slot in self._TIMETABLE.get('slots', []):
                if slot.get('channelId') != video_id:
                    continue
                if slot['startAt'] <= now and now < slot['endAt']:
                    title = slot['title']
                    break

        # read breadcrumb on top of page
        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
        if breadcrumb:
            # breadcrumb list translates to: (e.g. 1st test for this IE)
            # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
            # hence this works
            if len(breadcrumb) > 1:
                # a single-entry breadcrumb has no series element; avoid IndexError
                info['series'] = breadcrumb[-2]
            info['episode'] = breadcrumb[-1]
            if not title:
                title = info['episode']

        description = self._html_search_regex(
            (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
             r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',),
            webpage, 'description', default=None, group=1)
        if not description:
            og_desc = self._html_search_meta(
                ('description', 'og:description', 'twitter:description'), webpage)
            if og_desc:
                # NOTE(review): the pattern has no trailing anchor, so the lazy group
                # looks like it matches only the first character and the substitution
                # leaves og_desc unchanged in most cases - confirm the intended behaviour
                description = re.sub(r'''(?sx)
                    ^(.+?)(?:
                        アニメの動画を無料で見るならABEMA!| # anime
                        等、.+ # applies for most of categories
                    )?
                ''', r'\1', og_desc)

        # canonical URL may contain season and episode number
        mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
        if mobj:
            seri = int_or_none(mobj.group(1), default=float('inf'))
            epis = int_or_none(mobj.group(2), default=float('inf'))
            info['season_number'] = seri if seri < 100 else None
            # some anime like Detective Conan (though not available in AbemaTV)
            # has more than 1000 episodes (1026 as of 2021/11/15)
            info['episode_number'] = epis if epis < 2000 else None

        is_live, m3u8_url = False, None
        if video_type == 'now-on-air':
            is_live = True
            channel_url = 'https://api.abema.io/v1/channels'
            if video_id == 'news-global':
                channel_url = update_url_query(channel_url, {'division': '1'})
            onair_channels = self._download_json(channel_url, video_id)
            for ch in onair_channels['channels']:
                if video_id == ch['id']:
                    m3u8_url = ch['playback']['hls']
                    break
            else:
                raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
        elif video_type == 'episode':
            api_response = self._download_json(
                f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
            if 3 not in ondemand_types:
                # cannot acquire decryption key for these streams
                self.report_warning('This is a premium-only stream')
            # API values override those collected from the web page
            info.update(traverse_obj(api_response, {
                'series': ('series', 'title'),
                'season': ('season', 'name'),
                'season_number': ('season', 'sequence'),
                'episode_number': ('episode', 'number'),
            }))
            if not title:
                title = traverse_obj(api_response, ('episode', 'title'))
            if not description:
                description = traverse_obj(api_response, ('episode', 'content'))

            m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
        elif video_type == 'slots':
            api_response = self._download_json(
                f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
                note='Checking playability',
                headers=headers)
            if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
                self.report_warning('This is a premium-only stream')

            m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
        else:
            raise ExtractorError('Unreachable')

        if is_live:
            self.report_warning("This is a livestream; yt-dlp doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV")
            self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4', live=is_live)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'is_live': is_live,
        })
        return info
429
430
class AbemaTVTitleIE(AbemaTVBaseIE):
    """Playlist extractor for AbemaTV series ('video/title') pages."""
    _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
    # number of programs requested per API page
    _PAGE_SIZE = 25

    _TESTS = [{
        'url': 'https://abema.tv/video/title/90-1597',
        'info_dict': {
            'id': '90-1597',
            'title': 'シャッフルアイランド',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://abema.tv/video/title/193-132',
        'info_dict': {
            'id': '193-132',
            'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://abema.tv/video/title/25-102',
        'info_dict': {
            'id': '25-102',
            'title': 'ソードアート・オンライン アリシゼーション',
        },
        'playlist_mincount': 24,
    }]

    def _fetch_page(self, playlist_id, series_version, page):
        """Yield a URL result for every program id on the zero-based *page* of the series."""
        page_size = self._PAGE_SIZE
        page_query = {
            'seriesVersion': series_version,
            'offset': str(page * page_size),
            'order': 'seq',
            'limit': str(page_size),
        }
        response = self._call_api(
            f'v1/video/series/{playlist_id}/programs', playlist_id,
            note=f'Downloading page {page + 1}',
            query=page_query)
        for program_id in traverse_obj(response, ('programs', ..., 'id')):
            yield self.url_result(f'https://abema.tv/video/episode/{program_id}')

    def _entries(self, playlist_id, series_version):
        """Return an OnDemandPagedList that fetches the series' pages via _fetch_page."""
        page_fetcher = functools.partial(self._fetch_page, playlist_id, series_version)
        return OnDemandPagedList(page_fetcher, self._PAGE_SIZE)

    def _real_extract(self, url):
        """Build the playlist result for the series referenced by *url*."""
        playlist_id = self._match_id(url)
        series_info = self._call_api(f'v1/video/series/{playlist_id}', playlist_id)
        entries = self._entries(playlist_id, series_info['version'])

        return self.playlist_result(
            entries,
            playlist_id=playlist_id,
            playlist_title=series_info.get('title'),
            playlist_description=series_info.get('content'))