yt_dlp/extractor/crunchyroll.py

   1 import base64
   2 import urllib.parse
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     ExtractorError,
   7     float_or_none,
   8     format_field,
   9     join_nonempty,
  10     parse_iso8601,
  11     qualities,
  12     traverse_obj,
  13     try_get,
  14 )
  15
  16
  17 class CrunchyrollBaseIE(InfoExtractor):
  18     _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login'
  19     _API_BASE = 'https://api.crunchyroll.com'
  20     _NETRC_MACHINE = 'crunchyroll'
  21     params = None
  22
  23     @property
  24     def is_logged_in(self):
  25         return self._get_cookies(self._LOGIN_URL).get('etp_rt')
  26
  27     def _perform_login(self, username, password):
  28         if self.is_logged_in:
  29             return
  30
  31         upsell_response = self._download_json(
  32             f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
  33             query={
  34                 'sess_id': 1,
  35                 'device_id': 'whatvalueshouldbeforweb',
  36                 'device_type': 'com.crunchyroll.static',
  37                 'access_token': 'giKq5eY27ny3cqz',
  38                 'referer': self._LOGIN_URL
  39             })
  40         if upsell_response['code'] != 'ok':
  41             raise ExtractorError('Could not get session id')
  42         session_id = upsell_response['data']['session_id']
  43
  44         login_response = self._download_json(
  45             f'{self._API_BASE}/login.1.json', None, 'Logging in',
  46             data=urllib.parse.urlencode({
  47                 'account': username,
  48                 'password': password,
  49                 'session_id': session_id
  50             }).encode('ascii'))
  51         if login_response['code'] != 'ok':
  52             raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
  53         if not self.is_logged_in:
  54             raise ExtractorError('Login succeeded but did not set etp_rt cookie')
  55
  56     def _get_embedded_json(self, webpage, display_id):
  57         initial_state = self._parse_json(self._search_regex(
  58             r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id)
  59         app_config = self._parse_json(self._search_regex(
  60             r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id)
  61         return initial_state, app_config
  62
  63     def _get_params(self, lang):
  64         if not CrunchyrollBaseIE.params:
  65             if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'):
  66                 grant_type, key = 'etp_rt_cookie', 'accountAuthClientId'
  67             else:
  68                 grant_type, key = 'client_id', 'anonClientId'
  69
  70             initial_state, app_config = self._get_embedded_json(self._download_webpage(
  71                 f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None)
  72             api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com')
  73
  74             auth_response = self._download_json(
  75                 f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
  76                 headers={
  77                     'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii')
  78                 }, data=f'grant_type={grant_type}'.encode('ascii'))
  79             policy_response = self._download_json(
  80                 f'{api_domain}/index/v2', None, note='Retrieving signed policy',
  81                 headers={
  82                     'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
  83                 })
  84             cms = policy_response.get('cms_web')
  85             bucket = cms['bucket']
  86             params = {
  87                 'Policy': cms['policy'],
  88                 'Signature': cms['signature'],
  89                 'Key-Pair-Id': cms['key_pair_id']
  90             }
  91             locale = traverse_obj(initial_state, ('localization', 'locale'))
  92             if locale:
  93                 params['locale'] = locale
  94             CrunchyrollBaseIE.params = (api_domain, bucket, params)
  95         return CrunchyrollBaseIE.params
  96
  97
  98 class CrunchyrollBetaIE(CrunchyrollBaseIE):
  99     IE_NAME = 'crunchyroll'
 100     _VALID_URL = r'''(?x)
 101         https?://(?:beta|www)\.crunchyroll\.com/
 102         (?P<lang>(?:\w{2}(?:-\w{2})?/)?)
 103         watch/(?P<id>\w+)
 104         (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)'''
 105     _TESTS = [{
 106         'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future',
 107         'info_dict': {
 108             'id': 'GY2P1Q98Y',
 109             'ext': 'mp4',
 110             'duration': 1380.241,
 111             'timestamp': 1459632600,
 112             'description': 'md5:a022fbec4fbb023d43631032c91ed64b',
 113             'title': 'World Trigger Episode 73 – To the Future',
 114             'upload_date': '20160402',
 115             'series': 'World Trigger',
 116             'series_id': 'GR757DMKY',
 117             'season': 'World Trigger',
 118             'season_id': 'GR9P39NJ6',
 119             'season_number': 1,
 120             'episode': 'To the Future',
 121             'episode_number': 73,
 122             'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$',
 123             'chapters': 'count:2',
 124         },
 125         'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'},
 126     }, {
 127         'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR',
 128         'info_dict': {
 129             'id': 'GYE5WKQGR',
 130             'ext': 'mp4',
 131             'duration': 366.459,
 132             'timestamp': 1476788400,
 133             'description': 'md5:74b67283ffddd75f6e224ca7dc031e76',
 134             'title': 'SHELTER Episode  – Porter Robinson presents Shelter the Animation',
 135             'upload_date': '20161018',
 136             'series': 'SHELTER',
 137             'series_id': 'GYGG09WWY',
 138             'season': 'SHELTER',
 139             'season_id': 'GR09MGK4R',
 140             'season_number': 1,
 141             'episode': 'Porter Robinson presents Shelter the Animation',
 142             'episode_number': 0,
 143             'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$',
 144             'chapters': 'count:0',
 145         },
 146         'params': {'skip_download': True},
 147         'skip': 'Video is Premium only',
 148     }, {
 149         'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y',
 150         'only_matching': True,
 151     }, {
 152         'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy',
 153         'only_matching': True,
 154     }]
 155
 156     def _real_extract(self, url):
 157         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
 158         api_domain, bucket, params = self._get_params(lang)
 159
 160         episode_response = self._download_json(
 161             f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
 162             note='Retrieving episode metadata', query=params)
 163         if episode_response.get('is_premium_only') and not bucket.endswith('crunchyroll'):
 164             if self.is_logged_in:
 165                 raise ExtractorError('This video is for premium members only', expected=True)
 166             else:
 167                 self.raise_login_required('This video is for premium members only')
 168
 169         stream_response = self._download_json(
 170             f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id,
 171             note='Retrieving stream info', query=params)
 172         get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items()
 173
 174         requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
 175         hardsub_preference = qualities(requested_hardsubs[::-1])
 176         requested_formats = self._configuration_arg('format') or ['adaptive_hls']
 177
 178         available_formats = {}
 179         for stream_type, streams in get_streams('streams'):
 180             if stream_type not in requested_formats:
 181                 continue
 182             for stream in streams.values():
 183                 if not stream.get('url'):
 184                     continue
 185                 hardsub_lang = stream.get('hardsub_locale') or ''
 186                 format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
 187                 available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url'])
 188
 189         if '' in available_formats and 'all' not in requested_hardsubs:
 190             full_format_langs = set(requested_hardsubs)
 191             self.to_screen(
 192                 'To get all formats of a hardsub language, use '
 193                 '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". '
 194                 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info',
 195                 only_once=True)
 196         else:
 197             full_format_langs = set(map(str.lower, available_formats))
 198
 199         formats = []
 200         for stream_type, format_id, hardsub_lang, stream_url in available_formats.values():
 201             if stream_type.endswith('hls'):
 202                 if hardsub_lang.lower() in full_format_langs:
 203                     adaptive_formats = self._extract_m3u8_formats(
 204                         stream_url, display_id, 'mp4', m3u8_id=format_id,
 205                         fatal=False, note=f'Downloading {format_id} HLS manifest')
 206                 else:
 207                     adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),)
 208             elif stream_type.endswith('dash'):
 209                 adaptive_formats = self._extract_mpd_formats(
 210                     stream_url, display_id, mpd_id=format_id,
 211                     fatal=False, note=f'Downloading {format_id} MPD manifest')
 212             else:
 213                 self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True)
 214                 continue
 215             for f in adaptive_formats:
 216                 if f.get('acodec') != 'none':
 217                     f['language'] = stream_response.get('audio_locale')
 218                 f['quality'] = hardsub_preference(hardsub_lang.lower())
 219             formats.extend(adaptive_formats)
 220
 221         chapters = None
 222         # if no intro chapter is available, a 403 without usable data is returned
 223         intro_chapter = self._download_json(f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json',
 224                                             display_id, fatal=False, errnote=False)
 225         if isinstance(intro_chapter, dict):
 226             chapters = [{
 227                 'title': 'Intro',
 228                 'start_time': float_or_none(intro_chapter.get('startTime')),
 229                 'end_time': float_or_none(intro_chapter.get('endTime'))
 230             }]
 231
 232         return {
 233             'id': internal_id,
 234             'title': '%s Episode %s – %s' % (
 235                 episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')),
 236             'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')),
 237             'duration': float_or_none(episode_response.get('duration_ms'), 1000),
 238             'timestamp': parse_iso8601(episode_response.get('upload_date')),
 239             'series': episode_response.get('series_title'),
 240             'series_id': episode_response.get('series_id'),
 241             'season': episode_response.get('season_title'),
 242             'season_id': episode_response.get('season_id'),
 243             'season_number': episode_response.get('season_number'),
 244             'episode': episode_response.get('title'),
 245             'episode_number': episode_response.get('sequence_number'),
 246             'formats': formats,
 247             'thumbnails': [{
 248                 'url': thumb.get('source'),
 249                 'width': thumb.get('width'),
 250                 'height': thumb.get('height'),
 251             } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []],
 252             'subtitles': {
 253                 lang: [{
 254                     'url': subtitle_data.get('url'),
 255                     'ext': subtitle_data.get('format')
 256                 }] for lang, subtitle_data in get_streams('subtitles')
 257             },
 258             'chapters': chapters
 259         }
 260
 261
 262 class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
 263     IE_NAME = 'crunchyroll:playlist'
 264     _VALID_URL = r'''(?x)
 265         https?://(?:beta|www)\.crunchyroll\.com/
 266         (?P<lang>(?:\w{2}(?:-\w{2})?/)?)
 267         series/(?P<id>\w+)
 268         (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)'''
 269     _TESTS = [{
 270         'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA',
 271         'info_dict': {
 272             'id': 'GY19NQ2QR',
 273             'title': 'Girl Friend BETA',
 274         },
 275         'playlist_mincount': 10,
 276     }, {
 277         'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR',
 278         'only_matching': True,
 279     }]
 280
 281     def _real_extract(self, url):
 282         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
 283         api_domain, bucket, params = self._get_params(lang)
 284
 285         series_response = self._download_json(
 286             f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id,
 287             note='Retrieving series metadata', query=params)
 288
 289         seasons_response = self._download_json(
 290             f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id,
 291             note='Retrieving season list', query=params)
 292
 293         def entries():
 294             for season in seasons_response['items']:
 295                 episodes_response = self._download_json(
 296                     f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id,
 297                     note=f'Retrieving episode list for {season.get("slug_title")}', query=params)
 298                 for episode in episodes_response['items']:
 299                     episode_id = episode['id']
 300                     episode_display_id = episode['slug_title']
 301                     yield {
 302                         '_type': 'url',
 303                         'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}',
 304                         'ie_key': CrunchyrollBetaIE.ie_key(),
 305                         'id': episode_id,
 306                         'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')),
 307                         'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')),
 308                         'duration': float_or_none(episode.get('duration_ms'), 1000),
 309                         'series': episode.get('series_title'),
 310                         'series_id': episode.get('series_id'),
 311                         'season': episode.get('season_title'),
 312                         'season_id': episode.get('season_id'),
 313                         'season_number': episode.get('season_number'),
 314                         'episode': episode.get('title'),
 315                         'episode_number': episode.get('sequence_number'),
 316                         'language': episode.get('audio_locale'),
 317                     }
 318
 319         return self.playlist_result(entries(), internal_id, series_response.get('title'))