yt_dlp/extractor/crunchyroll.py

   1 import base64
   2 import urllib.parse
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     ExtractorError,
   7     float_or_none,
   8     format_field,
   9     join_nonempty,
  10     parse_iso8601,
  11     qualities,
  12     traverse_obj,
  13     try_get,
  14 )
  15
  16
  17 class CrunchyrollBaseIE(InfoExtractor):
  18     _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login'
  19     _API_BASE = 'https://api.crunchyroll.com'
  20     _NETRC_MACHINE = 'crunchyroll'
  21     params = None
  22
  23     def _perform_login(self, username, password):
  24         if self._get_cookies(self._LOGIN_URL).get('etp_rt'):
  25             return
  26
  27         upsell_response = self._download_json(
  28             f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
  29             query={
  30                 'sess_id': 1,
  31                 'device_id': 'whatvalueshouldbeforweb',
  32                 'device_type': 'com.crunchyroll.static',
  33                 'access_token': 'giKq5eY27ny3cqz',
  34                 'referer': self._LOGIN_URL
  35             })
  36         if upsell_response['code'] != 'ok':
  37             raise ExtractorError('Could not get session id')
  38         session_id = upsell_response['data']['session_id']
  39
  40         login_response = self._download_json(
  41             f'{self._API_BASE}/login.1.json', None, 'Logging in',
  42             data=urllib.parse.urlencode({
  43                 'account': username,
  44                 'password': password,
  45                 'session_id': session_id
  46             }).encode('ascii'))
  47         if login_response['code'] != 'ok':
  48             raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
  49         if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
  50             raise ExtractorError('Login succeeded but did not set etp_rt cookie')
  51
  52     def _get_embedded_json(self, webpage, display_id):
  53         initial_state = self._parse_json(self._search_regex(
  54             r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id)
  55         app_config = self._parse_json(self._search_regex(
  56             r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id)
  57         return initial_state, app_config
  58
  59     def _get_params(self, lang):
  60         if not CrunchyrollBaseIE.params:
  61             if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'):
  62                 grant_type, key = 'etp_rt_cookie', 'accountAuthClientId'
  63             else:
  64                 grant_type, key = 'client_id', 'anonClientId'
  65
  66             initial_state, app_config = self._get_embedded_json(self._download_webpage(
  67                 f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None)
  68             api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com')
  69
  70             auth_response = self._download_json(
  71                 f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
  72                 headers={
  73                     'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii')
  74                 }, data=f'grant_type={grant_type}'.encode('ascii'))
  75             policy_response = self._download_json(
  76                 f'{api_domain}/index/v2', None, note='Retrieving signed policy',
  77                 headers={
  78                     'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
  79                 })
  80             cms = policy_response.get('cms_web')
  81             bucket = cms['bucket']
  82             params = {
  83                 'Policy': cms['policy'],
  84                 'Signature': cms['signature'],
  85                 'Key-Pair-Id': cms['key_pair_id']
  86             }
  87             locale = traverse_obj(initial_state, ('localization', 'locale'))
  88             if locale:
  89                 params['locale'] = locale
  90             CrunchyrollBaseIE.params = (api_domain, bucket, params)
  91         return CrunchyrollBaseIE.params
  92
  93
  94 class CrunchyrollBetaIE(CrunchyrollBaseIE):
  95     IE_NAME = 'crunchyroll'
  96     _VALID_URL = r'''(?x)
  97         https?://(?:beta|www)\.crunchyroll\.com/
  98         (?P<lang>(?:\w{2}(?:-\w{2})?/)?)
  99         watch/(?P<id>\w+)
 100         (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)'''
 101     _TESTS = [{
 102         'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future',
 103         'info_dict': {
 104             'id': 'GY2P1Q98Y',
 105             'ext': 'mp4',
 106             'duration': 1380.241,
 107             'timestamp': 1459632600,
 108             'description': 'md5:a022fbec4fbb023d43631032c91ed64b',
 109             'title': 'World Trigger Episode 73 – To the Future',
 110             'upload_date': '20160402',
 111             'series': 'World Trigger',
 112             'series_id': 'GR757DMKY',
 113             'season': 'World Trigger',
 114             'season_id': 'GR9P39NJ6',
 115             'season_number': 1,
 116             'episode': 'To the Future',
 117             'episode_number': 73,
 118             'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$',
 119         },
 120         'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'},
 121     }, {
 122         'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR',
 123         'info_dict': {
 124             'id': 'GYE5WKQGR',
 125             'ext': 'mp4',
 126             'duration': 366.459,
 127             'timestamp': 1476788400,
 128             'description': 'md5:74b67283ffddd75f6e224ca7dc031e76',
 129             'title': 'SHELTER Episode  – Porter Robinson presents Shelter the Animation',
 130             'upload_date': '20161018',
 131             'series': 'SHELTER',
 132             'series_id': 'GYGG09WWY',
 133             'season': 'SHELTER',
 134             'season_id': 'GR09MGK4R',
 135             'season_number': 1,
 136             'episode': 'Porter Robinson presents Shelter the Animation',
 137             'episode_number': 0,
 138             'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$',
 139         },
 140         'params': {'skip_download': True},
 141         'skip': 'Video is Premium only',
 142     }, {
 143         'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y',
 144         'only_matching': True,
 145     }, {
 146         'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy',
 147         'only_matching': True,
 148     }]
 149
 150     def _real_extract(self, url):
 151         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
 152         api_domain, bucket, params = self._get_params(lang)
 153
 154         episode_response = self._download_json(
 155             f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
 156             note='Retrieving episode metadata', query=params)
 157         if episode_response.get('is_premium_only') and not episode_response.get('playback'):
 158             raise ExtractorError('This video is for premium members only.', expected=True)
 159
 160         stream_response = self._download_json(
 161             f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id,
 162             note='Retrieving stream info', query=params)
 163         get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items()
 164
 165         requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
 166         hardsub_preference = qualities(requested_hardsubs[::-1])
 167         requested_formats = self._configuration_arg('format') or ['adaptive_hls']
 168
 169         available_formats = {}
 170         for stream_type, streams in get_streams('streams'):
 171             if stream_type not in requested_formats:
 172                 continue
 173             for stream in streams.values():
 174                 if not stream.get('url'):
 175                     continue
 176                 hardsub_lang = stream.get('hardsub_locale') or ''
 177                 format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
 178                 available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url'])
 179
 180         if '' in available_formats and 'all' not in requested_hardsubs:
 181             full_format_langs = set(requested_hardsubs)
 182             self.to_screen(
 183                 'To get all formats of a hardsub language, use '
 184                 '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". '
 185                 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta for more info',
 186                 only_once=True)
 187         else:
 188             full_format_langs = set(map(str.lower, available_formats))
 189
 190         formats = []
 191         for stream_type, format_id, hardsub_lang, stream_url in available_formats.values():
 192             if stream_type.endswith('hls'):
 193                 if hardsub_lang.lower() in full_format_langs:
 194                     adaptive_formats = self._extract_m3u8_formats(
 195                         stream_url, display_id, 'mp4', m3u8_id=format_id,
 196                         fatal=False, note=f'Downloading {format_id} HLS manifest')
 197                 else:
 198                     adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),)
 199             elif stream_type.endswith('dash'):
 200                 adaptive_formats = self._extract_mpd_formats(
 201                     stream_url, display_id, mpd_id=format_id,
 202                     fatal=False, note=f'Downloading {format_id} MPD manifest')
 203             else:
 204                 self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True)
 205                 continue
 206             for f in adaptive_formats:
 207                 if f.get('acodec') != 'none':
 208                     f['language'] = stream_response.get('audio_locale')
 209                 f['quality'] = hardsub_preference(hardsub_lang.lower())
 210             formats.extend(adaptive_formats)
 211         self._sort_formats(formats)
 212
 213         return {
 214             'id': internal_id,
 215             'title': '%s Episode %s – %s' % (
 216                 episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')),
 217             'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')),
 218             'duration': float_or_none(episode_response.get('duration_ms'), 1000),
 219             'timestamp': parse_iso8601(episode_response.get('upload_date')),
 220             'series': episode_response.get('series_title'),
 221             'series_id': episode_response.get('series_id'),
 222             'season': episode_response.get('season_title'),
 223             'season_id': episode_response.get('season_id'),
 224             'season_number': episode_response.get('season_number'),
 225             'episode': episode_response.get('title'),
 226             'episode_number': episode_response.get('sequence_number'),
 227             'formats': formats,
 228             'thumbnails': [{
 229                 'url': thumb.get('source'),
 230                 'width': thumb.get('width'),
 231                 'height': thumb.get('height'),
 232             } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []],
 233             'subtitles': {
 234                 lang: [{
 235                     'url': subtitle_data.get('url'),
 236                     'ext': subtitle_data.get('format')
 237                 }] for lang, subtitle_data in get_streams('subtitles')
 238             },
 239         }
 240
 241
 242 class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
 243     IE_NAME = 'crunchyroll:playlist'
 244     _VALID_URL = r'''(?x)
 245         https?://(?:beta|www)\.crunchyroll\.com/
 246         (?P<lang>(?:\w{2}(?:-\w{2})?/)?)
 247         series/(?P<id>\w+)
 248         (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)'''
 249     _TESTS = [{
 250         'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA',
 251         'info_dict': {
 252             'id': 'GY19NQ2QR',
 253             'title': 'Girl Friend BETA',
 254         },
 255         'playlist_mincount': 10,
 256     }, {
 257         'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR',
 258         'only_matching': True,
 259     }]
 260
 261     def _real_extract(self, url):
 262         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
 263         api_domain, bucket, params = self._get_params(lang)
 264
 265         series_response = self._download_json(
 266             f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id,
 267             note='Retrieving series metadata', query=params)
 268
 269         seasons_response = self._download_json(
 270             f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id,
 271             note='Retrieving season list', query=params)
 272
 273         def entries():
 274             for season in seasons_response['items']:
 275                 episodes_response = self._download_json(
 276                     f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id,
 277                     note=f'Retrieving episode list for {season.get("slug_title")}', query=params)
 278                 for episode in episodes_response['items']:
 279                     episode_id = episode['id']
 280                     episode_display_id = episode['slug_title']
 281                     yield {
 282                         '_type': 'url',
 283                         'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}',
 284                         'ie_key': CrunchyrollBetaIE.ie_key(),
 285                         'id': episode_id,
 286                         'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')),
 287                         'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')),
 288                         'duration': float_or_none(episode.get('duration_ms'), 1000),
 289                         'series': episode.get('series_title'),
 290                         'series_id': episode.get('series_id'),
 291                         'season': episode.get('season_title'),
 292                         'season_id': episode.get('season_id'),
 293                         'season_number': episode.get('season_number'),
 294                         'episode': episode.get('title'),
 295                         'episode_number': episode.get('sequence_number')
 296                     }
 297
 298         return self.playlist_result(entries(), internal_id, series_response.get('title'))