[ie/crunchyroll] Fix extractor (#9615)

[yt-dlp.git] / yt_dlp / extractor / crunchyroll.py
diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py

index d35e9995abcc6bcc7e0a87448c5505720e0c18c5..118b575ab28c9ccf56582fc31ccab0217a5f4d35 100644 (file)
--- a/yt_dlp/extractor/crunchyroll.py
+++ b/yt_dlp/extractor/crunchyroll.py
@@ -1,4 +1,5 @@
  import base64
+import uuid
  
  from .common import InfoExtractor
  from ..networking.exceptions import HTTPError
@@ -7,12 +8,11 @@
      float_or_none,
      format_field,
      int_or_none,
-    join_nonempty,
+    jwt_decode_hs256,
      parse_age_limit,
      parse_count,
      parse_iso8601,
      qualities,
-    remove_start,
      time_seconds,
      traverse_obj,
      url_or_none,
@@ -27,6 +27,7 @@ class CrunchyrollBaseIE(InfoExtractor):
      _AUTH_HEADERS = None
      _API_ENDPOINT = None
      _BASIC_AUTH = None
+    _IS_PREMIUM = None
      _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q')
      _LOCALE_LOOKUP = {
          'ar': 'ar-SA',
@@ -84,11 +85,16 @@ def _update_auth(self):
              self.write_debug(f'Using cxApiParam={cx_api_param}')
              CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()
  
-        grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id'
+        auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH}
+        if self.is_logged_in:
+            grant_type = 'etp_rt_cookie'
+        else:
+            grant_type = 'client_id'
+            auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
          try:
              auth_response = self._download_json(
                  f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
-                headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode())
+                headers=auth_headers, data=f'grant_type={grant_type}'.encode())
          except ExtractorError as error:
              if isinstance(error.cause, HTTPError) and error.cause.status == 403:
                  raise ExtractorError(
@@ -97,6 +103,7 @@ def _update_auth(self):
                      'and your browser\'s User-Agent (with --user-agent)', expected=True)
              raise
  
+        CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
          CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']}
          CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10)
  
@@ -135,62 +142,72 @@ def _call_api(self, path, internal_id, lang, note='api', query={}):
              raise ExtractorError(f'Unexpected response when downloading {note} JSON')
          return result
  
-    def _extract_formats(self, stream_response, display_id=None):
-        requested_formats = self._configuration_arg('format') or ['vo_adaptive_hls']
-        available_formats = {}
-        for stream_type, streams in traverse_obj(
-                stream_response, (('streams', ('data', 0)), {dict.items}, ...)):
-            if stream_type not in requested_formats:
+    def _extract_chapters(self, internal_id):
+        # if no skip events are available, a 403 xml error is returned
+        skip_events = self._download_json(
+            f'https://static.crunchyroll.com/skip-events/production/{internal_id}.json',
+            internal_id, note='Downloading chapter info', fatal=False, errnote=False)
+        if not skip_events:
+            return None
+
+        chapters = []
+        for event in ('recap', 'intro', 'credits', 'preview'):
+            start = traverse_obj(skip_events, (event, 'start', {float_or_none}))
+            end = traverse_obj(skip_events, (event, 'end', {float_or_none}))
+            # some chapters have no start and/or ending time, they will just be ignored
+            if start is None or end is None:
                  continue
-            for stream in traverse_obj(streams, lambda _, v: v['url']):
-                hardsub_lang = stream.get('hardsub_locale') or ''
-                format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
-                available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url'])
+            chapters.append({'title': event.capitalize(), 'start_time': start, 'end_time': end})
+
+        return chapters
+
+    def _extract_stream(self, identifier, display_id=None):
+        if not display_id:
+            display_id = identifier
+
+        self._update_auth()
+        stream_response = self._download_json(
+            f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play',
+            display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS)
+
+        available_formats = {'': ('', '', stream_response['url'])}
+        for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])):
+            available_formats[hardsub_lang] = (f'hardsub-{hardsub_lang}', hardsub_lang, stream['url'])
  
          requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
-        if '' in available_formats and 'all' not in requested_hardsubs:
+        hardsub_langs = [lang for lang in available_formats if lang]
+        if hardsub_langs and 'all' not in requested_hardsubs:
              full_format_langs = set(requested_hardsubs)
+            self.to_screen(f'Available hardsub languages: {", ".join(hardsub_langs)}')
              self.to_screen(
-                'To get all formats of a hardsub language, use '
+                'To extract formats of a hardsub language, use '
                  '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". '
                  'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info',
                  only_once=True)
          else:
              full_format_langs = set(map(str.lower, available_formats))
  
-        audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False)
+        audio_locale = traverse_obj(stream_response, ('audioLocale', {str}))
          hardsub_preference = qualities(requested_hardsubs[::-1])
-        formats = []
-        for stream_type, format_id, hardsub_lang, stream_url in available_formats.values():
-            if stream_type.endswith('hls'):
-                if hardsub_lang.lower() in full_format_langs:
-                    adaptive_formats = self._extract_m3u8_formats(
-                        stream_url, display_id, 'mp4', m3u8_id=format_id,
-                        fatal=False, note=f'Downloading {format_id} HLS manifest')
-                else:
-                    adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),)
-            elif stream_type.endswith('dash'):
-                adaptive_formats = self._extract_mpd_formats(
-                    stream_url, display_id, mpd_id=format_id,
-                    fatal=False, note=f'Downloading {format_id} MPD manifest')
+        formats, subtitles = [], {}
+        for format_id, hardsub_lang, stream_url in available_formats.values():
+            if hardsub_lang.lower() in full_format_langs:
+                adaptive_formats, dash_subs = self._extract_mpd_formats_and_subtitles(
+                    stream_url, display_id, mpd_id=format_id, headers=CrunchyrollBaseIE._AUTH_HEADERS,
+                    fatal=False, note=f'Downloading {f"{format_id} " if hardsub_lang else ""}MPD manifest')
+                self._merge_subtitles(dash_subs, target=subtitles)
              else:
-                self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True)
-                continue
+                continue  # XXX: Update this if/when meta mpd formats are working
              for f in adaptive_formats:
                  if f.get('acodec') != 'none':
                      f['language'] = audio_locale
                  f['quality'] = hardsub_preference(hardsub_lang.lower())
              formats.extend(adaptive_formats)
  
-        return formats
-
-    def _extract_subtitles(self, data):
-        subtitles = {}
+        for locale, subtitle in traverse_obj(stream_response, (('subtitles', 'captions'), {dict.items}, ...)):
+            subtitles.setdefault(locale, []).append(traverse_obj(subtitle, {'url': 'url', 'ext': 'format'}))
  
-        for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)):
-            subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})]
-
-        return subtitles
+        return formats, subtitles
  
  
  class CrunchyrollCmsBaseIE(CrunchyrollBaseIE):
@@ -245,7 +262,11 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
              'like_count': int,
              'dislike_count': int,
          },
-        'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'},
+        'params': {
+            'skip_download': 'm3u8',
+            'extractor_args': {'crunchyrollbeta': {'hardsub': ['de-DE']}},
+            'format': 'bv[format_id~=hardsub]',
+        },
      }, {
          # Premium only
          'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR',
@@ -306,6 +327,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
              'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
          },
          'params': {'skip_download': 'm3u8'},
+        'skip': 'no longer exists',
      }, {
          'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6',
          'info_dict': {
@@ -359,31 +381,15 @@ def entries():
          else:
              raise ExtractorError(f'Unknown object type {object_type}')
  
-        # There might be multiple audio languages for one object (`<object>_metadata.versions`),
-        # so we need to get the id from `streams_link` instead or we dont know which language to choose
-        streams_link = response.get('streams_link')
-        if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
+        if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
              message = f'This {object_type} is for premium members only'
              if self.is_logged_in:
                  raise ExtractorError(message, expected=True)
              self.raise_login_required(message)
  
-        # We need go from unsigned to signed api to avoid getting soft banned
-        stream_response = self._call_cms_api_signed(remove_start(
-            streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info')
-        result['formats'] = self._extract_formats(stream_response, internal_id)
-        result['subtitles'] = self._extract_subtitles(stream_response)
+        result['formats'], result['subtitles'] = self._extract_stream(internal_id)
  
-        # if no intro chapter is available, a 403 without usable data is returned
-        intro_chapter = self._download_json(
-            f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json',
-            internal_id, note='Downloading chapter info', fatal=False, errnote=False)
-        if isinstance(intro_chapter, dict):
-            result['chapters'] = [{
-                'title': 'Intro',
-                'start_time': float_or_none(intro_chapter.get('startTime')),
-                'end_time': float_or_none(intro_chapter.get('endTime')),
-            }]
+        result['chapters'] = self._extract_chapters(internal_id)
  
          def calculate_count(item):
              return parse_count(''.join((item['displayed'], item.get('unit') or '')))
@@ -512,7 +518,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
              'display_id': 'egaono-hana',
              'title': 'Egaono Hana',
              'track': 'Egaono Hana',
-            'artist': 'Goose house',
+            'artists': ['Goose house'],
              'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
              'genres': ['J-Pop'],
          },
@@ -525,11 +531,12 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
              'display_id': 'crossing-field',
              'title': 'Crossing Field',
              'track': 'Crossing Field',
-            'artist': 'LiSA',
+            'artists': ['LiSA'],
              'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
              'genres': ['Anime'],
          },
          'params': {'skip_download': 'm3u8'},
+        'skip': 'no longer exists',
      }, {
          'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135',
          'info_dict': {
@@ -538,7 +545,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
              'display_id': 'live-is-smile-always-364joker-at-yokohama-arena',
              'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
              'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
-            'artist': 'LiSA',
+            'artists': ['LiSA'],
              'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
              'description': 'md5:747444e7e6300907b7a43f0a0503072e',
              'genres': ['J-Pop'],
@@ -566,16 +573,14 @@ def _real_extract(self, url):
          if not response:
              raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
  
-        streams_link = response.get('streams_link')
-        if not streams_link and response.get('isPremiumOnly'):
+        if not self._IS_PREMIUM and response.get('isPremiumOnly'):
              message = f'This {response.get("type") or "media"} is for premium members only'
              if self.is_logged_in:
                  raise ExtractorError(message, expected=True)
              self.raise_login_required(message)
  
          result = self._transform_music_response(response)
-        stream_response = self._call_api(streams_link, internal_id, lang, 'stream info')
-        result['formats'] = self._extract_formats(stream_response, internal_id)
+        result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
  
          return result
  
@@ -587,7 +592,7 @@ def _transform_music_response(data):
                  'display_id': 'slug',
                  'title': 'title',
                  'track': 'title',
-                'artist': ('artist', 'name'),
+                'artists': ('artist', 'name', all),
                  'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}),
                  'thumbnails': ('images', ..., ..., {
                      'url': ('source', {url_or_none}),
@@ -611,7 +616,7 @@ class CrunchyrollArtistIE(CrunchyrollBaseIE):
          'info_dict': {
              'id': 'MA179CB50D',
              'title': 'LiSA',
-            'genres': ['J-Pop', 'Anime', 'Rock'],
+            'genres': ['Anime', 'J-Pop', 'Rock'],
              'description': 'md5:16d87de61a55c3f7d6c454b73285938e',
          },
          'playlist_mincount': 83,