[ie/crunchyroll] Fix stream extraction (#10005)

[yt-dlp.git] / yt_dlp / extractor / nhk.py
diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py

index cc3c791741a35fed907429b85cfec508eb0ef8a1..8bb017a73210de8736131f2b004af1630fd26205 100644 (file)
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -8,7 +8,9 @@
      int_or_none,
      join_nonempty,
      parse_duration,
+    remove_end,
      traverse_obj,
+    try_call,
      unescapeHTML,
      unified_timestamp,
      url_or_none,
@@ -18,8 +20,7 @@
  
  class NhkBaseIE(InfoExtractor):
      _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
-    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
-    _TYPE_REGEX = r'/(?P<type>video|audio)/'
+    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/'
  
      def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
          return self._download_json(
@@ -82,7 +83,7 @@ def _extract_stream_info(self, vod_id):
      def _extract_episode_info(self, url, episode=None):
          fetch_episode = episode is None
          lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id')
-        is_video = m_type == 'video'
+        is_video = m_type != 'audio'
  
          if is_video:
              episode_id = episode_id[:4] + '-' + episode_id[4:]
@@ -137,9 +138,10 @@ def get_clean_field(key):
  
          else:
              if fetch_episode:
-                audio_path = episode['audio']['audio']
+                # From https://www3.nhk.or.jp/nhkworld/common/player/radio/inline/rod.html
+                audio_path = remove_end(episode['audio']['audio'], '.m4a')
                  info['formats'] = self._extract_m3u8_formats(
-                    'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
+                    f'{urljoin("https://vod-stream.nhk.jp", audio_path)}/index.m3u8',
                      episode_id, 'm4a', entry_protocol='m3u8_native',
                      m3u8_id='hls', fatal=False)
                  for f in info['formats']:
@@ -154,9 +156,11 @@ def get_clean_field(key):
  
  
  class NhkVodIE(NhkBaseIE):
-    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg
-    _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>video)/(?P<id>[0-9a-z]+)',
-                  rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[0-9a-z]+)']
+    _VALID_URL = [
+        rf'{NhkBaseIE._BASE_URL_REGEX}shows/(?:(?P<type>video)/)?(?P<id>\d{{4}}[\da-z]\d+)/?(?:$|[?#])',
+        rf'{NhkBaseIE._BASE_URL_REGEX}(?:ondemand|shows)/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[\da-z]+)',
+        rf'{NhkBaseIE._BASE_URL_REGEX}ondemand/(?P<type>video)/(?P<id>\d{{4}}[\da-z]\d+)',  # deprecated
+    ]
      # Content available only for a limited period of time. Visit
      # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
      _TESTS = [{
@@ -166,17 +170,16 @@ class NhkVodIE(NhkBaseIE):
              'ext': 'mp4',
              'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead',
              'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6',
-            'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463',
+            'thumbnail': r're:https://.+/.+\.jpg',
              'episode': 'The Tohoku Shinkansen: Full Speed Ahead',
              'series': 'Japan Railway Journal',
-            'modified_timestamp': 1694243656,
+            'modified_timestamp': 1707217907,
              'timestamp': 1681428600,
              'release_timestamp': 1693883728,
              'duration': 1679,
              'upload_date': '20230413',
-            'modified_date': '20230909',
+            'modified_date': '20240206',
              'release_date': '20230905',
-
          },
      }, {
          # video clip
@@ -187,15 +190,15 @@ class NhkVodIE(NhkBaseIE):
              'ext': 'mp4',
              'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU',
              'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
-            'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed',
+            'thumbnail': r're:https://.+/.+\.jpg',
              'series': 'Dining with the Chef',
              'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
              'duration': 148,
              'upload_date': '20190816',
              'release_date': '20230902',
              'release_timestamp': 1693619292,
-            'modified_timestamp': 1694168033,
-            'modified_date': '20230908',
+            'modified_timestamp': 1707217907,
+            'modified_date': '20240206',
              'timestamp': 1565997540,
          },
      }, {
@@ -207,7 +210,7 @@ class NhkVodIE(NhkBaseIE):
              'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines',
              'series': 'Living in Japan',
              'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab',
-            'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545',
+            'thumbnail': r're:https://.+/.+\.jpg',
              'episode': 'Tips for Travelers to Japan / Ramen Vending Machines'
          },
      }, {
@@ -244,7 +247,7 @@ class NhkVodIE(NhkBaseIE):
              'title': 'おはよう日本（7時台） - 10月8日放送',
              'series': 'おはよう日本（7時台）',
              'episode': '10月8日放送',
-            'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4',
+            'thumbnail': r're:https://.+/.+\.jpg',
              'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0',
          },
          'skip': 'expires 2023-10-15',
@@ -254,17 +257,100 @@ class NhkVodIE(NhkBaseIE):
          'info_dict': {
              'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552',
              'ext': 'mp4',
-            'title': 'Barakan Discovers AMAMI OSHIMA: Isson\'s Treasure Island',
+            'title': 'Barakan Discovers - AMAMI OSHIMA: Isson\'s Treasure Isla',
              'description': 'md5:5db620c46a0698451cc59add8816b797',
-            'thumbnail': 'md5:67d9ff28009ba379bfa85ad1aaa0e2bd',
+            'thumbnail': r're:https://.+/.+\.jpg',
              'release_date': '20230905',
              'timestamp': 1690103400,
              'duration': 2939,
              'release_timestamp': 1693898699,
-            'modified_timestamp': 1698057495,
-            'modified_date': '20231023',
              'upload_date': '20230723',
+            'modified_timestamp': 1707217907,
+            'modified_date': '20240206',
+            'episode': 'AMAMI OSHIMA: Isson\'s Treasure Isla',
+            'series': 'Barakan Discovers',
+        },
+    }, {
+        # /ondemand/video/ url with alphabetical character in 5th position of id
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a07/',
+        'info_dict': {
+            'id': 'nw_c_en_9999-a07',
+            'ext': 'mp4',
+            'episode': 'Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]',
+            'series': 'Mini-Dramas on SDGs',
+            'modified_date': '20240206',
+            'title': 'Mini-Dramas on SDGs - Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]',
+            'description': 'md5:3f9dcb4db22fceb675d90448a040d3f6',
+            'timestamp': 1621962360,
+            'duration': 189,
+            'release_date': '20230903',
+            'modified_timestamp': 1707217907,
+            'upload_date': '20210525',
+            'thumbnail': r're:https://.+/.+\.jpg',
+            'release_timestamp': 1693713487,
+        },
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999d17/',
+        'info_dict': {
+            'id': 'nw_c_en_9999-d17',
+            'ext': 'mp4',
+            'title': 'Flowers of snow blossom - The 72 Pentads of Yamato',
+            'description': 'Today’s focus: Snow',
+            'release_timestamp': 1693792402,
+            'release_date': '20230904',
+            'upload_date': '20220128',
+            'timestamp': 1643370960,
+            'thumbnail': r're:https://.+/.+\.jpg',
+            'duration': 136,
+            'series': '',
+            'modified_date': '20240206',
+            'modified_timestamp': 1707217907,
+        },
+    }, {
+        # new /shows/ url format
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/2032307/',
+        'info_dict': {
+            'id': 'nw_vod_v_en_2032_307_20240321113000_01_1710990282',
+            'ext': 'mp4',
+            'title': 'Japanology Plus - 20th Anniversary Special Part 1',
+            'description': 'md5:817d41fc8e54339ad2a916161ea24faf',
+            'episode': '20th Anniversary Special Part 1',
+            'series': 'Japanology Plus',
+            'thumbnail': r're:https://.+/.+\.jpg',
+            'duration': 1680,
+            'timestamp': 1711020600,
+            'upload_date': '20240321',
+            'release_timestamp': 1711022683,
+            'release_date': '20240321',
+            'modified_timestamp': 1711031012,
+            'modified_date': '20240321',
          },
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/3020025/',
+        'info_dict': {
+            'id': 'nw_vod_v_en_3020_025_20230325144000_01_1679723944',
+            'ext': 'mp4',
+            'title': '100 Ideas to Save the World - Working Styles Evolve',
+            'description': 'md5:9e6c7778eaaf4f7b4af83569649f84d9',
+            'episode': 'Working Styles Evolve',
+            'series': '100 Ideas to Save the World',
+            'thumbnail': r're:https://.+/.+\.jpg',
+            'duration': 899,
+            'upload_date': '20230325',
+            'timestamp': 1679755200,
+            'release_date': '20230905',
+            'release_timestamp': 1693880540,
+            'modified_date': '20240206',
+            'modified_timestamp': 1707217907,
+        },
+    }, {
+        # new /shows/audio/ url format
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/livinginjapan-20231001-1/',
+        'only_matching': True,
+    }, {
+        # valid url even if can't be found in wild; support needed for clip entries extraction
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/9999o80/',
+        'only_matching': True,
      }]
  
      def _real_extract(self, url):
@@ -272,18 +358,21 @@ def _real_extract(self, url):
  
  
  class NhkVodProgramIE(NhkBaseIE):
-    _VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P<id>\w+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?'
+    _VALID_URL = rf'''(?x)
+        {NhkBaseIE._BASE_URL_REGEX}(?:shows|tv)/
+        (?:(?P<type>audio)/programs/)?(?P<id>\w+)/?
+        (?:\?(?:[^#]+&)?type=(?P<episode_type>clip|(?:radio|tv)Episode))?'''
      _TESTS = [{
          # video program episodes
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo',
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/sumo/',
          'info_dict': {
              'id': 'sumo',
              'title': 'GRAND SUMO Highlights',
              'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf',
          },
-        'playlist_mincount': 0,
+        'playlist_mincount': 1,
      }, {
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/',
          'info_dict': {
              'id': 'japanrailway',
              'title': 'Japan Railway Journal',
@@ -292,40 +381,68 @@ class NhkVodProgramIE(NhkBaseIE):
          'playlist_mincount': 12,
      }, {
          # video program clips
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/?type=clip',
          'info_dict': {
              'id': 'japanrailway',
              'title': 'Japan Railway Journal',
              'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f',
          },
-        'playlist_mincount': 5,
-    }, {
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
-        'only_matching': True,
+        'playlist_mincount': 12,
      }, {
          # audio program
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/programs/livinginjapan/',
+        'info_dict': {
+            'id': 'livinginjapan',
+            'title': 'Living in Japan',
+            'description': 'md5:665bb36ec2a12c5a7f598ee713fc2b54',
+        },
+        'playlist_mincount': 12,
+    }, {
+        # /tv/ program url
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/tv/designtalksplus/',
+        'info_dict': {
+            'id': 'designtalksplus',
+            'title': 'DESIGN TALKS plus',
+            'description': 'md5:47b3b3a9f10d4ac7b33b53b70a7d2837',
+        },
+        'playlist_mincount': 20,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/10yearshayaomiyazaki/',
          'only_matching': True,
      }]
  
+    @classmethod
+    def suitable(cls, url):
+        return False if NhkVodIE.suitable(url) else super().suitable(url)
+
+    def _extract_meta_from_class_elements(self, class_values, html):
+        for class_value in class_values:
+            if value := clean_html(get_element_by_class(class_value, html)):
+                return value
+
      def _real_extract(self, url):
          lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type')
          episodes = self._call_api(
-            program_id, lang, m_type == 'video', False, episode_type == 'clip')
+            program_id, lang, m_type != 'audio', False, episode_type == 'clip')
  
-        entries = []
-        for episode in episodes:
-            episode_path = episode.get('url')
-            if not episode_path:
-                continue
-            entries.append(self._extract_episode_info(
-                urljoin(url, episode_path), episode))
+        def entries():
+            for episode in episodes:
+                if episode_path := episode.get('url'):
+                    yield self._extract_episode_info(urljoin(url, episode_path), episode)
  
          html = self._download_webpage(url, program_id)
-        program_title = clean_html(get_element_by_class('p-programDetail__title', html))
-        program_description = clean_html(get_element_by_class('p-programDetail__text', html))
+        program_title = self._extract_meta_from_class_elements([
+            'p-programDetail__title',  # /ondemand/program/
+            'pProgramHero__logoText',  # /shows/
+            'tAudioProgramMain__title',  # /shows/audio/programs/
+            'p-program-name'], html)  # /tv/
+        program_description = self._extract_meta_from_class_elements([
+            'p-programDetail__text',  # /ondemand/program/
+            'pProgramHero__description',  # /shows/
+            'tAudioProgramMain__info',  # /shows/audio/programs/
+            'p-program-description'], html)  # /tv/
  
-        return self.playlist_result(entries, program_id, program_title, program_description)
+        return self.playlist_result(entries(), program_id, program_title, program_description)
  
  
  class NhkForSchoolBangumiIE(InfoExtractor):
@@ -473,22 +590,21 @@ class NhkRadiruIE(InfoExtractor):
      IE_DESC = 'NHK らじる (Radiru/Rajiru)'
      _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
      _TESTS = [{
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
-        'skip': 'Episode expired on 2023-04-16',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210',
+        'skip': 'Episode expired on 2024-02-24',
          'info_dict': {
-            'channel': 'NHK-FM',
-            'uploader': 'NHK-FM',
-            'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
+            'title': 'ジャズ・トゥナイト　シリーズＪＡＺＺジャイアンツ　５６　ジョニー・ホッジス',
+            'id': '0449_01_3926210',
              'ext': 'm4a',
-            'id': '0449_01_3853544',
              'series': 'ジャズ・トゥナイト',
+            'uploader': 'NHK-FM',
+            'channel': 'NHK-FM',
              'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
-            'timestamp': 1680969600,
-            'title': 'ジャズ・トゥナイト　ＮＥＷジャズ特集',
-            'upload_date': '20230408',
-            'release_timestamp': 1680962400,
-            'release_date': '20230408',
-            'was_live': True,
+            'release_date': '20240217',
+            'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811',
+            'timestamp': 1708185600,
+            'release_timestamp': 1708178400,
+            'upload_date': '20240217',
          },
      }, {
          # playlist, airs every weekday so it should _hopefully_ be okay forever
@@ -519,7 +635,8 @@ class NhkRadiruIE(InfoExtractor):
              'series': 'らじる文庫 by ラジオ深夜便 ',
              'release_timestamp': 1481126700,
              'upload_date': '20211101',
-        }
+        },
+        'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'],
      }, {
          # news
          'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
@@ -539,9 +656,28 @@ class NhkRadiruIE(InfoExtractor):
          },
      }]
  
+    _API_URL_TMPL = None
+
+    def _extract_extended_description(self, episode_id, episode):
+        service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')}))
+        aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str}))
+        detail_url = try_call(
+            lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3))
+        if not detail_url:
+            return
+
+        full_meta = traverse_obj(
+            self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False),
+            ('list', service, 0, {dict})) or {}
+        return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta)
+
      def _extract_episode_info(self, headline, programme_id, series_meta):
          episode_id = f'{programme_id}_{headline["headline_id"]}'
          episode = traverse_obj(headline, ('file_list', 0, {dict}))
+        description = self._extract_extended_description(episode_id, episode)
+        if not description:
+            self.report_warning('Failed to get extended description, falling back to summary')
+            description = traverse_obj(episode, ('file_title_sub', {str}))
  
          return {
              **series_meta,
@@ -551,14 +687,21 @@ def _extract_episode_info(self, headline, programme_id, series_meta):
              'was_live': True,
              'series': series_meta.get('title'),
              'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
+            'description': description,
              **traverse_obj(episode, {
                  'title': 'file_title',
-                'description': 'file_title_sub',
                  'timestamp': ('open_time', {unified_timestamp}),
                  'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
              }),
          }
  
+    def _real_initialize(self):
+        if self._API_URL_TMPL:
+            return
+        api_config = self._download_xml(
+            'https://www.nhk.or.jp/radio/config/config_web.xml', None, 'Downloading API config', fatal=False)
+        NhkRadiruIE._API_URL_TMPL = try_call(lambda: f'https:{api_config.find(".//url_program_detail").text}')
+
      def _real_extract(self, url):
          site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
          programme_id = f'{site_id}_{corner_id}'
@@ -665,7 +808,7 @@ def _real_extract(self, url):
  
          noa_info = self._download_json(
              f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text),
-            station, note=f'Downloading {area} station metadata')
+            station, note=f'Downloading {area} station metadata', fatal=False)
          present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present'))
  
          return {