[extractor/generic] Don't return JW player without formats

[yt-dlp.git] / yt_dlp / extractor / nhk.py
diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py

index 626c6379b3bb2fa7a81a1fb5ae754ce748b381e3..60d76d1b118c4f1f42c1b160ff5fd0c38c6b9fdc 100644 (file)
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
  import re
  
  from .common import InfoExtractor
@@ -13,7 +11,7 @@
  
  
  class NhkBaseIE(InfoExtractor):
-    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'
+    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
      _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
      _TYPE_REGEX = r'/(?P<type>video|audio)/'
  
@@ -29,7 +27,7 @@ def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
      def _extract_episode_info(self, url, episode=None):
          fetch_episode = episode is None
          lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
-        if episode_id.isdigit():
+        if len(episode_id) == 7:
              episode_id = episode_id[:4] + '-' + episode_id[4:]
  
          is_video = m_type == 'video'
@@ -91,7 +89,8 @@ def get_clean_field(key):
  
  
  class NhkVodIE(NhkBaseIE):
-    _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
+    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], e.g. 9999a34
+    _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
      # Content available only for a limited period of time. Visit
      # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
      _TESTS = [{
@@ -131,6 +130,19 @@ class NhkVodIE(NhkBaseIE):
      }, {
          'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
          'only_matching': True,
+    }, {
+        # video with an alphabetic character in its ID (see #29670)
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
+        'only_matching': True,
+        'info_dict': {
+            'id': 'qfjay6cg',
+            'ext': 'mp4',
+            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
+            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
+            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
+            'upload_date': '20210615',
+            'timestamp': 1623722008,
+        }
      }]
  
      def _real_extract(self, url):
@@ -309,7 +321,9 @@ def _real_extract(self, url):
  
          webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
  
-        title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは？\s*</h3>', webpage, 'title', fatal=False)
+        title = (self._og_search_title(webpage)
+                 or self._html_extract_title(webpage)
+                 or self._html_search_regex(r'<h3>([^<]+?)とは？\s*</h3>', webpage, 'title', fatal=False))
          title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
          description = self._html_search_regex(
              r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',