[extractor] Deprecate `_sort_formats`

[yt-dlp.git] / yt_dlp / extractor / nhk.py
diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py

index 626c6379b3bb2fa7a81a1fb5ae754ce748b381e3..59702b247ee57cca283b1635f656d00e1804f15b 100644 (file)
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
  import re
  
  from .common import InfoExtractor
@@ -13,7 +11,7 @@
  
  
  class NhkBaseIE(InfoExtractor):
-    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'
+    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
      _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
      _TYPE_REGEX = r'/(?P<type>video|audio)/'
  
@@ -29,7 +27,7 @@ def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
      def _extract_episode_info(self, url, episode=None):
          fetch_episode = episode is None
          lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
-        if episode_id.isdigit():
+        if len(episode_id) == 7:
              episode_id = episode_id[:4] + '-' + episode_id[4:]
  
          is_video = m_type == 'video'
@@ -80,7 +78,6 @@ def get_clean_field(key):
                      m3u8_id='hls', fatal=False)
                  for f in info['formats']:
                      f['language'] = lang
-                self._sort_formats(info['formats'])
              else:
                  info.update({
                      '_type': 'url_transparent',
@@ -91,7 +88,8 @@ def get_clean_field(key):
  
  
  class NhkVodIE(NhkBaseIE):
-    _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
+    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], e.g. 9999a34
+    _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
      # Content available only for a limited period of time. Visit
      # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
      _TESTS = [{
@@ -131,6 +129,19 @@ class NhkVodIE(NhkBaseIE):
      }, {
          'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
          'only_matching': True,
+    }, {
+        # video, alphabetic character in ID #29670
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
+        'only_matching': True,
+        'info_dict': {
+            'id': 'qfjay6cg',
+            'ext': 'mp4',
+            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
+            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
+            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
+            'upload_date': '20210615',
+            'timestamp': 1623722008,
+        }
      }]
  
      def _real_extract(self, url):
@@ -228,7 +239,6 @@ def _real_extract(self, url):
          formats = self._extract_m3u8_formats(
              f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
              video_id, ext='mp4', m3u8_id='hls')
-        self._sort_formats(formats)
  
          duration = parse_duration(base_values.get('r_duration'))
  
@@ -309,7 +319,8 @@ def _real_extract(self, url):
  
          webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
  
-        title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは？\s*</h3>', webpage, 'title', fatal=False)
+        title = (self._generic_title('', webpage)
+                 or self._html_search_regex(r'<h3>([^<]+?)とは？\s*</h3>', webpage, 'title', fatal=False))
          title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
          description = self._html_search_regex(
              r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',