jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/nhk.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / nhk.py
index 626c6379b3bb2fa7a81a1fb5ae754ce748b381e3..59702b247ee57cca283b1635f656d00e1804f15b 100644 (file)
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
@@ -13,7 +11,7 @@
 
 
 class NhkBaseIE(InfoExtractor):
-    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'
+    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
     _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
     _TYPE_REGEX = r'/(?P<type>video|audio)/'
 
@@ -29,7 +27,7 @@ def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
     def _extract_episode_info(self, url, episode=None):
         fetch_episode = episode is None
         lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
-        if episode_id.isdigit():
+        if len(episode_id) == 7:
             episode_id = episode_id[:4] + '-' + episode_id[4:]
 
         is_video = m_type == 'video'
@@ -80,7 +78,6 @@ def get_clean_field(key):
                     m3u8_id='hls', fatal=False)
                 for f in info['formats']:
                     f['language'] = lang
-                self._sort_formats(info['formats'])
             else:
                 info.update({
                     '_type': 'url_transparent',
@@ -91,7 +88,8 @@ def get_clean_field(key):
 
 
 class NhkVodIE(NhkBaseIE):
-    _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
+    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], e.g. 9999a34
+    _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
     # Content available only for a limited period of time. Visit
     # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
     _TESTS = [{
@@ -131,6 +129,19 @@ class NhkVodIE(NhkBaseIE):
     }, {
         'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
         'only_matching': True,
+    }, {
+        # video, alphabetic character in ID #29670
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
+        'only_matching': True,
+        'info_dict': {
+            'id': 'qfjay6cg',
+            'ext': 'mp4',
+            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
+            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
+            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
+            'upload_date': '20210615',
+            'timestamp': 1623722008,
+        }
     }]
 
     def _real_extract(self, url):
@@ -228,7 +239,6 @@ def _real_extract(self, url):
         formats = self._extract_m3u8_formats(
             f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
             video_id, ext='mp4', m3u8_id='hls')
-        self._sort_formats(formats)
 
         duration = parse_duration(base_values.get('r_duration'))
 
@@ -309,7 +319,8 @@ def _real_extract(self, url):
 
         webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
 
-        title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)
+        title = (self._generic_title('', webpage)
+                 or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
         title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
         description = self._html_search_regex(
             r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',