diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index 8bb017a73210de8736131f2b004af1630fd26205..0bd6edfcbaf816934a0c1e0ae6a8d227ed724f46 100644
@@ -4,6 +4,7 @@
 from ..utils import (
     ExtractorError,
     clean_html,
+    filter_dict,
     get_element_by_class,
     int_or_none,
     join_nonempty,
@@ -104,7 +105,7 @@ def get_clean_field(key):
             if not img_path:
                 continue
             thumbnails.append({
-                'id': '%dp' % h,
+                'id': f'{h}p',
                 'height': h,
                 'width': w,
                 'url': 'https://www3.nhk.or.jp' + img_path,
@@ -211,7 +212,7 @@ class NhkVodIE(NhkBaseIE):
             'series': 'Living in Japan',
             'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab',
             'thumbnail': r're:https://.+/.+\.jpg',
-            'episode': 'Tips for Travelers to Japan / Ramen Vending Machines'
+            'episode': 'Tips for Travelers to Japan / Ramen Vending Machines',
         },
     }, {
         'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
@@ -458,7 +459,7 @@ class NhkForSchoolBangumiIE(InfoExtractor):
             'upload_date': '20140402',
             'ext': 'mp4',
 
-            'chapters': 'count:12'
+            'chapters': 'count:12',
         },
         'params': {
             # m3u8 download
@@ -521,7 +522,8 @@ class NhkForSchoolSubjectIE(InfoExtractor):
         'eigo', 'tokkatsu',
         'tokushi', 'sonota',
     )
-    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)
+    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>{})/?(?:[\?#].*)?$'.format(
+        '|'.join(re.escape(s) for s in KNOWN_SUBJECTS))
 
     _TESTS = [{
         'url': 'https://www.nhk.or.jp/school/sougou/',
@@ -551,9 +553,8 @@ def _real_extract(self, url):
 
 
 class NhkForSchoolProgramListIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
-        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)
-    )
+    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:{})/[a-zA-Z0-9_-]+)'.format(
+        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS))
     _TESTS = [{
         'url': 'https://www.nhk.or.jp/school/sougou/q/',
         'info_dict': {
@@ -590,21 +591,22 @@ class NhkRadiruIE(InfoExtractor):
     IE_DESC = 'NHK らじる (Radiru/Rajiru)'
     _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
     _TESTS = [{
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210',
-        'skip': 'Episode expired on 2024-02-24',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239',
+        'skip': 'Episode expired on 2024-06-09',
         'info_dict': {
-            'title': 'ジャズ・トゥナイト　シリーズＪＡＺＺジャイアンツ　５６　ジョニー・ホッジス',
-            'id': '0449_01_3926210',
+            'title': 'ジャズ・トゥナイト　ジャズ「Ｎｉｇｈｔ　ａｎｄ　Dａｙ」特集',
+            'id': '0449_01_4003239',
             'ext': 'm4a',
+            'uploader': 'NHK FM 東京',
+            'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc',
             'series': 'ジャズ・トゥナイト',
-            'uploader': 'NHK-FM',
-            'channel': 'NHK-FM',
+            'channel': 'NHK FM 東京',
             'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
-            'release_date': '20240217',
-            'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811',
-            'timestamp': 1708185600,
-            'release_timestamp': 1708178400,
-            'upload_date': '20240217',
+            'upload_date': '20240601',
+            'series_id': '0449_01',
+            'release_date': '20240601',
+            'timestamp': 1717257600,
+            'release_timestamp': 1717250400,
         },
     }, {
         # playlist, airs every weekday so it should _hopefully_ be okay forever
@@ -613,71 +615,145 @@ class NhkRadiruIE(InfoExtractor):
             'id': '0458_01',
             'title': 'ベストオブクラシック',
             'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
-            'channel': 'NHK-FM',
-            'uploader': 'NHK-FM',
             'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
+            'series_id': '0458_01',
+            'uploader': 'NHK FM',
+            'channel': 'NHK FM',
+            'series': 'ベストオブクラシック',
         },
         'playlist_mincount': 3,
     }, {
         # one with letters in the id
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
-        'note': 'Expires on 2024-03-31',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688',
+        'note': 'Expires on 2025-03-31',
         'info_dict': {
-            'id': 'F300_06_3738470',
+            'id': 'F683_01_3910688',
             'ext': 'm4a',
-            'title': '有島武郎「一房のぶどう」',
-            'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)',
-            'channel': 'NHKラジオ第1、NHK-FM',
-            'uploader': 'NHKラジオ第1、NHK-FM',
-            'timestamp': 1635757200,
-            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
-            'release_date': '20161207',
-            'series': 'らじる文庫 by ラジオ深夜便 ',
-            'release_timestamp': 1481126700,
-            'upload_date': '20211101',
+            'title': '夏目漱石「文鳥」第1回',
+            'series': '【らじる文庫】夏目漱石「文鳥」(全4回)',
+            'series_id': 'F683_01',
+            'description': '朗読:浅井理アナウンサー',
+            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg',
+            'upload_date': '20240106',
+            'release_date': '20240106',
+            'uploader': 'NHK R1',
+            'release_timestamp': 1704511800,
+            'channel': 'NHK R1',
+            'timestamp': 1704512700,
         },
-        'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'],
+        'expected_warnings': ['Unable to download JSON metadata',
+                              'Failed to get extended metadata. API returned Error 1: Invalid parameters'],
     }, {
         # news
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
-        'skip': 'Expires on 2023-04-17',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173',
         'info_dict': {
-            'id': 'F261_01_3855109',
+            'id': 'F261_01_4012173',
             'ext': 'm4a',
             'channel': 'NHKラジオ第1',
             'uploader': 'NHKラジオ第1',
-            'timestamp': 1681635900,
-            'release_date': '20230416',
             'series': 'NHKラジオニュース',
-            'title': '午後６時のNHKニュース',
+            'title': '午前０時のNHKニュース',
             'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
-            'upload_date': '20230416',
-            'release_timestamp': 1681635600,
+            'release_timestamp': 1718290800,
+            'release_date': '20240613',
+            'timestamp': 1718291400,
+            'upload_date': '20240613',
         },
+    }, {
+        # fallback when extended metadata fails
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298',
+        'skip': 'Expires on 2024-06-07',
+        'info_dict': {
+            'id': '2834_01_4009298',
+            'title': 'まち☆キラ!開成町特集',
+            'ext': 'm4a',
+            'release_date': '20240531',
+            'upload_date': '20240531',
+            'series': 'はま☆キラ!',
+            'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg',
+            'channel': 'NHK R1,FM',
+            'description': '',
+            'timestamp': 1717123800,
+            'uploader': 'NHK R1,FM',
+            'release_timestamp': 1717120800,
+            'series_id': '2834_01',
+        },
+        'expected_warnings': ['Failed to get extended metadata. API returned empty list.'],
     }]
 
     _API_URL_TMPL = None
 
-    def _extract_extended_description(self, episode_id, episode):
-        service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')}))
-        aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str}))
+    def _extract_extended_metadata(self, episode_id, aa_vinfo):
+        service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')}))
         detail_url = try_call(
-            lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3))
+            lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3]))
         if not detail_url:
-            return
+            return {}
+
+        response = self._download_json(
+            detail_url, episode_id, 'Downloading extended metadata',
+            'Failed to download extended metadata', fatal=False, expected_status=400)
+        if not response:
+            return {}
+
+        if error := traverse_obj(response, ('error', {dict})):
+            self.report_warning(
+                'Failed to get extended metadata. API returned '
+                f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}')
+            return {}
+
+        full_meta = traverse_obj(response, ('list', service, 0, {dict}))
+        if not full_meta:
+            self.report_warning('Failed to get extended metadata. API returned empty list.')
+            return {}
+
+        station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None
+        thumbnails = [{
+            'id': str(id_),
+            'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1,
+            **traverse_obj(thumb, {
+                'url': 'url',
+                'width': ('width', {int_or_none}),
+                'height': ('height', {int_or_none}),
+            }),
+        } for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))]
+
+        return filter_dict({
+            'channel': station,
+            'uploader': station,
+            'description': join_nonempty(
+                'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta),
+            'thumbnails': thumbnails,
+            **traverse_obj(full_meta, {
+                'title': ('title', {str}),
+                'timestamp': ('end_time', {unified_timestamp}),
+                'release_timestamp': ('start_time', {unified_timestamp}),
+            }),
+        })
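
# Illustrative sketch (not part of the patch): how the aa_vinfo list is
# consumed above. The sample values are hypothetical; only the shapes are
# inferred from the code (aa_vinfo[2] carries "service,area", aa_vinfo[3]
# the date id interpolated into _API_URL_TMPL).
aa_vinfo = ['0449_01_4003239', 'xx', 'netradio,tokyo', '20240601', '20240601210500_20240601220000']
service, _, area = (aa_vinfo[2] or '').partition(',')
assert (service, area) == ('netradio', 'tokyo')
# try_call() maps an IndexError (short aa_vinfo) or an unset _API_URL_TMPL to
# None, so a malformed aa_contents_id simply skips the extended request.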
+
+    def _extract_episode_info(self, episode, programme_id, series_meta):
+        episode_id = f'{programme_id}_{episode["id"]}'
+        aa_vinfo = traverse_obj(episode, ('aa_contents_id', {lambda x: x.split(';')}))
+        extended_metadata = self._extract_extended_metadata(episode_id, aa_vinfo)
+        fallback_start_time, _, fallback_end_time = traverse_obj(
+            aa_vinfo, (4, {str}, {lambda x: (x or '').partition('_')}))
 
-        full_meta = traverse_obj(
-            self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False),
-            ('list', service, 0, {dict})) or {}
-        return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta)
+        return {
+            **series_meta,
+            'id': episode_id,
+            'formats': self._extract_m3u8_formats(episode.get('stream_url'), episode_id, fatal=False),
+            'container': 'm4a_dash',  # force fixup, AAC-only HLS
+            'was_live': True,
+            'title': episode.get('program_title'),
+            'description': episode.get('program_sub_title'),  # fallback
+            'timestamp': unified_timestamp(fallback_end_time),
+            'release_timestamp': unified_timestamp(fallback_start_time),
+            **extended_metadata,
+        }
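
# Illustrative sketch (not part of the patch): the fallback timestamps above
# assume aa_vinfo[4] holds "<start>_<end>" in compact datetime form; the value
# below is hypothetical. unified_timestamp() handles the '%Y%m%d%H%M%S' layout,
# and the extended metadata, when available, overrides both fields.
fallback_start, _, fallback_end = ('20240601210500_20240601220000' or '').partition('_')
assert (fallback_start, fallback_end) == ('20240601210500', '20240601220000')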
 
-    def _extract_episode_info(self, headline, programme_id, series_meta):
+    def _extract_news_info(self, headline, programme_id, series_meta):
         episode_id = f'{programme_id}_{headline["headline_id"]}'
         episode = traverse_obj(headline, ('file_list', 0, {dict}))
-        description = self._extract_extended_description(episode_id, episode)
-        if not description:
-            self.report_warning('Failed to get extended description, falling back to summary')
-            description = traverse_obj(episode, ('file_title_sub', {str}))
 
         return {
             **series_meta,
@@ -687,9 +763,9 @@ def _extract_episode_info(self, headline, programme_id, series_meta):
             'was_live': True,
             'series': series_meta.get('title'),
             'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
-            'description': description,
             **traverse_obj(episode, {
-                'title': 'file_title',
+                'title': ('file_title', {str}),
+                'description': ('file_title_sub', {str}),
                 'timestamp': ('open_time', {unified_timestamp}),
                 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
             }),
@@ -706,32 +782,58 @@ def _real_extract(self, url):
         site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
         programme_id = f'{site_id}_{corner_id}'
 
-        if site_id == 'F261':
-            json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
-        else:
-            json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'
-
-        meta = self._download_json(json_url, programme_id)['main']
+        if site_id == 'F261':  # XXX: News programmes use old API (for now?)
+            meta = self._download_json(
+                'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main']
+            series_meta = traverse_obj(meta, {
+                'title': ('program_name', {str}),
+                'channel': ('media_name', {str}),
+                'uploader': ('media_name', {str}),
+                'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
+            }, get_all=False)
+
+            if headline_id:
+                headline = traverse_obj(
+                    meta, ('detail_list', lambda _, v: v['headline_id'] == headline_id, any))
+                if not headline:
+                    raise ExtractorError('Content not found; it has most likely expired', expected=True)
+                return self._extract_news_info(headline, programme_id, series_meta)
+
+            def news_entries():
+                for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
+                    yield self._extract_news_info(headline, programme_id, series_meta)
+
+            return self.playlist_result(
+                news_entries(), programme_id, description=meta.get('site_detail'), **series_meta)
+
+        meta = self._download_json(
+            'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series', programme_id, query={
+                'site_id': site_id,
+                'corner_site_id': corner_id,
+            })
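
# Shape of the series-endpoint response, inferred solely from the fields read
# below; not an official schema, and not part of the patch:
#   {"title": "...", "corner_name": "...", "thumbnail_url": "https://...",
#    "radio_broadcast": "R1" | "FM" | "R1,FM", "series_description": "...",
#    "episodes": [{"id": 4003239, "aa_contents_id": "<semicolon-separated>",
#                  "stream_url": "https://.../master.m3u8",
#                  "program_title": "...", "program_sub_title": "..."}, ...]}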
 
-        series_meta = traverse_obj(meta, {
-            'title': 'program_name',
-            'channel': 'media_name',
-            'uploader': 'media_name',
-            'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
-        }, get_all=False)
+        fallback_station = join_nonempty('NHK', traverse_obj(meta, ('radio_broadcast', {str})), delim=' ')
+        series_meta = {
+            'series': join_nonempty('title', 'corner_name', delim=' ', from_dict=meta),
+            'series_id': programme_id,
+            'thumbnail': traverse_obj(meta, ('thumbnail_url', {url_or_none})),
+            'channel': fallback_station,
+            'uploader': fallback_station,
+        }
 
         if headline_id:
-            return self._extract_episode_info(
-                traverse_obj(meta, (
-                    'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
-                programme_id, series_meta)
+            episode = traverse_obj(meta, ('episodes', lambda _, v: v['id'] == int(headline_id), any))
+            if not episode:
+                raise ExtractorError('Content not found; it has most likely expired', expected=True)
+            return self._extract_episode_info(episode, programme_id, series_meta)
 
         def entries():
-            for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
-                yield self._extract_episode_info(headline, programme_id, series_meta)
+            for episode in traverse_obj(meta, ('episodes', ..., {dict})):
+                yield self._extract_episode_info(episode, programme_id, series_meta)
 
         return self.playlist_result(
-            entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)
+            entries(), programme_id, title=series_meta.get('series'),
+            description=meta.get('series_description'), **series_meta)
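
# A quick way to exercise the reworked extractor from Python (not part of the
# patch; the URL matches _VALID_URL and corresponds to the 0458_01 playlist
# test above):
import yt_dlp

with yt_dlp.YoutubeDL() as ydl:
    info = ydl.extract_info(
        'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01', download=False)
    print(info['id'], info.get('series'))  # expected: '0458_01' 'ベストオブクラシック'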
 
 
 class NhkRadioNewsPageIE(InfoExtractor):
@@ -747,7 +849,7 @@ class NhkRadioNewsPageIE(InfoExtractor):
             'channel': 'NHKラジオ第1',
             'uploader': 'NHKラジオ第1',
             'title': 'NHKラジオニュース',
-        }
+        },
     }]
 
     def _real_extract(self, url):
@@ -789,7 +891,7 @@ class NhkRadiruLiveIE(InfoExtractor):
             'ext': 'm4a',
             'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png',
             'live_status': 'is_live',
-        }
+        },
     }]
 
     _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'}
@@ -803,8 +905,8 @@ def _real_extract(self, url):
         data = config.find(f'.//data//area[.="{area}"]/..')
 
         if not data:
-            raise ExtractorError('Invalid area. Valid areas are: %s' % ', '.join(
-                [i.text for i in config.findall('.//data//area')]), expected=True)
+            raise ExtractorError('Invalid area. Valid areas are: {}'.format(', '.join(
+                [i.text for i in config.findall('.//data//area')])), expected=True)
 
         noa_info = self._download_json(
             f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text),
@@ -812,7 +914,7 @@ def _real_extract(self, url):
         present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present'))
 
         return {
-            'title': ' '.join(traverse_obj(present_info, (('service', 'area',), 'name', {str}))),
+            'title': ' '.join(traverse_obj(present_info, (('service', 'area'), 'name', {str}))),
             'id': join_nonempty(station, area),
             'thumbnails': traverse_obj(present_info, ('service', 'images', ..., {
                 'url': 'url',