]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/nhk.py
f6b5c501bb05fc65e0f727a61cd68607b8409027
[yt-dlp.git] / yt_dlp / extractor / nhk.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 int_or_none,
7 join_nonempty,
8 parse_duration,
9 traverse_obj,
10 unescapeHTML,
11 unified_timestamp,
12 url_or_none,
13 urljoin,
14 )
15
16
class NhkBaseIE(InfoExtractor):
    """Shared helpers for NHK World on-demand extractors (video and radio)."""
    # %s slots, in order: ('v'|'r'), ('clip'|'esd'), ('episode'|'program'),
    # media id, language code, ('/all'|'') — see _call_api below
    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
    _TYPE_REGEX = r'/(?P<type>video|audio)/'

    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
        """Fetch the episode metadata list for *m_id* from the NHK World API.

        Returns the ``data.episodes`` list from the JSON response, or ``[]``
        when the API returns a null/empty value.
        """
        return self._download_json(
            self._API_URL_TEMPLATE % (
                'v' if is_video else 'r',
                'clip' if is_clip else 'esd',
                'episode' if is_episode else 'program',
                m_id, lang, '/all' if is_video else ''),
            m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or []

    def _get_api_info(self, refresh=True):
        """Return the stream API info dict ({'url': ..., 'token': ...}).

        With ``refresh=False`` the cached value (possibly an empty dict) is
        returned; otherwise the values are re-scraped from movie-player.js
        and stored in the cache.
        """
        if not refresh:
            return self.cache.load('nhk', 'api_info')

        # Clear the cache first so a failed scrape below does not leave
        # stale credentials behind
        self.cache.store('nhk', 'api_info', {})
        movie_player_js = self._download_webpage(
            'https://movie-a.nhk.or.jp/world/player/js/movie-player.js', None,
            note='Downloading stream API information')
        api_info = {
            'url': self._search_regex(
                r'prod:[^;]+\bapiUrl:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API url'),
            'token': self._search_regex(
                r'prod:[^;]+\btoken:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API token'),
        }
        self.cache.store('nhk', 'api_info', api_info)
        return api_info

    def _extract_formats_and_subtitles(self, vod_id):
        """Resolve the HLS formats/subtitles for *vod_id*.

        Tries the cached API info first and only re-scrapes it on a second
        pass; raises ExtractorError when no stream URL can be resolved.
        """
        for refresh in (False, True):
            api_info = self._get_api_info(refresh)
            if not api_info:
                continue

            api_url = api_info.pop('url')
            # Whatever remains in api_info (e.g. the token) is forwarded
            # as query parameters
            stream_url = traverse_obj(
                self._download_json(
                    api_url, vod_id, 'Downloading stream url info', fatal=False, query={
                        **api_info,
                        'type': 'json',
                        'optional_id': vod_id,
                        'active_flg': 1,
                    }),
                ('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False)
            if stream_url:
                return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id)

        raise ExtractorError('Unable to extract stream url')

    def _extract_episode_info(self, url, episode=None):
        """Build an info dict for a single episode URL.

        When *episode* metadata is not supplied it is fetched from the API.
        When it IS supplied (playlist use) audio entries are returned as
        url_transparent references so the full extraction is deferred.
        """
        fetch_episode = episode is None
        lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id')
        is_video = m_type == 'video'

        if is_video:
            # The API expects video IDs in 'NNNN-...' form
            episode_id = episode_id[:4] + '-' + episode_id[4:]

        if fetch_episode:
            # A '9999' prefix marks a clip rather than a full episode
            episode = self._call_api(
                episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
        title = episode.get('sub_title_clean') or episode['sub_title']

        def get_clean_field(key):
            # Prefer the sanitized '<key>_clean' variant when present
            return episode.get(key + '_clean') or episode.get(key)

        series = get_clean_field('title')

        thumbnails = []
        for s, w, h in [('', 640, 360), ('_l', 1280, 720)]:
            img_path = episode.get('image' + s)
            if not img_path:
                continue
            thumbnails.append({
                'id': '%dp' % h,
                'height': h,
                'width': w,
                'url': 'https://www3.nhk.or.jp' + img_path,
            })

        info = {
            'id': episode_id + '-' + lang,
            'title': '%s - %s' % (series, title) if series and title else title,
            'description': get_clean_field('description'),
            'thumbnails': thumbnails,
            'series': series,
            'episode': title,
        }
        if is_video:
            vod_id = episode['vod_id']
            formats, subs = self._extract_formats_and_subtitles(vod_id)

            info.update({
                'id': vod_id,
                'formats': formats,
                'subtitles': subs,
            })

        else:
            if fetch_episode:
                audio_path = episode['audio']['audio']
                info['formats'] = self._extract_m3u8_formats(
                    'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
                    episode_id, 'm4a', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                for f in info['formats']:
                    f['language'] = lang
            else:
                # Metadata came from a playlist listing; let NhkVodIE do the
                # full per-episode extraction later
                info.update({
                    '_type': 'url_transparent',
                    'ie_key': NhkVodIE.ie_key(),
                    'url': url,
                })
        return info
133
134
class NhkVodIE(NhkBaseIE):
    """Extractor for a single NHK World on-demand video or audio episode."""
    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg
    _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>video)/(?P<id>[0-9a-z]+)',
                  rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[0-9a-z]+)']
    # Content available only for a limited period of time. Visit
    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
    _TESTS = [{
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2049126/',
        'info_dict': {
            'id': 'nw_vod_v_en_2049_126_20230413233000_01_1681398302',
            'ext': 'mp4',
            'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead',
            'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6',
            'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463',
            'episode': 'The Tohoku Shinkansen: Full Speed Ahead',
            'series': 'Japan Railway Journal',
        },
    }, {
        # video clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
        'md5': '153c3016dfd252ba09726588149cf0e7',
        'info_dict': {
            'id': 'lpZXIwaDE6_Z-976CPsFdxyICyWUzlT5',
            'ext': 'mp4',
            'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU',
            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
            'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed',
            'series': 'Dining with the Chef',
            'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
        },
    }, {
        # radio
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/livinginjapan-20231001-1/',
        'info_dict': {
            'id': 'livinginjapan-20231001-1-en',
            'ext': 'm4a',
            'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines',
            'series': 'Living in Japan',
            'description': 'md5:850611969932874b4a3309e0cae06c2f',
            'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545',
            'episode': 'Tips for Travelers to Japan / Ramen Vending Machines'
        },
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
        'only_matching': True,
    }, {
        # video, alphabetic character in ID #29670
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
        'info_dict': {
            'id': 'qfjay6cg',
            'ext': 'mp4',
            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
            'upload_date': '20210615',
            'timestamp': 1623722008,
        },
        'skip': '404 Not Found',
    }, {
        # japanese-language, longer id than english
        'url': 'https://www3.nhk.or.jp/nhkworld/ja/ondemand/video/0020271111/',
        'info_dict': {
            'id': 'nw_ja_v_jvod_ohayou_20231008',
            'ext': 'mp4',
            'title': 'おはよう日本(7時台) - 10月8日放送',
            'series': 'おはよう日本(7時台)',
            'episode': '10月8日放送',
            'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4',
            'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0',
        },
        'skip': 'expires 2023-10-15',
    }]

    def _real_extract(self, url):
        # All of the heavy lifting is shared with NhkVodProgramIE and lives
        # in NhkBaseIE._extract_episode_info
        return self._extract_episode_info(url)
219
220
class NhkVodProgramIE(NhkBaseIE):
    """Playlist extractor for an NHK World on-demand programme page."""
    _VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P<id>\w+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?'
    _TESTS = [{
        # video program episodes
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo',
        'info_dict': {
            'id': 'sumo',
            'title': 'GRAND SUMO Highlights',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 12,
    }, {
        # video program clips
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
        'only_matching': True,
    }, {
        # audio program
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        lang, m_type, program_id, episode_type = self._match_valid_url(url).group(
            'lang', 'type', 'id', 'episode_type')

        # Entries without a 'url' field cannot be resolved and are skipped
        entries = [
            self._extract_episode_info(urljoin(url, episode['url']), episode)
            for episode in self._call_api(
                program_id, lang, m_type == 'video', False, episode_type == 'clip')
            if episode.get('url')]

        # The programme title is not in the listing itself; borrow the
        # series name from the first resolved episode
        return self.playlist_result(
            entries, program_id, entries[0].get('series') if entries else None)
273
274
class NhkForSchoolBangumiIE(InfoExtractor):
    """Extractor for single NHK for School programme/clip player pages."""
    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
    _TESTS = [{
        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
        'info_dict': {
            'id': 'D0005150191_00003',
            'title': 'にている かな',
            'duration': 599.999,
            'timestamp': 1396414800,

            'upload_date': '20140402',
            'ext': 'mp4',

            'chapters': 'count:12'
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        program_type, video_id = self._match_valid_url(url).groups()

        webpage = self._download_webpage(
            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)

        # searches all variables
        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
        # and programObj values too
        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
        # extract all chapters
        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]

        # this is how player_core.js is actually doing (!): the effective
        # video id carries the page's version suffix when one is present
        version = base_values.get('r_version') or program_values.get('version')
        if version:
            video_id = f'{video_id.split("_")[0]}_{version}'

        formats = self._extract_m3u8_formats(
            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
            video_id, ext='mp4', m3u8_id='hls')

        duration = parse_duration(base_values.get('r_duration'))

        # Build chapters only when the two lists line up 1:1; the last
        # chapter ends at the full programme duration
        chapters = None
        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
            start_time = chapter_durations
            end_time = chapter_durations[1:] + [duration]
            chapters = [{
                'start_time': s,
                'end_time': e,
                'title': t,
            } for s, e, t in zip(start_time, end_time, chapter_titles)]

        return {
            'id': video_id,
            'title': program_values.get('name'),
            # Reuse the value computed above instead of re-parsing r_duration
            'duration': duration,
            'timestamp': unified_timestamp(base_values['r_upload']),
            'formats': formats,
            'chapters': chapters,
        }
339
340
class NhkForSchoolSubjectIE(InfoExtractor):
    IE_DESC = 'Portal page for each school subjects, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
    KNOWN_SUBJECTS = (
        'rika', 'syakai', 'kokugo',
        'sansuu', 'seikatsu', 'doutoku',
        'ongaku', 'taiiku', 'zukou',
        'gijutsu', 'katei', 'sougou',
        'eigo', 'tokkatsu',
        'tokushi', 'sonota',
    )
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)

    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/',
        'info_dict': {
            'id': 'sougou',
            'title': '総合的な学習の時間',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.nhk.or.jp/school/rika/',
        'info_dict': {
            'id': 'rika',
            'title': '理科',
        },
        'playlist_mincount': 15,
    }]

    def _real_extract(self, url):
        subject_id = self._match_id(url)
        webpage = self._download_webpage(url, subject_id)

        # Human-readable subject name shown in the page header
        playlist_title = self._html_search_regex(
            r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>',
            webpage, 'title', fatal=False)
        # Every programme under this subject is linked as /school/<subject>/<slug>/
        programme_links = re.finditer(
            rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"',
            webpage)
        return self.playlist_from_matches(
            programme_links, subject_id, playlist_title,
            lambda match: urljoin(url, match.group(1)))
378
379
class NhkForSchoolProgramListIE(InfoExtractor):
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)
    )
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/q/',
        'info_dict': {
            'id': 'sougou/q',
            'title': 'Q~こどものための哲学',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        program_id = self._match_id(url)

        webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)

        title = (self._generic_title('', webpage)
                 or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
        # Strip the site-wide "| NHK for School" suffix from the page title
        if title:
            title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title)
        description = self._html_search_regex(
            r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
            webpage, 'description', fatal=False, group=0)

        bangumi_list = self._download_json(
            f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
        # they're always bangumi
        das_ids = traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []
        bangumis = [
            self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={das_id}')
            for das_id in das_ids]

        return self.playlist_result(bangumis, program_id, title, description)
413
414
class NhkRadiruIE(InfoExtractor):
    """Extractor for NHK Radiru (らじる) radio on-demand episodes and programmes."""
    _GEO_COUNTRIES = ['JP']
    IE_DESC = 'NHK らじる (Radiru/Rajiru)'
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
        'skip': 'Episode expired on 2023-04-16',
        'info_dict': {
            'channel': 'NHK-FM',
            'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
            'ext': 'm4a',
            'id': '0449_01_3853544',
            'series': 'ジャズ・トゥナイト',
            'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
            'timestamp': 1680969600,
            'title': 'ジャズ・トゥナイト NEWジャズ特集',
            'upload_date': '20230408',
            'release_timestamp': 1680962400,
            'release_date': '20230408',
            'was_live': True,
        },
    }, {
        # playlist, airs every weekday so it should _hopefully_ be okay forever
        'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01',
        'info_dict': {
            'id': '0458_01',
            'title': 'ベストオブクラシック',
            'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
            'channel': 'NHK-FM',
            'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
        },
        'playlist_mincount': 3,
    }, {
        # one with letters in the id
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
        'note': 'Expires on 2024-03-31',
        'info_dict': {
            'id': 'F300_06_3738470',
            'ext': 'm4a',
            'title': '有島武郎「一房のぶどう」',
            'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)',
            'channel': 'NHKラジオ第1、NHK-FM',
            'timestamp': 1635757200,
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
            'release_date': '20161207',
            'series': 'らじる文庫 by ラジオ深夜便 ',
            'release_timestamp': 1481126700,
            'upload_date': '20211101',
        }
    }, {
        # news
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
        'skip': 'Expires on 2023-04-17',
        'info_dict': {
            'id': 'F261_01_3855109',
            'ext': 'm4a',
            'channel': 'NHKラジオ第1',
            'timestamp': 1681635900,
            'release_date': '20230416',
            'series': 'NHKラジオニュース',
            'title': '午後6時のNHKニュース',
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
            'upload_date': '20230416',
            'release_timestamp': 1681635600,
        },
    }]

    def _extract_episode_info(self, headline, programme_id, series_meta):
        """Build the info dict for one headline entry of a programme.

        *headline* is one item of the JSON 'detail_list'; *series_meta* holds
        programme-level fallbacks (title/channel/thumbnail).
        """
        episode_id = f'{programme_id}_{headline["headline_id"]}'
        # 'file_list' may be absent or malformed; fall back to an empty dict
        # so the .get()/traverse_obj lookups below degrade gracefully instead
        # of raising AttributeError on None
        episode = traverse_obj(headline, ('file_list', 0, {dict})) or {}

        return {
            **series_meta,
            'id': episode_id,
            'formats': self._extract_m3u8_formats(episode.get('file_name'), episode_id, fatal=False),
            'container': 'm4a_dash', # force fixup, AAC-only HLS
            'was_live': True,
            'series': series_meta.get('title'),
            'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
            **traverse_obj(episode, {
                'title': 'file_title',
                'description': 'file_title_sub',
                'timestamp': ('open_time', {unified_timestamp}),
                # aa_vinfo4 looks like '<start>_<end>'; only the start matters
                'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
            }),
        }

    def _real_extract(self, url):
        site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
        programme_id = f'{site_id}_{corner_id}'

        # The news programme (F261) uses a dedicated metadata endpoint
        if site_id == 'F261':
            json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
        else:
            json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'

        meta = self._download_json(json_url, programme_id)['main']

        series_meta = traverse_obj(meta, {
            'title': 'program_name',
            'channel': 'media_name',
            'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
        }, get_all=False)

        # A third URL component selects a single episode; otherwise return
        # the whole programme as a playlist
        if headline_id:
            return self._extract_episode_info(
                traverse_obj(meta, (
                    'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
                programme_id, series_meta)

        def entries():
            for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
                yield self._extract_episode_info(headline, programme_id, series_meta)

        return self.playlist_result(
            entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)
531
532
class NhkRadioNewsPageIE(InfoExtractor):
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])'
    _TESTS = [{
        # airs daily, on-the-hour most hours
        'url': 'https://www.nhk.or.jp/radionews/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'F261_01',
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
            'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d',
            'channel': 'NHKラジオ第1',
            'title': 'NHKラジオニュース',
        }
    }]

    def _real_extract(self, url):
        # The radio news page is just the fixed Radiru programme F261_01;
        # delegate the whole extraction to NhkRadiruIE
        news_programme_url = 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01'
        return self.url_result(news_programme_url, NhkRadiruIE)
550
551
class NhkRadiruLiveIE(InfoExtractor):
    """Extractor for NHK Radiru live radio streams (Radio 1, Radio 2, FM)."""
    _GEO_COUNTRIES = ['JP']
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?P<id>r[12]|fm)'
    _TESTS = [{
        # radio 1, no area specified
        'url': 'https://www.nhk.or.jp/radio/player/?ch=r1',
        'info_dict': {
            'id': 'r1-tokyo',
            'title': 're:^NHKネットラジオ第1 東京.+$',
            'ext': 'm4a',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png',
            'live_status': 'is_live',
        },
    }, {
        # radio 2, area specified
        # (the area doesnt actually matter, r2 is national)
        'url': 'https://www.nhk.or.jp/radio/player/?ch=r2',
        'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}},
        'info_dict': {
            'id': 'r2-fukuoka',
            'title': 're:^NHKネットラジオ第2 福岡.+$',
            'ext': 'm4a',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png',
            'live_status': 'is_live',
        },
    }, {
        # fm, area specified
        'url': 'https://www.nhk.or.jp/radio/player/?ch=fm',
        'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}},
        'info_dict': {
            'id': 'fm-sapporo',
            'title': 're:^NHKネットラジオFM 札幌.+$',
            'ext': 'm4a',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png',
            'live_status': 'is_live',
        }
    }]

    # Maps the URL channel id to the station key used in the now-on-air JSON
    _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'}

    def _real_extract(self, url):
        station = self._match_id(url)
        area = self._configuration_arg('area', ['tokyo'])[0]

        config = self._download_xml(
            'https://www.nhk.or.jp/radio/config/config_web.xml', station, 'Downloading area information')
        data = config.find(f'.//data//area[.="{area}"]/..')

        # Element.find returns None when nothing matches; test identity
        # explicitly — Element truthiness depends on the child count, so
        # `if not data` could misfire on a matched but childless element
        if data is None:
            raise ExtractorError('Invalid area. Valid areas are: %s' % ', '.join(
                [i.text for i in config.findall('.//data//area')]), expected=True)

        noa_info = self._download_json(
            f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text),
            station, note=f'Downloading {area} station metadata')
        present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present'))

        return {
            # e.g. 'NHKネットラジオ第1 東京' — service name followed by area name
            'title': ' '.join(traverse_obj(present_info, (('service', 'area',), 'name', {str}))),
            'id': join_nonempty(station, area),
            'thumbnails': traverse_obj(present_info, ('service', 'images', ..., {
                'url': 'url',
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
            })),
            'formats': self._extract_m3u8_formats(data.find(f'{station}hls').text, station),
            'is_live': True,
        }