import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    join_nonempty,
    parse_duration,
    traverse_obj,
    unescapeHTML,
    unified_timestamp,
    url_or_none,
    urljoin,
)


class NhkBaseIE(InfoExtractor):
    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
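    # the six template slots are: 'v' (video) / 'r' (radio); 'clip' / 'esd';
    # 'episode' / 'program'; m_id; lang; and '/all' (video only) -- filled in
    # by _call_api() below, so a video episode request expands to e.g.
    # https://nwapi.nhk.jp/nhkworld/vodesdlist/v7b/episode/<m_id>/<lang>/all/all.json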
    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
    _TYPE_REGEX = r'/(?P<type>video|audio)/'

    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
        return self._download_json(
            self._API_URL_TEMPLATE % (
                'v' if is_video else 'r',
                'clip' if is_clip else 'esd',
                'episode' if is_episode else 'program',
                m_id, lang, '/all' if is_video else ''),
            m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or []

    def _get_api_info(self, refresh=True):
        if not refresh:
            return self.cache.load('nhk', 'api_info')

        self.cache.store('nhk', 'api_info', {})
        movie_player_js = self._download_webpage(
            'https://movie-a.nhk.or.jp/world/player/js/movie-player.js', None,
            note='Downloading stream API information')
        api_info = {
            'url': self._search_regex(
                r'prod:[^;]+\bapiUrl:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API url'),
            'token': self._search_regex(
                r'prod:[^;]+\btoken:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API token'),
        }
        self.cache.store('nhk', 'api_info', api_info)
        return api_info

    def _extract_formats_and_subtitles(self, vod_id):
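        # try the cached API credentials first (refresh=False); if they yield
        # no stream URL, re-scrape movie-player.js once before giving up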
        for refresh in (False, True):
            api_info = self._get_api_info(refresh)
            if not api_info:
                continue

            api_url = api_info.pop('url')
            stream_url = traverse_obj(
                self._download_json(
                    api_url, vod_id, 'Downloading stream url info', fatal=False, query={
                        **api_info,
                        'type': 'json',
                        'optional_id': vod_id,
                        'active_flg': 1,
                    }),
                ('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False)
            if stream_url:
                return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id)

        raise ExtractorError('Unable to extract stream url')

    def _extract_episode_info(self, url, episode=None):
        fetch_episode = episode is None
        lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
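        # bare 7-character IDs are rewritten into the hyphenated form the API
        # expects, e.g. '9999a34' -> '9999-a34'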
        if len(episode_id) == 7:
            episode_id = episode_id[:4] + '-' + episode_id[4:]

        is_video = m_type == 'video'
        if fetch_episode:
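            # IDs with the '9999' prefix are clips rather than regular episodes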
            episode = self._call_api(
                episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
        title = episode.get('sub_title_clean') or episode['sub_title']

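        # most text fields come in raw and sanitized ('_clean') variants;
        # prefer the sanitized one when present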
        def get_clean_field(key):
            return episode.get(key + '_clean') or episode.get(key)

        series = get_clean_field('title')

        thumbnails = []
        for s, w, h in [('', 640, 360), ('_l', 1280, 720)]:
            img_path = episode.get('image' + s)
            if not img_path:
                continue
            thumbnails.append({
                'id': '%dp' % h,
                'height': h,
                'width': w,
                'url': 'https://www3.nhk.or.jp' + img_path,
            })

        info = {
            'id': episode_id + '-' + lang,
            'title': '%s - %s' % (series, title) if series and title else title,
            'description': get_clean_field('description'),
            'thumbnails': thumbnails,
            'series': series,
            'episode': title,
        }
        if is_video:
            vod_id = episode['vod_id']
            formats, subs = self._extract_formats_and_subtitles(vod_id)

            info.update({
                'id': vod_id,
                'formats': formats,
                'subtitles': subs,
            })
        else:
            if fetch_episode:
                audio_path = episode['audio']['audio']
                info['formats'] = self._extract_m3u8_formats(
                    'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
                    episode_id, 'm4a', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                for f in info['formats']:
                    f['language'] = lang
            else:
                info.update({
                    '_type': 'url_transparent',
                    'ie_key': NhkVodIE.ie_key(),
                    'url': url,
                })
        return info


class NhkVodIE(NhkBaseIE):
    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather
    # than just [a-f], e.g. the 9999a34 clip ID tested below (#29670)
    _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    # Content available only for a limited period of time. Visit
    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
    _TESTS = [{
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/',
        'info_dict': {
            'id': 'yd8322ch',
            'ext': 'mp4',
            'description': 'md5:109c8b05d67a62d0592f2b445d2cd898',
            'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)',
            'upload_date': '20230514',
            'timestamp': 1684083791,
            'series': 'GRAND SUMO Highlights',
            'episode': '[Recap] May Tournament Day 1 (Opening Day)',
            'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080',
        },
    }, {
        # video clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
        'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
        'info_dict': {
            'id': 'a95j5iza',
            'ext': 'mp4',
            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
            'timestamp': 1565965194,
            'upload_date': '20190816',
            'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080',
            'series': 'Dining with the Chef',
            'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
        },
    }, {
        # audio clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
        'info_dict': {
            'id': 'r_inventions-20201104-1-en',
            'ext': 'm4a',
            'title': "Japan's Top Inventions - Miniature Video Cameras",
            'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
        },
        'skip': '404 Not Found',
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
        'only_matching': True,
    }, {
        # video, alphabetic character in ID #29670
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
        'info_dict': {
            'id': 'qfjay6cg',
            'ext': 'mp4',
            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
            'upload_date': '20210615',
            'timestamp': 1623722008,
        },
        'skip': '404 Not Found',
    }]

    def _real_extract(self, url):
        return self._extract_episode_info(url)


class NhkVodProgramIE(NhkBaseIE):
    _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    _TESTS = [{
        # video program episodes
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo',
        'info_dict': {
            'id': 'sumo',
            'title': 'GRAND SUMO Highlights',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 12,
    }, {
        # video program clips
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
        'only_matching': True,
    }, {
        # audio program
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        lang, m_type, program_id, episode_type = self._match_valid_url(url).groups()

        episodes = self._call_api(
            program_id, lang, m_type == 'video', False, episode_type == 'clip')

        entries = []
        for episode in episodes:
            episode_path = episode.get('url')
            if not episode_path:
                continue
            entries.append(self._extract_episode_info(
                urljoin(url, episode_path), episode))

        program_title = None
        if entries:
            program_title = entries[0].get('series')

        return self.playlist_result(entries, program_id, program_title)


class NhkForSchoolBangumiIE(InfoExtractor):
    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
    _TESTS = [{
        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
        'info_dict': {
            'id': 'D0005150191_00003',
            'title': 'にている かな',
            'duration': 599.999,
            'timestamp': 1396414800,
            'upload_date': '20140402',
            'ext': 'mp4',
            'chapters': 'count:12',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        program_type, video_id = self._match_valid_url(url).groups()

        webpage = self._download_webpage(
            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)

        # scrape all page-level JS variables
        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
        # and the programObj/clipObj values too
        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
        # extract all chapters
        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]

        # this mirrors what player_core.js actually does (!)
        version = base_values.get('r_version') or program_values.get('version')
        if version:
            video_id = f'{video_id.split("_")[0]}_{version}'

        formats = self._extract_m3u8_formats(
            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
            video_id, ext='mp4', m3u8_id='hls')

        duration = parse_duration(base_values.get('r_duration'))

        chapters = None
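        # each chapter ends where the next one starts; the final chapter runs
        # to the total duration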
        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
            start_time = chapter_durations
            end_time = chapter_durations[1:] + [duration]
            chapters = [{
                'start_time': s,
                'end_time': e,
                'title': t,
            } for s, e, t in zip(start_time, end_time, chapter_titles)]

        return {
            'id': video_id,
            'title': program_values.get('name'),
            'duration': duration,
            'timestamp': unified_timestamp(base_values['r_upload']),
            'formats': formats,
            'chapters': chapters,
        }


class NhkForSchoolSubjectIE(InfoExtractor):
    IE_DESC = 'Portal page for each school subject, like Japanese (kokugo, 国語) or math (sansuu/suugaku, 算数・数学)'
    KNOWN_SUBJECTS = (
        'rika', 'syakai', 'kokugo',
        'sansuu', 'seikatsu', 'doutoku',
        'ongaku', 'taiiku', 'zukou',
        'gijutsu', 'katei', 'sougou',
        'eigo', 'tokkatsu',
        'tokushi', 'sonota',
    )
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)

    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/',
        'info_dict': {
            'id': 'sougou',
            'title': '総合的な学習の時間',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.nhk.or.jp/school/rika/',
        'info_dict': {
            'id': 'rika',
            'title': '理科',
        },
        'playlist_mincount': 15,
    }]

    def _real_extract(self, url):
        subject_id = self._match_id(url)
        webpage = self._download_webpage(url, subject_id)

        return self.playlist_from_matches(
            re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
            subject_id,
            self._html_search_regex(r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage, 'title', fatal=False),
            lambda g: urljoin(url, g.group(1)))


class NhkForSchoolProgramListIE(InfoExtractor):
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)
    )
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/q/',
        'info_dict': {
            'id': 'sougou/q',
            'title': 'Q~こどものための哲学',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        program_id = self._match_id(url)

        webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)

        title = (self._generic_title('', webpage)
                 or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
        title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
        description = self._html_search_regex(
            r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
            webpage, 'description', fatal=False, group=0)

        bangumi_list = self._download_json(
            f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
        # they're always bangumi
        bangumis = [
            self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
            for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]

        return self.playlist_result(bangumis, program_id, title, description)


class NhkRadiruIE(InfoExtractor):
    _GEO_COUNTRIES = ['JP']
    IE_DESC = 'NHK らじる (Radiru/Rajiru)'
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
        'skip': 'Episode expired on 2023-04-16',
        'info_dict': {
            'channel': 'NHK-FM',
            'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
            'ext': 'm4a',
            'id': '0449_01_3853544',
            'series': 'ジャズ・トゥナイト',
            'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
            'timestamp': 1680969600,
            'title': 'ジャズ・トゥナイト NEWジャズ特集',
            'upload_date': '20230408',
            'release_timestamp': 1680962400,
            'release_date': '20230408',
            'was_live': True,
        },
    }, {
        # playlist, airs every weekday so it should _hopefully_ be okay forever
        'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01',
        'info_dict': {
            'id': '0458_01',
            'title': 'ベストオブクラシック',
            'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
            'channel': 'NHK-FM',
            'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
        },
        'playlist_mincount': 3,
    }, {
        # one with letters in the id
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
        'note': 'Expires on 2024-03-31',
        'info_dict': {
            'id': 'F300_06_3738470',
            'ext': 'm4a',
            'title': '有島武郎「一房のぶどう」',
            'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)',
            'channel': 'NHKラジオ第1、NHK-FM',
            'timestamp': 1635757200,
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
            'release_date': '20161207',
            'series': 'らじる文庫 by ラジオ深夜便 ',
            'release_timestamp': 1481126700,
            'upload_date': '20211101',
        },
    }, {
        # news
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
        'skip': 'Expires on 2023-04-17',
        'info_dict': {
            'id': 'F261_01_3855109',
            'ext': 'm4a',
            'channel': 'NHKラジオ第1',
            'timestamp': 1681635900,
            'release_date': '20230416',
            'series': 'NHKラジオニュース',
            'title': '午後6時のNHKニュース',
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
            'upload_date': '20230416',
            'release_timestamp': 1681635600,
        },
    }]

    def _extract_episode_info(self, headline, programme_id, series_meta):
        episode_id = f'{programme_id}_{headline["headline_id"]}'
        episode = traverse_obj(headline, ('file_list', 0, {dict}))

        return {
            **series_meta,
            'id': episode_id,
            'formats': self._extract_m3u8_formats(episode.get('file_name'), episode_id, fatal=False),
            'container': 'm4a_dash',  # force fixup, AAC-only HLS
            'was_live': True,
            'series': series_meta.get('title'),
            'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
            **traverse_obj(episode, {
                'title': 'file_title',
                'description': 'file_title_sub',
                'timestamp': ('open_time', {unified_timestamp}),
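                # aa_vinfo4 appears to hold '<start>_<end>' timestamps; keep only the start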
                'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
            }),
        }

    def _real_extract(self, url):
        site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
        programme_id = f'{site_id}_{corner_id}'

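        # F261 is NHK Radio News (NHKラジオニュース), which is served from its own JSON feed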
        if site_id == 'F261':
            json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
        else:
            json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'

        meta = self._download_json(json_url, programme_id)['main']

        series_meta = traverse_obj(meta, {
            'title': 'program_name',
            'channel': 'media_name',
            'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
        }, get_all=False)

        if headline_id:
            return self._extract_episode_info(
                traverse_obj(meta, (
                    'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
                programme_id, series_meta)

        def entries():
            for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
                yield self._extract_episode_info(headline, programme_id, series_meta)

        return self.playlist_result(
            entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)


class NhkRadioNewsPageIE(InfoExtractor):
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])'
    _TESTS = [{
        # airs daily, on-the-hour most hours
        'url': 'https://www.nhk.or.jp/radionews/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'F261_01',
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
            'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d',
            'channel': 'NHKラジオ第1',
            'title': 'NHKラジオニュース',
        },
    }]

    def _real_extract(self, url):
        return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE)


class NhkRadiruLiveIE(InfoExtractor):
    _GEO_COUNTRIES = ['JP']
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?P<id>r[12]|fm)'
    _TESTS = [{
        # radio 1, no area specified
        'url': 'https://www.nhk.or.jp/radio/player/?ch=r1',
        'info_dict': {
            'id': 'r1-tokyo',
            'title': 're:^NHKネットラジオ第1 東京.+$',
            'ext': 'm4a',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png',
            'live_status': 'is_live',
        },
    }, {
        # radio 2, area specified
        # (the area doesn't actually matter, r2 is national)
        'url': 'https://www.nhk.or.jp/radio/player/?ch=r2',
        'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}},
        'info_dict': {
            'id': 'r2-fukuoka',
            'title': 're:^NHKネットラジオ第2 福岡.+$',
            'ext': 'm4a',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png',
            'live_status': 'is_live',
        },
    }, {
        # fm, area specified
        'url': 'https://www.nhk.or.jp/radio/player/?ch=fm',
        'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}},
        'info_dict': {
            'id': 'fm-sapporo',
            'title': 're:^NHKネットラジオFM 札幌.+$',
            'ext': 'm4a',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png',
            'live_status': 'is_live',
        },
    }]

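    # station IDs as they appear in the "now on air" (noa) programme feed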
    _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'}

    def _real_extract(self, url):
        station = self._match_id(url)
        area = self._configuration_arg('area', ['tokyo'])[0]

        config = self._download_xml(
            'https://www.nhk.or.jp/radio/config/config_web.xml', station, 'Downloading area information')
        data = config.find(f'.//data//area[.="{area}"]/..')

        if not data:
            raise ExtractorError('Invalid area. Valid areas are: %s' % ', '.join(
                [i.text for i in config.findall('.//data//area')]), expected=True)

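        # url_program_noa is a protocol-relative URL template with an {area}
        # placeholder, filled with the areakey of the matched config entry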
        noa_info = self._download_json(
            f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text),
            station, note=f'Downloading {area} station metadata')
        present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present'))

        return {
            'title': ' '.join(traverse_obj(present_info, (('service', 'area'), 'name', {str}))),
            'id': join_nonempty(station, area),
            'thumbnails': traverse_obj(present_info, ('service', 'images', ..., {
                'url': 'url',
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
            })),
            'formats': self._extract_m3u8_formats(data.find(f'{station}hls').text, station),
            'is_live': True,
        }
607 }