import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    join_nonempty,
    parse_duration,
    traverse_obj,
    unescapeHTML,
    unified_timestamp,
    url_or_none,
    urljoin,
)


class NhkBaseIE(InfoExtractor):
    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
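    # %-substitutions, in order: 'v'/'r' (video/radio), 'clip'/'esd', 'episode'/'program',
    # the media ID, the language code and '/all' for video (else empty) -- see _call_api();
    # a video episode request thus expands to something like
    # https://nwapi.nhk.jp/nhkworld/vodesdlist/v7b/episode/2061-601/en/all/all.json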
    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
    _TYPE_REGEX = r'/(?P<type>video|audio)/'

    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
        return self._download_json(
            self._API_URL_TEMPLATE % (
                'v' if is_video else 'r',
                'clip' if is_clip else 'esd',
                'episode' if is_episode else 'program',
                m_id, lang, '/all' if is_video else ''),
            m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or []
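
    # NB: the endpoint wraps its results as {'data': {'episodes': [...]}} and 'episodes'
    # can be null, hence the trailing `or []` above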

    def _get_api_info(self, refresh=True):
        if not refresh:
            return self.cache.load('nhk', 'api_info')

        self.cache.store('nhk', 'api_info', {})
        movie_player_js = self._download_webpage(
            'https://movie-a.nhk.or.jp/world/player/js/movie-player.js', None,
            note='Downloading stream API information')
        api_info = {
            'url': self._search_regex(
                r'prod:[^;]+\bapiUrl:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API url'),
            'token': self._search_regex(
                r'prod:[^;]+\btoken:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API token'),
        }
        self.cache.store('nhk', 'api_info', api_info)
        return api_info
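
    # The scraped stream API URL and token are cached; _extract_formats_and_subtitles()
    # below first tries the cached credentials (refresh=False) and only re-scrapes
    # movie-player.js when those no longer yield a stream URL.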

    def _extract_formats_and_subtitles(self, vod_id):
        for refresh in (False, True):
            api_info = self._get_api_info(refresh)
            if not api_info:
                continue

            api_url = api_info.pop('url')
            stream_url = traverse_obj(
                self._download_json(
                    api_url, vod_id, 'Downloading stream url info', fatal=False, query={
                        **api_info,
                        'type': 'json',
                        'optional_id': vod_id,
                        'active_flg': 1,
                    }),
                ('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False)
            if stream_url:
                return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id)

        raise ExtractorError('Unable to extract stream url')

    def _extract_episode_info(self, url, episode=None):
        fetch_episode = episode is None
        lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
        if len(episode_id) == 7:
            episode_id = episode_id[:4] + '-' + episode_id[4:]
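        # e.g. the 7-character URL ID '9999a34' becomes the API ID '9999-a34'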

        is_video = m_type == 'video'
        if fetch_episode:
            episode = self._call_api(
                episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
        title = episode.get('sub_title_clean') or episode['sub_title']

        def get_clean_field(key):
            return episode.get(key + '_clean') or episode.get(key)

        series = get_clean_field('title')

        thumbnails = []
        for s, w, h in [('', 640, 360), ('_l', 1280, 720)]:
            img_path = episode.get('image' + s)
            if not img_path:
                continue
            thumbnails.append({
                'id': '%dp' % h,
                'height': h,
                'width': w,
                'url': 'https://www3.nhk.or.jp' + img_path,
            })

        info = {
            'id': episode_id + '-' + lang,
            'title': '%s - %s' % (series, title) if series and title else title,
            'description': get_clean_field('description'),
            'thumbnails': thumbnails,
            'series': series,
            'episode': title,
        }

        if is_video:
            vod_id = episode['vod_id']
            formats, subs = self._extract_formats_and_subtitles(vod_id)

            info.update({
                'id': vod_id,
                'formats': formats,
                'subtitles': subs,
            })
        else:
            if fetch_episode:
                audio_path = episode['audio']['audio']
                info['formats'] = self._extract_m3u8_formats(
                    'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
                    episode_id, 'm4a', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                for f in info['formats']:
                    f['language'] = lang
            else:
                # defer the actual extraction to NhkVodIE
                info.update({
                    '_type': 'url_transparent',
                    'ie_key': NhkVodIE.ie_key(),
                    'url': url,
                })
        return info


class NhkVodIE(NhkBaseIE):
    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f],
    # e.g. the 9999a34 test below (#29670)
    _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
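    # matches both bare 7-character video IDs (e.g. /video/9999a34/) and dated
    # audio slugs (e.g. /audio/r_inventions-20201104-1/)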
    # Content available only for a limited period of time. Visit
    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
    _TESTS = [{
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/',
        'info_dict': {
            'description': 'md5:109c8b05d67a62d0592f2b445d2cd898',
            'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)',
            'upload_date': '20230514',
            'timestamp': 1684083791,
            'series': 'GRAND SUMO Highlights',
            'episode': '[Recap] May Tournament Day 1 (Opening Day)',
            'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080',
        },
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
        'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
        'info_dict': {
            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
            'timestamp': 1565965194,
            'upload_date': '20190816',
            'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080',
            'series': 'Dining with the Chef',
            'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
        },
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
        'info_dict': {
            'id': 'r_inventions-20201104-1-en',
            'title': "Japan's Top Inventions - Miniature Video Cameras",
            'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
        },
        'skip': '404 Not Found',
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
        'only_matching': True,
    }, {
        # video, alphabetic character in ID #29670
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
        'info_dict': {
            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
            'upload_date': '20210615',
            'timestamp': 1623722008,
        },
        'skip': '404 Not Found',
    }]

    def _real_extract(self, url):
        return self._extract_episode_info(url)


class NhkVodProgramIE(NhkBaseIE):
    _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
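    # a trailing ?type=clip (or type=tvEpisode/radioEpisode) selects clips vs full
    # episodes, e.g. .../program/video/japanrailway/?type=clip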
    _TESTS = [{
        # video program episodes
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo',
        'info_dict': {
            'id': 'sumo',
            'title': 'GRAND SUMO Highlights',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 12,
    }, {
        # video program clips
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        lang, m_type, program_id, episode_type = self._match_valid_url(url).groups()

        episodes = self._call_api(
            program_id, lang, m_type == 'video', False, episode_type == 'clip')

        entries = []
        for episode in episodes:
            episode_path = episode.get('url')
            if not episode_path:
                continue
            entries.append(self._extract_episode_info(
                urljoin(url, episode_path), episode))

        program_title = None
        if entries:
            program_title = entries[0].get('series')

        return self.playlist_result(entries, program_id, program_title)


class NhkForSchoolBangumiIE(InfoExtractor):
    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
    _TEST = {
        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
        'info_dict': {
            'id': 'D0005150191_00003',
            'timestamp': 1396414800,
            'upload_date': '20140402',
            'chapters': 'count:12',
        },
        'params': {
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        program_type, video_id = self._match_valid_url(url).groups()

        webpage = self._download_webpage(
            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)

        # searches all variables
        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
        # and programObj values too
        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
        # extract all chapters
        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]

        # this is what player_core.js actually does (!)
        version = base_values.get('r_version') or program_values.get('version')
        if version:
            video_id = f'{video_id.split("_")[0]}_{version}'

        formats = self._extract_m3u8_formats(
            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
            video_id, ext='mp4', m3u8_id='hls')

        duration = parse_duration(base_values.get('r_duration'))
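
        # chapterTime values are cumulative start offsets: each chapter ends where the
        # next one starts, and the last chapter runs to the full duration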
        chapters = None
        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
            start_time = chapter_durations
            end_time = chapter_durations[1:] + [duration]
            chapters = [{
                'start_time': s,
                'end_time': e,
                'title': t,
            } for s, e, t in zip(start_time, end_time, chapter_titles)]

        return {
            'id': video_id,
            'title': program_values.get('name'),
            'duration': duration,
            'timestamp': unified_timestamp(base_values['r_upload']),
            'formats': formats,
            'chapters': chapters,
        }


class NhkForSchoolSubjectIE(InfoExtractor):
    IE_DESC = 'Portal page for each school subject, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
    KNOWN_SUBJECTS = (
        'rika', 'syakai', 'kokugo',
        'sansuu', 'seikatsu', 'doutoku',
        'ongaku', 'taiiku', 'zukou',
        'gijutsu', 'katei', 'sougou',
        'eigo', 'tokkatsu',
        'tokushi', 'sonota',
    )
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/',
        'info_dict': {
            'id': 'sougou',
            'title': '総合的な学習の時間',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.nhk.or.jp/school/rika/',
        'info_dict': {
            'id': 'rika',
            'title': '理科',
        },
        'playlist_mincount': 15,
    }]

    def _real_extract(self, url):
        subject_id = self._match_id(url)
        webpage = self._download_webpage(url, subject_id)

        return self.playlist_from_matches(
            re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
            subject_id,
            self._html_search_regex(r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage, 'title', fatal=False),
            lambda g: urljoin(url, g.group(1)))


class NhkForSchoolProgramListIE(InfoExtractor):
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS))
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/q/',
        'info_dict': {
            'id': 'sougou/q',
            'title': 'Q~こどものための哲学',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        program_id = self._match_id(url)

        webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)

        title = (self._generic_title('', webpage)
                 or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
        title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
        description = self._html_search_regex(
            r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
            webpage, 'description', fatal=False, group=0)

        bangumi_list = self._download_json(
            f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
        # they're always bangumi
        bangumis = [
            self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
            for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]

        return self.playlist_result(bangumis, program_id, title, description)


class NhkRadiruIE(InfoExtractor):
    _GEO_COUNTRIES = ['JP']
    IE_DESC = 'NHK らじる (Radiru/Rajiru)'
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
        'skip': 'Episode expired on 2023-04-16',
        'info_dict': {
            'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
            'id': '0449_01_3853544',
            'series': 'ジャズ・トゥナイト',
            'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
            'timestamp': 1680969600,
            'title': 'ジャズ・トゥナイト NEWジャズ特集',
            'upload_date': '20230408',
            'release_timestamp': 1680962400,
            'release_date': '20230408',
        },
    }, {
        # playlist, airs every weekday so it should _hopefully_ be okay forever
        'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01',
        'info_dict': {
            'id': '0458_01',
            'title': 'ベストオブクラシック',
            'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
            'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
        },
        'playlist_mincount': 3,
    }, {
        # one with letters in the id
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
        'note': 'Expires on 2024-03-31',
        'info_dict': {
            'id': 'F300_06_3738470',
            'title': '有島武郎「一房のぶどう」',
            'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)',
            'channel': 'NHKラジオ第1、NHK-FM',
            'timestamp': 1635757200,
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
            'release_date': '20161207',
            'series': 'らじる文庫 by ラジオ深夜便 ',
            'release_timestamp': 1481126700,
            'upload_date': '20211101',
        },
    }, {
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
        'skip': 'Expires on 2023-04-17',
        'info_dict': {
            'id': 'F261_01_3855109',
            'channel': 'NHKラジオ第1',
            'timestamp': 1681635900,
            'release_date': '20230416',
            'series': 'NHKラジオニュース',
            'title': '午後6時のNHKニュース',
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
            'upload_date': '20230416',
            'release_timestamp': 1681635600,
        },
    }]

    def _extract_episode_info(self, headline, programme_id, series_meta):
        episode_id = f'{programme_id}_{headline["headline_id"]}'
        episode = traverse_obj(headline, ('file_list', 0, {dict}))

        return {
            **series_meta,
            'id': episode_id,
            'formats': self._extract_m3u8_formats(episode.get('file_name'), episode_id, fatal=False),
            'container': 'm4a_dash',  # force fixup, AAC-only HLS
            'series': series_meta.get('title'),
            'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
            **traverse_obj(episode, {
                'title': 'file_title',
                'description': 'file_title_sub',
                'timestamp': ('open_time', {unified_timestamp}),
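                # aa_vinfo4 appears to hold '<start>_<end>' datetimes; keep only the start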
                'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
            }),
        }

    def _real_extract(self, url):
        site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
        programme_id = f'{site_id}_{corner_id}'

        if site_id == 'F261':  # NHK Radio News has its own rolling list endpoint
            json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
        else:
            json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'

        meta = self._download_json(json_url, programme_id)['main']

        series_meta = traverse_obj(meta, {
            'title': 'program_name',
            'channel': 'media_name',
            'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
        }, get_all=False)

        if headline_id:
            return self._extract_episode_info(
                traverse_obj(meta, (
                    'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
                programme_id, series_meta)

        def entries():
            for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
                yield self._extract_episode_info(headline, programme_id, series_meta)

        return self.playlist_result(
            entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)


class NhkRadioNewsPageIE(InfoExtractor):
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])'
    _TESTS = [{
        # airs daily, on-the-hour most hours
        'url': 'https://www.nhk.or.jp/radionews/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'F261_01',
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
            'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d',
            'channel': 'NHKラジオ第1',
            'title': 'NHKラジオニュース',
        },
    }]

    def _real_extract(self, url):
        return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE)


class NhkRadiruLiveIE(InfoExtractor):
    _GEO_COUNTRIES = ['JP']
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?P<id>r[12]|fm)'
    _TESTS = [{
        # radio 1, no area specified
        'url': 'https://www.nhk.or.jp/radio/player/?ch=r1',
        'info_dict': {
            'id': 'r1-tokyo',
            'title': 're:^NHKネットラジオ第1 東京.+$',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png',
            'live_status': 'is_live',
        },
    }, {
        # radio 2, area specified
        # (the area doesn't actually matter, r2 is national)
        'url': 'https://www.nhk.or.jp/radio/player/?ch=r2',
        'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}},
        'info_dict': {
            'id': 'r2-fukuoka',
            'title': 're:^NHKネットラジオ第2 福岡.+$',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png',
            'live_status': 'is_live',
        },
    }, {
        # fm, area specified
        'url': 'https://www.nhk.or.jp/radio/player/?ch=fm',
        'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}},
        'info_dict': {
            'id': 'fm-sapporo',
            'title': 're:^NHKネットラジオFM 札幌.+$',
            'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png',
            'live_status': 'is_live',
        },
    }]

    _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'}
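    # maps the player channel ids to their keys inside the 'nowonair_list' JSON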

    def _real_extract(self, url):
        station = self._match_id(url)
        area = self._configuration_arg('area', ['tokyo'])[0]
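        # e.g. yt-dlp --extractor-args 'nhkradirulive:area=sapporo'; defaults to tokyo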

        config = self._download_xml(
            'https://www.nhk.or.jp/radio/config/config_web.xml', station, 'Downloading area information')
        data = config.find(f'.//data//area[.="{area}"]/..')

        if not data:
            raise ExtractorError('Invalid area. Valid areas are: %s' % ', '.join(
                [i.text for i in config.findall('.//data//area')]), expected=True)

        noa_info = self._download_json(
            f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text),
            station, note=f'Downloading {area} station metadata')
        present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present'))

        return {
            'title': ' '.join(traverse_obj(present_info, (('service', 'area',), 'name', {str}))),
            'id': join_nonempty(station, area),
            'thumbnails': traverse_obj(present_info, ('service', 'images', ..., {
                'url': 'url',
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
            })),
            'formats': self._extract_m3u8_formats(data.find(f'{station}hls').text, station),
            'is_live': True,
        }