3 from .common
import InfoExtractor
class NhkBaseIE(InfoExtractor):
    # Shared plumbing for the NHK World on-demand extractors below.
    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
    _TYPE_REGEX = r'/(?P<type>video|audio)/'

    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
        """Query the NHK World VOD listing API for *m_id*.

        The boolean flags select the URL-template segments (video vs radio,
        clip vs regular, episode vs whole programme).  Returns the list found
        under ``data.episodes`` in the JSON response, or ``[]`` when absent.
        """
        template_args = (
            'v' if is_video else 'r',
            'clip' if is_clip else 'esd',
            'episode' if is_episode else 'program',
            m_id, lang, '/all' if is_video else '')
        return self._download_json(
            self._API_URL_TEMPLATE % template_args, m_id,
            query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or []
# NOTE(review): mangled extraction — statements are hard-wrapped across lines and
# several original lines (the thumbnail list / info-dict scaffolding, conditional
# guards) are missing entirely, as shown by gaps in the embedded line numbers.
# Visible intent: resolve an NHK World VOD URL to episode metadata, calling
# _call_api when the caller did not supply an episode dict; video episodes are
# forwarded as url_transparent results, audio episodes get m3u8 formats.
# Restore the exact body from upstream before editing any logic here.
31 def _extract_episode_info(self
, url
, episode
=None):
32 fetch_episode
= episode
is None
33 lang
, m_type
, episode_id
= NhkVodIE
._match
_valid
_url
(url
).groups()
34 if len(episode_id
) == 7:
35 episode_id
= episode_id
[:4] + '-' + episode_id
[4:]
37 is_video
= m_type
== 'video'
39 episode
= self
._call
_api
(
40 episode_id
, lang
, is_video
, True, episode_id
[:4] == '9999')[0]
41 title
= episode
.get('sub_title_clean') or episode
['sub_title']
43 def get_clean_field(key
):
44 return episode
.get(key
+ '_clean') or episode
.get(key
)
46 series
= get_clean_field('title')
49 for s
, w
, h
in [('', 640, 360), ('_l', 1280, 720)]:
50 img_path
= episode
.get('image' + s
)
57 'url': 'https://www3.nhk.or.jp' + img_path
,
61 'id': episode_id
+ '-' + lang
,
62 'title': '%s - %s' % (series
, title
) if series
and title
else title
,
63 'description': get_clean_field('description'),
64 'thumbnails': thumbnails
,
69 vod_id
= episode
['vod_id']
71 '_type': 'url_transparent',
73 'url': 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/' + vod_id
,
78 audio_path
= episode
['audio']['audio']
79 info
['formats'] = self
._extract
_m
3u8_formats
(
80 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path
,
81 episode_id
, 'm4a', entry_protocol
='m3u8_native',
82 m3u8_id
='hls', fatal
=False)
83 for f
in info
['formats']:
87 '_type': 'url_transparent',
88 'ie_key': NhkVodIE
.ie_key(),
# NOTE(review): mangled extraction of the NhkVodIE class header, _VALID_URL and
# its _TESTS metadata.  The structural lines of the test list ('_TESTS = [{',
# 'info_dict': {, closing braces, some 'id'/'ext'/'md5' fields) were lost in
# extraction; the field values that remain must not be re-invented here —
# restore them from the upstream source.
94 class NhkVodIE(NhkBaseIE
):
95 # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg
96 _VALID_URL
= r
'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE
._BASE
_URL
_REGEX
, NhkBaseIE
._TYPE
_REGEX
)
97 # Content available only for a limited period of time. Visit
98 # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
100 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/',
104 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898',
105 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)',
106 'upload_date': '20230514',
107 'timestamp': 1684083791,
108 'series': 'GRAND SUMO Highlights',
109 'episode': '[Recap] May Tournament Day 1 (Opening Day)',
110 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080',
114 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
115 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
119 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
120 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
121 'timestamp': 1565965194,
122 'upload_date': '20190816',
123 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080',
124 'series': 'Dining with the Chef',
125 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
129 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
131 'id': 'r_inventions-20201104-1-en',
133 'title': "Japan's Top Inventions - Miniature Video Cameras",
134 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
136 'skip': '404 Not Found',
138 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
139 'only_matching': True,
141 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
142 'only_matching': True,
144 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
145 'only_matching': True,
147 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
148 'only_matching': True,
150 # video, alphabetic character in ID #29670
151 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
155 'title': 'DESIGN TALKS plus - Fishermen’s Finery',
156 'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
157 'thumbnail': r
're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
158 'upload_date': '20210615',
159 'timestamp': 1623722008,
161 'skip': '404 Not Found',
def _real_extract(self, url):
    # All of the heavy lifting lives in the shared base-class helper;
    # this entry point just forwards the matched URL to it.
    return self._extract_episode_info(url)
# NOTE(review): mangled extraction of NhkVodProgramIE.  Test-list scaffolding and
# several body lines of _real_extract are missing (e.g. the 'entries = []'
# initialiser and the guard around empty episode paths implied by the
# 'continue'-less loop below).  Visible intent: fetch a programme's episode list
# via _call_api and build a playlist of per-episode results; the playlist title
# appears to come from the first entry's 'series' field.  Restore the exact body
# from upstream before editing logic here.
168 class NhkVodProgramIE(NhkBaseIE
):
169 _VALID_URL
= r
'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE
._BASE
_URL
_REGEX
, NhkBaseIE
._TYPE
_REGEX
)
171 # video program episodes
172 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo',
175 'title': 'GRAND SUMO Highlights',
177 'playlist_mincount': 12,
179 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
181 'id': 'japanrailway',
182 'title': 'Japan Railway Journal',
184 'playlist_mincount': 12,
186 # video program clips
187 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
189 'id': 'japanrailway',
190 'title': 'Japan Railway Journal',
192 'playlist_mincount': 5,
194 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
195 'only_matching': True,
198 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
199 'only_matching': True,
202 def _real_extract(self
, url
):
203 lang
, m_type
, program_id
, episode_type
= self
._match
_valid
_url
(url
).groups()
205 episodes
= self
._call
_api
(
206 program_id
, lang
, m_type
== 'video', False, episode_type
== 'clip')
209 for episode
in episodes
:
210 episode_path
= episode
.get('url')
213 entries
.append(self
._extract
_episode
_info
(
214 urljoin(url
, episode_path
), episode
))
218 program_title
= entries
[0].get('series')
220 return self
.playlist_result(entries
, program_id
, program_title
)
# NOTE(review): mangled extraction of NhkForSchoolBangumiIE.  Test scaffolding,
# the chapters construction, and the final info-dict braces are missing.
# Visible intent: scrape JS variables (base_values / program_values) from the
# movie page, derive the versioned video id the same way player_core.js does,
# pull HLS formats from akamaihd, and assemble chapters from the parallel
# chapterTime / cpTitle lists when their lengths match.  The multi-line regex
# and f-string fragments below were split mid-literal by the extraction —
# restore the exact body from upstream before editing.
223 class NhkForSchoolBangumiIE(InfoExtractor
):
224 _VALID_URL
= r
'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
226 'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
228 'id': 'D0005150191_00003',
231 'timestamp': 1396414800,
233 'upload_date': '20140402',
236 'chapters': 'count:12'
240 'skip_download': True,
244 def _real_extract(self
, url
):
245 program_type
, video_id
= self
._match
_valid
_url
(url
).groups()
247 webpage
= self
._download
_webpage
(
248 f
'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id
)
250 # searches all variables
251 base_values
= {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
252 # and programObj values too
253 program_values
= {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
254 # extract all chapters
255 chapter_durations
= [parse_duration(g
.group(1)) for g
in re
.finditer(r
'chapterTime\.push\(\'([0-9:]+?
)\'\
);', webpage)]
256 chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div
class="cpTitle"><span
>(scene\s
*\d
+)?
</span
>([^
<]+?
)</div
>', webpage)]
258 # this is how player_core.js is actually doing (!)
259 version = base_values.get('r_version
') or program_values.get('version
')
261 video_id = f'{video_id.split("_")[0]}_{version}
'
263 formats = self._extract_m3u8_formats(
264 f'https
://nhks
-vh
.akamaihd
.net
/i
/das
/{video_id[0:8]}
/{video_id}_V_000
.f4v
/master
.m3u8
',
265 video_id, ext='mp4
', m3u8_id='hls
')
267 duration = parse_duration(base_values.get('r_duration
'))
270 if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
271 start_time = chapter_durations
272 end_time = chapter_durations[1:] + [duration]
277 } for s, e, t in zip(start_time, end_time, chapter_titles)]
281 'title
': program_values.get('name
'),
282 'duration
': parse_duration(base_values.get('r_duration
')),
283 'timestamp
': unified_timestamp(base_values['r_upload
']),
285 'chapters
': chapters,
# NOTE(review): mangled extraction of NhkForSchoolSubjectIE.  The
# 'KNOWN_SUBJECTS = (' opening and some _real_extract lines are missing, and
# string literals below were split mid-literal by the extraction.  Visible
# intent: a portal-page playlist extractor that matches subject slugs from
# KNOWN_SUBJECTS and collects per-programme links via playlist_from_matches.
# Restore the exact body from upstream before editing.
289 class NhkForSchoolSubjectIE(InfoExtractor):
290 IE_DESC = 'Portal page
for each school subjects
, like
Japanese (kokugo
, 国語
) or math (sansuu
/suugaku
or 算数・数学
)'
292 'rika
', 'syakai
', 'kokugo
',
293 'sansuu
', 'seikatsu
', 'doutoku
',
294 'ongaku
', 'taiiku
', 'zukou
',
295 'gijutsu
', 'katei
', 'sougou
',
299 _VALID_URL = r'https?
://www\
.nhk\
.or\
.jp
/school
/(?P
<id>%s)/?
(?
:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)
302 'url': 'https://www.nhk.or.jp/school/sougou/',
305 'title': '総合的な学習の時間',
307 'playlist_mincount': 16,
309 'url': 'https://www.nhk.or.jp/school/rika/',
314 'playlist_mincount': 15,
317 def _real_extract(self
, url
):
318 subject_id
= self
._match
_id
(url
)
319 webpage
= self
._download
_webpage
(url
, subject_id
)
321 return self
.playlist_from_matches(
322 re
.finditer(rf
'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage
),
324 self
._html
_search
_regex
(r
'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage
, 'title', fatal
=False),
325 lambda g
: urljoin(url
, g
.group(1)))
# NOTE(review): mangled extraction of NhkForSchoolProgramListIE.  The
# '_TESTS = [{' scaffolding and the 'bangumis = [' opening of the list
# comprehension near the end are missing.  Visible intent: build a playlist of
# NhkForSchoolBangumiIE URLs from the programme's meta/program.json, with the
# page title (stripped of the '| NHK for School' suffix) and a scraped
# description.  Restore the exact body from upstream before editing.
328 class NhkForSchoolProgramListIE(InfoExtractor
):
329 _VALID_URL
= r
'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
330 '|'.join(re
.escape(s
) for s
in NhkForSchoolSubjectIE
.KNOWN_SUBJECTS
)
333 'url': 'https://www.nhk.or.jp/school/sougou/q/',
336 'title': 'Q~こどものための哲学',
338 'playlist_mincount': 20,
341 def _real_extract(self
, url
):
342 program_id
= self
._match
_id
(url
)
344 webpage
= self
._download
_webpage
(f
'https://www.nhk.or.jp/school/{program_id}/', program_id
)
346 title
= (self
._generic
_title
('', webpage
)
347 or self
._html
_search
_regex
(r
'<h3>([^<]+?)とは?\s*</h3>', webpage
, 'title', fatal
=False))
348 title
= re
.sub(r
'\s*\|\s*NHK\s+for\s+School\s*$', '', title
) if title
else None
349 description
= self
._html
_search
_regex
(
350 r
'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
351 webpage
, 'description', fatal
=False, group
=0)
353 bangumi_list
= self
._download
_json
(
354 f
'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id
)
355 # they're always bangumi
357 self
.url_result(f
'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
358 for x
in traverse_obj(bangumi_list
, ('part', ..., 'part-video-dasid')) or []]
360 return self
.playlist_result(bangumis
, program_id
, title
, description
)
# NOTE(review): mangled extraction of NhkRadiruIE (NHK らじる on-demand radio).
# Test-list scaffolding and several structural lines of both methods are
# missing (the info-dict opening of _extract_episode_info, the else branch /
# entries-generator header of _real_extract).  Visible intent:
#  - _extract_episode_info: build per-episode metadata from a 'headline' dict,
#    taking HLS formats from file_list[0].file_name and forcing the m4a_dash
#    container for AAC-only HLS;
#  - _real_extract: pick the bangumi JSON (a special all.json feed for news
#    site F261), then either extract the single matching headline or yield a
#    playlist over detail_list.
# Restore the exact body from upstream before editing.
363 class NhkRadiruIE(InfoExtractor
):
364 _GEO_COUNTRIES
= ['JP']
365 IE_DESC
= 'NHK らじる (Radiru/Rajiru)'
366 _VALID_URL
= r
'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
368 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
369 'skip': 'Episode expired on 2023-04-16',
372 'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
374 'id': '0449_01_3853544',
375 'series': 'ジャズ・トゥナイト',
376 'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
377 'timestamp': 1680969600,
378 'title': 'ジャズ・トゥナイト NEWジャズ特集',
379 'upload_date': '20230408',
380 'release_timestamp': 1680962400,
381 'release_date': '20230408',
385 # playlist, airs every weekday so it should _hopefully_ be okay forever
386 'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01',
389 'title': 'ベストオブクラシック',
390 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
392 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
394 'playlist_mincount': 3,
396 # one with letters in the id
397 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
398 'note': 'Expires on 2024-03-31',
400 'id': 'F300_06_3738470',
402 'title': '有島武郎「一房のぶどう」',
403 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)',
404 'channel': 'NHKラジオ第1、NHK-FM',
405 'timestamp': 1635757200,
406 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
407 'release_date': '20161207',
408 'series': 'らじる文庫 by ラジオ深夜便 ',
409 'release_timestamp': 1481126700,
410 'upload_date': '20211101',
414 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
415 'skip': 'Expires on 2023-04-17',
417 'id': 'F261_01_3855109',
419 'channel': 'NHKラジオ第1',
420 'timestamp': 1681635900,
421 'release_date': '20230416',
422 'series': 'NHKラジオニュース',
423 'title': '午後6時のNHKニュース',
424 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
425 'upload_date': '20230416',
426 'release_timestamp': 1681635600,
430 def _extract_episode_info(self
, headline
, programme_id
, series_meta
):
431 episode_id
= f
'{programme_id}_{headline["headline_id"]}'
432 episode
= traverse_obj(headline
, ('file_list', 0, {dict}
))
437 'formats': self
._extract
_m
3u8_formats
(episode
.get('file_name'), episode_id
, fatal
=False),
438 'container': 'm4a_dash', # force fixup, AAC-only HLS
440 'series': series_meta
.get('title'),
441 'thumbnail': url_or_none(headline
.get('headline_image')) or series_meta
.get('thumbnail'),
442 **traverse_obj(episode
, {
443 'title': 'file_title',
444 'description': 'file_title_sub',
445 'timestamp': ('open_time', {unified_timestamp}
),
446 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}
, {unified_timestamp}
),
450 def _real_extract(self
, url
):
451 site_id
, corner_id
, headline_id
= self
._match
_valid
_url
(url
).group('site', 'corner', 'headline')
452 programme_id
= f
'{site_id}_{corner_id}'
454 if site_id
== 'F261':
455 json_url
= 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
457 json_url
= f
'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'
459 meta
= self
._download
_json
(json_url
, programme_id
)['main']
461 series_meta
= traverse_obj(meta
, {
462 'title': 'program_name',
463 'channel': 'media_name',
464 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}
),
468 return self
._extract
_episode
_info
(
470 'detail_list', lambda _
, v
: v
['headline_id'] == headline_id
), get_all
=False),
471 programme_id
, series_meta
)
474 for headline
in traverse_obj(meta
, ('detail_list', ..., {dict}
)):
475 yield self
._extract
_episode
_info
(headline
, programme_id
, series_meta
)
477 return self
.playlist_result(
478 entries(), programme_id
, playlist_description
=meta
.get('site_detail'), **series_meta
)
# NOTE(review): mangled extraction of the NhkRadioNewsPageIE class header,
# _VALID_URL and _TESTS metadata; the test list's structural braces are
# missing.  This extractor covers the nhk.or.jp/radionews/ landing page.
# Restore the exact scaffolding from upstream before editing.
481 class NhkRadioNewsPageIE(InfoExtractor
):
482 _VALID_URL
= r
'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])'
484 # airs daily, on-the-hour most hours
485 'url': 'https://www.nhk.or.jp/radionews/',
486 'playlist_mincount': 5,
489 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
490 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d',
491 'channel': 'NHKラジオ第1',
492 'title': 'NHKラジオニュース',
def _real_extract(self, url):
    # The radio-news page is a façade over the F261_01 on-demand programme;
    # hand the canonical detail URL to NhkRadiruIE for actual extraction.
    return self.url_result(
        'https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE)
# NOTE(review): mangled extraction of NhkRadiruLiveIE (live radio streams).
# Test scaffolding and several body lines are missing, and the block is
# truncated — the return-dict of _real_extract runs past the end of this
# excerpt.  Visible intent: look the configured area up in config_web.xml
# (raising ExtractorError with the valid area list on a miss), fetch
# now-on-air metadata per station via _NOA_STATION_IDS, and return a live
# HLS stream from the station's '{station}hls' config entry.  Restore the
# exact body from upstream before editing.
500 class NhkRadiruLiveIE(InfoExtractor
):
501 _GEO_COUNTRIES
= ['JP']
502 _VALID_URL
= r
'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?P<id>r[12]|fm)'
504 # radio 1, no area specified
505 'url': 'https://www.nhk.or.jp/radio/player/?ch=r1',
508 'title': 're:^NHKネットラジオ第1 東京.+$',
510 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png',
511 'live_status': 'is_live',
514 # radio 2, area specified
515 # (the area doesnt actually matter, r2 is national)
516 'url': 'https://www.nhk.or.jp/radio/player/?ch=r2',
517 'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}
}},
520 'title': 're:^NHKネットラジオ第2 福岡.+$',
522 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png',
523 'live_status': 'is_live',
527 'url': 'https://www.nhk.or.jp/radio/player/?ch=fm',
528 'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}
}},
531 'title': 're:^NHKネットラジオFM 札幌.+$',
533 'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png',
534 'live_status': 'is_live',
538 _NOA_STATION_IDS
= {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'}
540 def _real_extract(self
, url
):
541 station
= self
._match
_id
(url
)
542 area
= self
._configuration
_arg
('area', ['tokyo'])[0]
544 config
= self
._download
_xml
(
545 'https://www.nhk.or.jp/radio/config/config_web.xml', station
, 'Downloading area information')
546 data
= config
.find(f
'.//data//area[.="{area}"]/..')
549 raise ExtractorError('Invalid area. Valid areas are: %s' % ', '.join(
550 [i
.text
for i
in config
.findall('.//data//area')]), expected
=True)
552 noa_info
= self
._download
_json
(
553 f
'https:{config.find(".//url_program_noa").text}'.format(area
=data
.find('areakey').text
),
554 station
, note
=f
'Downloading {area} station metadata')
555 present_info
= traverse_obj(noa_info
, ('nowonair_list', self
._NOA
_STATION
_IDS
.get(station
), 'present'))
558 'title': ' '.join(traverse_obj(present_info
, (('service', 'area',), 'name', {str}
))),
559 'id': join_nonempty(station
, area
),
560 'thumbnails': traverse_obj(present_info
, ('service', 'images', ..., {
562 'width': ('width', {int_or_none}
),
563 'height': ('height', {int_or_none}
),
565 'formats': self
._extract
_m
3u8_formats
(data
.find(f
'{station}hls').text
, station
),