import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    traverse_obj,
    unescapeHTML,
    unified_timestamp,
    urljoin,
)
class NhkBaseIE(InfoExtractor):
    """Shared plumbing for the NHK World on-demand extractors below."""
    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
    _TYPE_REGEX = r'/(?P<type>video|audio)/'

    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
        # Query the NHK World listing API; the 'episodes' field can be null,
        # so normalize that to an empty list for callers that iterate it.
        return self._download_json(
            self._API_URL_TEMPLATE % (
                'v' if is_video else 'r',
                'clip' if is_clip else 'esd',
                'episode' if is_episode else 'program',
                m_id, lang, '/all' if is_video else ''),
            m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or []

    def _extract_episode_info(self, url, episode=None):
        """Build an info dict for a single episode.

        When *episode* is None the metadata is fetched from the API;
        otherwise the caller-supplied dict (from a program listing) is used.
        """
        fetch_episode = episode is None
        lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
        if len(episode_id) == 7:
            # 7-char IDs are formatted as XXXX-YYY by the API
            episode_id = episode_id[:4] + '-' + episode_id[4:]

        is_video = m_type == 'video'
        if fetch_episode:
            # IDs starting with 9999 denote clips rather than full episodes
            episode = self._call_api(
                episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
        title = episode.get('sub_title_clean') or episode['sub_title']

        def get_clean_field(key):
            # Prefer the sanitized '*_clean' variant when present
            return episode.get(key + '_clean') or episode.get(key)

        series = get_clean_field('title')

        thumbnails = []
        for s, w, h in [('', 640, 360), ('_l', 1280, 720)]:
            img_path = episode.get('image' + s)
            if not img_path:
                continue
            thumbnails.append({
                'id': '%dp' % h,
                'height': h,
                'width': w,
                'url': 'https://www3.nhk.or.jp' + img_path,
            })

        info = {
            'id': episode_id + '-' + lang,
            'title': '%s - %s' % (series, title) if series and title else title,
            'description': get_clean_field('description'),
            'thumbnails': thumbnails,
            'series': series,
            'episode': title,
        }
        if is_video:
            # Videos are hosted on Piksel; delegate to that extractor
            vod_id = episode['vod_id']
            info.update({
                '_type': 'url_transparent',
                'ie_key': 'Piksel',
                'id': vod_id,
                'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id,
            })
        else:
            if fetch_episode:
                audio_path = episode['audio']['audio']
                info['formats'] = self._extract_m3u8_formats(
                    'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
                    episode_id, 'm4a', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                for f in info['formats']:
                    f['language'] = lang
            else:
                # Listing pass: defer the actual extraction to NhkVodIE
                info.update({
                    '_type': 'url_transparent',
                    'ie_key': NhkVodIE.ie_key(),
                    'url': url,
                })
        return info
class NhkVodIE(NhkBaseIE):
    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg
    _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    # Content available only for a limited period of time. Visit
    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
    _TESTS = [{
        # video clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
        'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
        'info_dict': {
            'id': 'a95j5iza',
            'ext': 'mp4',
            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
            'timestamp': 1565965194,
            'upload_date': '20190816',
        },
    }, {
        # audio clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
        'info_dict': {
            'id': 'r_inventions-20201104-1-en',
            'ext': 'm4a',
            'title': "Japan's Top Inventions - Miniature Video Cameras",
            'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
        'only_matching': True,
    }, {
        # video, alphabetic character in ID #29670
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
        'only_matching': True,
        'info_dict': {
            'id': 'qfjay6cg',
            'ext': 'mp4',
            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
            'upload_date': '20210615',
            'timestamp': 1623722008,
        },
    }]

    def _real_extract(self, url):
        # All the heavy lifting lives in the shared base class
        return self._extract_episode_info(url)
class NhkVodProgramIE(NhkBaseIE):
    _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    _TESTS = [{
        # video program episodes
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 1,
    }, {
        # video program clips
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
        'only_matching': True,
    }, {
        # audio program
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist of all episodes (or clips) of a program."""
        lang, m_type, program_id, episode_type = self._match_valid_url(url).groups()

        episodes = self._call_api(
            program_id, lang, m_type == 'video', False, episode_type == 'clip')

        entries = []
        for episode in episodes:
            episode_path = episode.get('url')
            if not episode_path:
                continue
            entries.append(self._extract_episode_info(
                urljoin(url, episode_path), episode))

        # Use the series name of the first entry as the playlist title
        program_title = None
        if entries:
            program_title = entries[0].get('series')

        return self.playlist_result(entries, program_id, program_title)
class NhkForSchoolBangumiIE(InfoExtractor):
    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
    _TESTS = [{
        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
        'info_dict': {
            'id': 'D0005150191_00003',
            'ext': 'mp4',
            'timestamp': 1396414800,
            'upload_date': '20140402',
            'chapters': 'count:12',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        program_type, video_id = self._match_valid_url(url).groups()

        webpage = self._download_webpage(
            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)

        # searches all variables
        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
        # and programObj values too
        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
        # extract all chapters
        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]

        # this is how player_core.js is actually doing (!)
        version = base_values.get('r_version') or program_values.get('version')
        if version:
            video_id = f'{video_id.split("_")[0]}_{version}'

        formats = self._extract_m3u8_formats(
            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
            video_id, ext='mp4', m3u8_id='hls')

        duration = parse_duration(base_values.get('r_duration'))

        # Chapters are only trustworthy when every timestamp has a title
        chapters = None
        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
            start_time = chapter_durations
            end_time = chapter_durations[1:] + [duration]
            chapters = [{
                'start_time': s,
                'end_time': e,
                'title': t,
            } for s, e, t in zip(start_time, end_time, chapter_titles)]

        return {
            'id': video_id,
            'title': program_values.get('name'),
            'duration': parse_duration(base_values.get('r_duration')),
            'timestamp': unified_timestamp(base_values['r_upload']),
            'formats': formats,
            'chapters': chapters,
        }
class NhkForSchoolSubjectIE(InfoExtractor):
    IE_DESC = 'Portal page for each school subjects, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
    # NOTE(review): the source view truncates this tuple after 'sougou';
    # the trailing entries are restored from upstream — confirm against HEAD.
    KNOWN_SUBJECTS = (
        'rika', 'syakai', 'kokugo',
        'sansuu', 'seikatsu', 'doutoku',
        'ongaku', 'taiiku', 'zukou',
        'gijutsu', 'katei', 'sougou',
        'eigo', 'tokkatsu',
        'tokushi', 'sonota',
    )
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)

    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/',
        'info_dict': {
            'id': 'sougou',
            'title': '総合的な学習の時間',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.nhk.or.jp/school/rika/',
        'info_dict': {
            'id': 'rika',
            'title': '理科',
        },
        'playlist_mincount': 15,
    }]

    def _real_extract(self, url):
        subject_id = self._match_id(url)
        webpage = self._download_webpage(url, subject_id)

        return self.playlist_from_matches(
            re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
            subject_id,
            self._html_search_regex(r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage, 'title', fatal=False),
            lambda g: urljoin(url, g.group(1)))
class NhkForSchoolProgramListIE(InfoExtractor):
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)
    )
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/q/',
        'info_dict': {
            'id': 'sougou/q',
            'title': 'Q~こどものための哲学',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        """Return a playlist of every bangumi episode of a program."""
        program_id = self._match_id(url)

        webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)

        title = (self._generic_title('', webpage)
                 or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
        # Strip the site-wide "| NHK for School" suffix from the page title
        title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
        description = self._html_search_regex(
            r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
            webpage, 'description', fatal=False, group=0)

        bangumi_list = self._download_json(
            f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
        # they're always bangumi
        bangumis = [
            self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
            for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]

        return self.playlist_result(bangumis, program_id, title, description)