import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    traverse_obj,
    unescapeHTML,
    unified_timestamp,
    urljoin,
)
class NhkBaseIE(InfoExtractor):
    """Shared helpers for the NHK World on-demand video/audio extractors."""

    _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
    _TYPE_REGEX = r'/(?P<type>video|audio)/'

    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
        # Fetch the episode list for a media/program id.  The API nests the
        # payload under data.episodes, which may be null — hence the `or []`.
        return self._download_json(
            self._API_URL_TEMPLATE % (
                'v' if is_video else 'r',
                'clip' if is_clip else 'esd',
                'episode' if is_episode else 'program',
                m_id, lang, '/all' if is_video else ''),
            m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or []

    def _extract_episode_info(self, url, episode=None):
        """Build an info dict for a single episode.

        When `episode` is None the metadata is fetched from the API;
        otherwise the supplied dict (e.g. one entry of a program listing)
        is used as-is and format extraction is deferred to NhkVodIE.
        """
        fetch_episode = episode is None
        lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
        if len(episode_id) == 7:
            # 7-character IDs are keyed by the API as XXXX-YYY
            episode_id = episode_id[:4] + '-' + episode_id[4:]

        is_video = m_type == 'video'
        if fetch_episode:
            # IDs starting with 9999 denote clips rather than full episodes
            episode = self._call_api(
                episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
        title = episode.get('sub_title_clean') or episode['sub_title']

        def get_clean_field(key):
            # Prefer the sanitized "*_clean" variant of a field when present.
            return episode.get(key + '_clean') or episode.get(key)

        series = get_clean_field('title')

        thumbnails = []
        for s, w, h in [('', 640, 360), ('_l', 1280, 720)]:
            img_path = episode.get('image' + s)
            if not img_path:
                continue
            thumbnails.append({
                'id': '%dp' % h,
                'height': h,
                'width': w,
                'url': 'https://www3.nhk.or.jp' + img_path,
            })

        info = {
            'id': episode_id + '-' + lang,
            'title': '%s - %s' % (series, title) if series and title else title,
            'description': get_clean_field('description'),
            'thumbnails': thumbnails,
            'series': series,
            'episode': title,
        }
        if is_video:
            # Video is hosted on Piksel — hand off to that extractor.
            vod_id = episode['vod_id']
            info.update({
                '_type': 'url_transparent',
                'ie_key': 'Piksel',
                'id': vod_id,
                'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id,
            })
        else:
            if fetch_episode:
                audio_path = episode['audio']['audio']
                info['formats'] = self._extract_m3u8_formats(
                    'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
                    episode_id, 'm4a', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                for f in info['formats']:
                    f['language'] = lang
                self._sort_formats(info['formats'])
            else:
                # Episode came from a listing: defer network work to NhkVodIE.
                info.update({
                    '_type': 'url_transparent',
                    'ie_key': NhkVodIE.ie_key(),
                    'url': url,
                })
        return info
class NhkVodIE(NhkBaseIE):
    """Extractor for individual NHK World on-demand video/audio episodes."""

    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg
    _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    # Content available only for a limited period of time. Visit
    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
    _TESTS = [{
        # video clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
        'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
        'info_dict': {
            'id': 'a95j5iza',
            'ext': 'mp4',
            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
            'timestamp': 1565965194,
            'upload_date': '20190816',
        },
    }, {
        # audio clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
        'info_dict': {
            'id': 'r_inventions-20201104-1-en',
            'ext': 'm4a',
            'title': "Japan's Top Inventions - Miniature Video Cameras",
            'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
        'only_matching': True,
    }, {
        # video, alphabetic character in ID #29670
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
        'only_matching': True,
        'info_dict': {
            'id': 'qfjay6cg',
            'ext': 'mp4',
            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
            'upload_date': '20210615',
            'timestamp': 1623722008,
        }
    }]

    def _real_extract(self, url):
        return self._extract_episode_info(url)
class NhkVodProgramIE(NhkBaseIE):
    """Playlist extractor for NHK World on-demand program pages."""

    _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    _TESTS = [{
        # video program episodes
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 1,
    }, {
        # video program clips
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
        'only_matching': True,
    }, {
        # audio program
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        lang, m_type, program_id, episode_type = self._match_valid_url(url).groups()

        episodes = self._call_api(
            program_id, lang, m_type == 'video', False, episode_type == 'clip')

        entries = []
        for episode in episodes:
            episode_path = episode.get('url')
            if not episode_path:
                continue
            entries.append(self._extract_episode_info(
                urljoin(url, episode_path), episode))

        # Use the series name of the first entry as the playlist title.
        program_title = None
        if entries:
            program_title = entries[0].get('series')

        return self.playlist_result(entries, program_id, program_title)
class NhkForSchoolBangumiIE(InfoExtractor):
    """Extractor for NHK for School program/clip player pages."""

    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
    _TESTS = [{
        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
        'info_dict': {
            'id': 'D0005150191_00003',
            'ext': 'mp4',
            'title': 'にている かな',
            'duration': 599.999,
            'timestamp': 1396414800,
            'upload_date': '20140402',
            'chapters': 'count:12'
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        program_type, video_id = self._match_valid_url(url).groups()

        webpage = self._download_webpage(
            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)

        # searches all variables
        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
        # and programObj values too
        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
        # extract all chapters
        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]

        # this is how player_core.js is actually doing (!)
        version = base_values.get('r_version') or program_values.get('version')
        if version:
            video_id = f'{video_id.split("_")[0]}_{version}'

        formats = self._extract_m3u8_formats(
            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
            video_id, ext='mp4', m3u8_id='hls')
        self._sort_formats(formats)

        duration = parse_duration(base_values.get('r_duration'))

        # Chapter end times are the next chapter's start; the last chapter
        # ends at the full duration.
        chapters = None
        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
            start_time = chapter_durations
            end_time = chapter_durations[1:] + [duration]
            chapters = [{
                'start_time': s,
                'end_time': e,
                'title': t,
            } for s, e, t in zip(start_time, end_time, chapter_titles)]

        return {
            'id': video_id,
            'title': program_values.get('name'),
            'duration': parse_duration(base_values.get('r_duration')),
            'timestamp': unified_timestamp(base_values['r_upload']),
            'formats': formats,
            'chapters': chapters,
        }
class NhkForSchoolSubjectIE(InfoExtractor):
    IE_DESC = 'Portal page for each school subjects, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
    KNOWN_SUBJECTS = (
        'rika', 'syakai', 'kokugo',
        'sansuu', 'seikatsu', 'doutoku',
        'ongaku', 'taiiku', 'zukou',
        'gijutsu', 'katei', 'sougou',
        'eigo', 'tokkatsu',
        'tokushi', 'sonota',
    )
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)

    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/',
        'info_dict': {
            'id': 'sougou',
            'title': '総合的な学習の時間',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.nhk.or.jp/school/rika/',
        'info_dict': {
            'id': 'rika',
            'title': '理科',
        },
        'playlist_mincount': 15,
    }]

    def _real_extract(self, url):
        subject_id = self._match_id(url)
        webpage = self._download_webpage(url, subject_id)

        # Each program on the subject portal links to /school/<subject>/<name>/.
        return self.playlist_from_matches(
            re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
            subject_id,
            self._html_search_regex(r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage, 'title', fatal=False),
            lambda g: urljoin(url, g.group(1)))
class NhkForSchoolProgramListIE(InfoExtractor):
    """Playlist extractor for an NHK for School program's episode list."""

    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS))
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/q/',
        'info_dict': {
            'id': 'sougou/q',
            'title': 'Q~こどものための哲学',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        program_id = self._match_id(url)

        webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)

        # Take the page title from OG/<title>/heading and strip the
        # trailing "| NHK for School" site suffix.
        title = (self._og_search_title(webpage)
                 or self._html_extract_title(webpage)
                 or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
        title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
        description = self._html_search_regex(
            r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
            webpage, 'description', fatal=False, group=0)

        bangumi_list = self._download_json(
            f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
        # they're always bangumi
        bangumis = [
            self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
            for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]

        return self.playlist_result(bangumis, program_id, title, description)