import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    traverse_obj,
    unescapeHTML,
    unified_timestamp,
    urljoin,
)
class NhkBaseIE(InfoExtractor):
    # Shared logic for the NHK World on-demand extractors below. Subclasses
    # compose their _VALID_URL from _BASE_URL_REGEX and _TYPE_REGEX.
    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'
    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
    _TYPE_REGEX = r'/(?P<type>video|audio)/'

    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
        """Query the NHK World OD list API.

        m_id is an episode id or a program id depending on is_episode.
        Returns a (possibly empty) list of episode metadata dicts.
        """
        return self._download_json(
            self._API_URL_TEMPLATE % (
                'v' if is_video else 'r',
                'clip' if is_clip else 'esd',
                'episode' if is_episode else 'program',
                m_id, lang, '/all' if is_video else ''),
            m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or []

    def _extract_episode_info(self, url, episode=None):
        """Build an info dict for a single episode.

        When episode is None the metadata is fetched from the API; otherwise
        the supplied episode dict (as returned by _call_api) is used as-is.
        """
        fetch_episode = episode is None
        lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
        if episode_id.isdigit():
            # 7-digit video ids are rendered as NNNN-NNN by the API
            episode_id = episode_id[:4] + '-' + episode_id[4:]

        is_video = m_type == 'video'
        if fetch_episode:
            # ids whose first four digits are 9999 are clips, not episodes
            episode = self._call_api(
                episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
        title = episode.get('sub_title_clean') or episode['sub_title']

        def get_clean_field(key):
            # prefer the sanitized '<key>_clean' variant when the API has one
            return episode.get(key + '_clean') or episode.get(key)

        series = get_clean_field('title')

        thumbnails = []
        for s, w, h in [('', 640, 360), ('_l', 1280, 720)]:
            img_path = episode.get('image' + s)
            if not img_path:
                continue
            thumbnails.append({
                'id': '%dp' % h,
                'height': h,
                'width': w,
                'url': 'https://www3.nhk.or.jp' + img_path,
            })

        info = {
            'id': episode_id + '-' + lang,
            'title': '%s - %s' % (series, title) if series and title else title,
            'description': get_clean_field('description'),
            'thumbnails': thumbnails,
            'series': series,
            'episode': title,
        }
        if is_video:
            # video episodes are hosted on Piksel; hand off via url_transparent
            vod_id = episode['vod_id']
            info.update({
                '_type': 'url_transparent',
                'ie_key': 'Piksel',
                'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id,
                'id': vod_id,
            })
        else:
            if fetch_episode:
                audio_path = episode['audio']['audio']
                info['formats'] = self._extract_m3u8_formats(
                    'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
                    episode_id, 'm4a', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                for f in info['formats']:
                    f['language'] = lang
                self._sort_formats(info['formats'])
            else:
                # called from a program playlist with prefetched metadata:
                # defer format extraction to NhkVodIE at download time
                info.update({
                    '_type': 'url_transparent',
                    'ie_key': NhkVodIE.ie_key(),
                    'url': url,
                })
        return info
class NhkVodIE(NhkBaseIE):
    # Single NHK World on-demand episodes (video or audio).
    _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    # Content available only for a limited period of time. Visit
    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
    _TESTS = [{
        # video clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
        'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
        'info_dict': {
            'id': 'a95j5iza',
            'ext': 'mp4',
            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
            'timestamp': 1565965194,
            'upload_date': '20190816',
        },
    }, {
        # audio clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
        'info_dict': {
            'id': 'r_inventions-20201104-1-en',
            'ext': 'm4a',
            'title': "Japan's Top Inventions - Miniature Video Cameras",
            'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # all of the episode logic lives in the shared base class
        return self._extract_episode_info(url)
class NhkVodProgramIE(NhkBaseIE):
    # Playlist of all episodes (or clips, with ?type=clip) of one program.
    _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    _TESTS = [{
        # video program episodes
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 1,
    }, {
        # video program clips
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
        'only_matching': True,
    }, {
        # audio program
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        lang, m_type, program_id, episode_type = self._match_valid_url(url).groups()

        episodes = self._call_api(
            program_id, lang, m_type == 'video', False, episode_type == 'clip')

        entries = []
        for episode in episodes:
            episode_path = episode.get('url')
            if not episode_path:
                continue
            # pass the prefetched episode dict to avoid one API call per entry
            entries.append(self._extract_episode_info(
                urljoin(url, episode_path), episode))

        program_title = None
        if entries:
            # all entries belong to the same program, so any 'series' works
            program_title = entries[0].get('series')

        return self.playlist_result(entries, program_id, program_title)
class NhkForSchoolBangumiIE(InfoExtractor):
    # Single NHK for School movie page (bangumi.cgi / clip.cgi).
    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
    _TESTS = [{
        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
        'info_dict': {
            'id': 'D0005150191_00003',
            'ext': 'mp4',
            'title': 'にている かな',
            'duration': 599.999,
            'timestamp': 1396414800,
            'upload_date': '20140402',
            'chapters': 'count:12'
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        program_type, video_id = self._match_valid_url(url).groups()

        webpage = self._download_webpage(
            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)

        # searches all variables
        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
        # and programObj values too
        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
        # extract all chapters
        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]

        # this is how player_core.js is actually doing (!)
        version = base_values.get('r_version') or program_values.get('version')
        if version:
            video_id = f'{video_id.split("_")[0]}_{version}'

        formats = self._extract_m3u8_formats(
            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
            video_id, ext='mp4', m3u8_id='hls')
        self._sort_formats(formats)

        duration = parse_duration(base_values.get('r_duration'))

        chapters = None
        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
            # mismatched counts would misalign titles and times, so skip then
            start_time = chapter_durations
            end_time = chapter_durations[1:] + [duration]
            chapters = [{
                'start_time': s,
                'end_time': e,
                'title': t,
            } for s, e, t in zip(start_time, end_time, chapter_titles)]

        return {
            'id': video_id,
            'title': program_values.get('name'),
            'duration': parse_duration(base_values.get('r_duration')),
            'timestamp': unified_timestamp(base_values['r_upload']),
            'formats': formats,
            'chapters': chapters,
        }
class NhkForSchoolSubjectIE(InfoExtractor):
    IE_DESC = 'Portal page for each school subjects, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
    # romanized subject slugs that appear in /school/<subject>/ URLs
    KNOWN_SUBJECTS = (
        'rika', 'syakai', 'kokugo',
        'sansuu', 'seikatsu', 'doutoku',
        'ongaku', 'taiiku', 'zukou',
        'gijutsu', 'katei', 'sougou',
        'eigo', 'tokkatsu',
        'tokushi', 'sonota',
    )
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)

    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/',
        'info_dict': {
            'id': 'sougou',
            'title': '総合的な学習の時間',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.nhk.or.jp/school/rika/',
        'info_dict': {
            'id': 'rika',
            'title': '理科',
        },
        'playlist_mincount': 15,
    }]

    def _real_extract(self, url):
        subject_id = self._match_id(url)
        webpage = self._download_webpage(url, subject_id)

        # every program of the subject is linked as /school/<subject>/<slug>/
        return self.playlist_from_matches(
            re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
            subject_id,
            self._html_search_regex(r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage, 'title', fatal=False),
            lambda g: urljoin(url, g.group(1)))
class NhkForSchoolProgramListIE(InfoExtractor):
    # Playlist of all episodes of one NHK for School program.
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)
    )
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/q/',
        'info_dict': {
            'id': 'sougou/q',
            'title': 'Q~こどものための哲学',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        program_id = self._match_id(url)

        webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)

        title = (self._og_search_title(webpage)
                 or self._html_extract_title(webpage)
                 or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
        # strip the site-wide ' | NHK for School' suffix from the page title
        title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
        description = self._html_search_regex(
            r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
            webpage, 'description', fatal=False, group=0)

        bangumi_list = self._download_json(
            f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
        # they're always bangumi
        bangumis = [
            self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
            for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]

        return self.playlist_result(bangumis, program_id, title, description)