from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    traverse_obj,
    unescapeHTML,
    unified_timestamp,
    urljoin,
)
15 class NhkBaseIE(InfoExtractor
):
16 _API_URL_TEMPLATE
= 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'
17 _BASE_URL_REGEX
= r
'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
18 _TYPE_REGEX
= r
'/(?P<type>video|audio)/'
20 def _call_api(self
, m_id
, lang
, is_video
, is_episode
, is_clip
):
21 return self
._download
_json
(
22 self
._API
_URL
_TEMPLATE
% (
23 'v' if is_video
else 'r',
24 'clip' if is_clip
else 'esd',
25 'episode' if is_episode
else 'program',
26 m_id
, lang
, '/all' if is_video
else ''),
27 m_id
, query
={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'}
)['data']['episodes'] or []
29 def _extract_episode_info(self
, url
, episode
=None):
30 fetch_episode
= episode
is None
31 lang
, m_type
, episode_id
= NhkVodIE
._match
_valid
_url
(url
).groups()
32 if episode_id
.isdigit():
33 episode_id
= episode_id
[:4] + '-' + episode_id
[4:]
35 is_video
= m_type
== 'video'
37 episode
= self
._call
_api
(
38 episode_id
, lang
, is_video
, True, episode_id
[:4] == '9999')[0]
39 title
= episode
.get('sub_title_clean') or episode
['sub_title']
41 def get_clean_field(key
):
42 return episode
.get(key
+ '_clean') or episode
.get(key
)
44 series
= get_clean_field('title')
47 for s
, w
, h
in [('', 640, 360), ('_l', 1280, 720)]:
48 img_path
= episode
.get('image' + s
)
55 'url': 'https://www3.nhk.or.jp' + img_path
,
59 'id': episode_id
+ '-' + lang
,
60 'title': '%s - %s' % (series
, title
) if series
and title
else title
,
61 'description': get_clean_field('description'),
62 'thumbnails': thumbnails
,
67 vod_id
= episode
['vod_id']
69 '_type': 'url_transparent',
71 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id
,
76 audio_path
= episode
['audio']['audio']
77 info
['formats'] = self
._extract
_m
3u8_formats
(
78 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path
,
79 episode_id
, 'm4a', entry_protocol
='m3u8_native',
80 m3u8_id
='hls', fatal
=False)
81 for f
in info
['formats']:
83 self
._sort
_formats
(info
['formats'])
86 '_type': 'url_transparent',
87 'ie_key': NhkVodIE
.ie_key(),
93 class NhkVodIE(NhkBaseIE
):
94 _VALID_URL
= r
'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE
._BASE
_URL
_REGEX
, NhkBaseIE
._TYPE
_REGEX
)
95 # Content available only for a limited period of time. Visit
96 # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
99 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
100 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
104 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
105 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
106 'timestamp': 1565965194,
107 'upload_date': '20190816',
111 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
113 'id': 'r_inventions-20201104-1-en',
115 'title': "Japan's Top Inventions - Miniature Video Cameras",
116 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
120 'skip_download': True,
123 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
124 'only_matching': True,
126 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
127 'only_matching': True,
129 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
130 'only_matching': True,
132 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
133 'only_matching': True,
136 def _real_extract(self
, url
):
137 return self
._extract
_episode
_info
(url
)
140 class NhkVodProgramIE(NhkBaseIE
):
141 _VALID_URL
= r
'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE
._BASE
_URL
_REGEX
, NhkBaseIE
._TYPE
_REGEX
)
143 # video program episodes
144 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
146 'id': 'japanrailway',
147 'title': 'Japan Railway Journal',
149 'playlist_mincount': 1,
151 # video program clips
152 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
154 'id': 'japanrailway',
155 'title': 'Japan Railway Journal',
157 'playlist_mincount': 5,
159 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
160 'only_matching': True,
163 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
164 'only_matching': True,
167 def _real_extract(self
, url
):
168 lang
, m_type
, program_id
, episode_type
= self
._match
_valid
_url
(url
).groups()
170 episodes
= self
._call
_api
(
171 program_id
, lang
, m_type
== 'video', False, episode_type
== 'clip')
174 for episode
in episodes
:
175 episode_path
= episode
.get('url')
178 entries
.append(self
._extract
_episode
_info
(
179 urljoin(url
, episode_path
), episode
))
183 program_title
= entries
[0].get('series')
185 return self
.playlist_result(entries
, program_id
, program_title
)
188 class NhkForSchoolBangumiIE(InfoExtractor
):
189 _VALID_URL
= r
'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
191 'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
193 'id': 'D0005150191_00003',
196 'timestamp': 1396414800,
198 'upload_date': '20140402',
201 'chapters': 'count:12'
205 'skip_download': True,
209 def _real_extract(self
, url
):
210 program_type
, video_id
= self
._match
_valid
_url
(url
).groups()
212 webpage
= self
._download
_webpage
(
213 f
'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id
)
215 # searches all variables
216 base_values
= {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
217 # and programObj values too
218 program_values
= {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
219 # extract all chapters
220 chapter_durations
= [parse_duration(g
.group(1)) for g
in re
.finditer(r
'chapterTime\.push\(\'([0-9:]+?
)\'\
);', webpage)]
221 chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div
class="cpTitle"><span
>(scene\s
*\d
+)?
</span
>([^
<]+?
)</div
>', webpage)]
223 # this is how player_core.js is actually doing (!)
224 version = base_values.get('r_version
') or program_values.get('version
')
226 video_id = f'{video_id.split("_")[0]}_{version}
'
228 formats = self._extract_m3u8_formats(
229 f'https
://nhks
-vh
.akamaihd
.net
/i
/das
/{video_id[0:8]}
/{video_id}_V_000
.f4v
/master
.m3u8
',
230 video_id, ext='mp4
', m3u8_id='hls
')
231 self._sort_formats(formats)
233 duration = parse_duration(base_values.get('r_duration
'))
236 if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
237 start_time = chapter_durations
238 end_time = chapter_durations[1:] + [duration]
243 } for s, e, t in zip(start_time, end_time, chapter_titles)]
247 'title
': program_values.get('name
'),
248 'duration
': parse_duration(base_values.get('r_duration
')),
249 'timestamp
': unified_timestamp(base_values['r_upload
']),
251 'chapters
': chapters,
255 class NhkForSchoolSubjectIE(InfoExtractor):
256 IE_DESC = 'Portal page
for each school subjects
, like
Japanese (kokugo
, 国語
) or math (sansuu
/suugaku
or 算数・数学
)'
258 'rika
', 'syakai
', 'kokugo
',
259 'sansuu
', 'seikatsu
', 'doutoku
',
260 'ongaku
', 'taiiku
', 'zukou
',
261 'gijutsu
', 'katei
', 'sougou
',
265 _VALID_URL = r'https?
://www\
.nhk\
.or\
.jp
/school
/(?P
<id>%s)/?
(?
:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)
268 'url': 'https://www.nhk.or.jp/school/sougou/',
271 'title': '総合的な学習の時間',
273 'playlist_mincount': 16,
275 'url': 'https://www.nhk.or.jp/school/rika/',
280 'playlist_mincount': 15,
283 def _real_extract(self
, url
):
284 subject_id
= self
._match
_id
(url
)
285 webpage
= self
._download
_webpage
(url
, subject_id
)
287 return self
.playlist_from_matches(
288 re
.finditer(rf
'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage
),
290 self
._html
_search
_regex
(r
'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage
, 'title', fatal
=False),
291 lambda g
: urljoin(url
, g
.group(1)))
294 class NhkForSchoolProgramListIE(InfoExtractor
):
295 _VALID_URL
= r
'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
296 '|'.join(re
.escape(s
) for s
in NhkForSchoolSubjectIE
.KNOWN_SUBJECTS
)
299 'url': 'https://www.nhk.or.jp/school/sougou/q/',
302 'title': 'Q~こどものための哲学',
304 'playlist_mincount': 20,
307 def _real_extract(self
, url
):
308 program_id
= self
._match
_id
(url
)
310 webpage
= self
._download
_webpage
(f
'https://www.nhk.or.jp/school/{program_id}/', program_id
)
312 title
= self
._og
_search
_title
(webpage
, fatal
=False) or self
._html
_extract
_title
(webpage
, fatal
=False) or self
._html
_search
_regex
(r
'<h3>([^<]+?)とは?\s*</h3>', webpage
, 'title', fatal
=False)
313 title
= re
.sub(r
'\s*\|\s*NHK\s+for\s+School\s*$', '', title
) if title
else None
314 description
= self
._html
_search
_regex
(
315 r
'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
316 webpage
, 'description', fatal
=False, group
=0)
318 bangumi_list
= self
._download
_json
(
319 f
'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id
)
320 # they're always bangumi
322 self
.url_result(f
'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
323 for x
in traverse_obj(bangumi_list
, ('part', ..., 'part-video-dasid')) or []]
325 return self
.playlist_result(bangumis
, program_id
, title
, description
)