]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nhk.py
[cleanup] Use `_html_extract_title`
[yt-dlp.git] / yt_dlp / extractor / nhk.py
CommitLineData
298a120a
AN
1from __future__ import unicode_literals
2
77cc7c6e 3import re
061d1cd9 4
298a120a 5from .common import InfoExtractor
77cc7c6e
LNO
6from ..utils import (
7 parse_duration,
8 traverse_obj,
9 unescapeHTML,
10 unified_timestamp,
11 urljoin
12)
298a120a
AN
13
14
class NhkBaseIE(InfoExtractor):
    """Shared logic for NHK World on-demand video/audio extractors."""

    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'
    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
    _TYPE_REGEX = r'/(?P<type>video|audio)/'

    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
        """Query the NHK World API and return the (possibly empty) episode list."""
        api_url = self._API_URL_TEMPLATE % (
            'v' if is_video else 'r',
            'clip' if is_clip else 'esd',
            'episode' if is_episode else 'program',
            m_id, lang, '/all' if is_video else '')
        data = self._download_json(
            api_url, m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})
        return data['data']['episodes'] or []

    def _extract_episode_info(self, url, episode=None):
        """Build an info dict for a single episode.

        When *episode* is None the metadata is fetched from the API;
        otherwise the given API record is used as-is and audio entries
        are deferred to NhkVodIE via url_transparent.
        """
        fetch_episode = episode is None
        lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
        # Numeric ids are formatted as PPPP-EEE (program prefix, episode suffix)
        if episode_id.isdigit():
            episode_id = '%s-%s' % (episode_id[:4], episode_id[4:])

        is_video = m_type == 'video'
        if fetch_episode:
            # prefix 9999 marks clips rather than regular episodes
            episode = self._call_api(
                episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
        title = episode.get('sub_title_clean') or episode['sub_title']

        def get_clean_field(key):
            # prefer the sanitized variant of a metadata field when present
            return episode.get(key + '_clean') or episode.get(key)

        series = get_clean_field('title')

        # Two fixed thumbnail renditions: 640x360 ('image') and 1280x720 ('image_l')
        thumbnails = [{
            'id': '%dp' % height,
            'height': height,
            'width': width,
            'url': 'https://www3.nhk.or.jp' + episode.get('image' + suffix),
        } for suffix, width, height in (('', 640, 360), ('_l', 1280, 720))
            if episode.get('image' + suffix)]

        info = {
            'id': episode_id + '-' + lang,
            'title': '%s - %s' % (series, title) if series and title else title,
            'description': get_clean_field('description'),
            'thumbnails': thumbnails,
            'series': series,
            'episode': title,
        }
        if is_video:
            # Videos are hosted on Piksel; hand off via url_transparent
            vod_id = episode['vod_id']
            info.update({
                '_type': 'url_transparent',
                'ie_key': 'Piksel',
                'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id,
                'id': vod_id,
            })
        elif fetch_episode:
            # Audio: pull HLS formats directly from Akamai
            audio_path = episode['audio']['audio']
            info['formats'] = self._extract_m3u8_formats(
                'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
                episode_id, 'm4a', entry_protocol='m3u8_native',
                m3u8_id='hls', fatal=False)
            for f in info['formats']:
                f['language'] = lang
            self._sort_formats(info['formats'])
        else:
            # Called from a playlist with a pre-fetched record: let NhkVodIE
            # re-extract so the audio formats are resolved lazily
            info.update({
                '_type': 'url_transparent',
                'ie_key': NhkVodIE.ie_key(),
                'url': url,
            })
        return info
29f7c58a 91
92
class NhkVodIE(NhkBaseIE):
    """Single NHK World on-demand episode or clip (video or audio)."""

    # Matches either a 7-digit video id or a slug like 'r_inventions-20201104-1'
    _VALID_URL = NhkBaseIE._BASE_URL_REGEX + NhkBaseIE._TYPE_REGEX + r'(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)'
    # Content available only for a limited period of time. Visit
    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
    _TESTS = [{
        # video clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
        'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
        'info_dict': {
            'id': 'a95j5iza',
            'ext': 'mp4',
            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
            'timestamp': 1565965194,
            'upload_date': '20190816',
        },
    }, {
        # audio clip
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
        'info_dict': {
            'id': 'r_inventions-20201104-1-en',
            'ext': 'm4a',
            'title': "Japan's Top Inventions - Miniature Video Cameras",
            'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
        'only_matching': True,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # All the heavy lifting lives in the shared base-class helper
        return self._extract_episode_info(url)
138
139
class NhkVodProgramIE(NhkBaseIE):
    """Playlist of all episodes (or clips, via ?type=clip) of an NHK World program."""

    _VALID_URL = (
        NhkBaseIE._BASE_URL_REGEX + '/program' + NhkBaseIE._TYPE_REGEX
        + r'(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?')
    _TESTS = [{
        # video program episodes
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 1,
    }, {
        # video program clips
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
        'info_dict': {
            'id': 'japanrailway',
            'title': 'Japan Railway Journal',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
        'only_matching': True,
    }, {
        # audio program
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        lang, m_type, program_id, episode_type = self._match_valid_url(url).groups()

        episodes = self._call_api(
            program_id, lang, m_type == 'video', False, episode_type == 'clip')

        # Skip API records without a usable episode path
        entries = [
            self._extract_episode_info(urljoin(url, episode.get('url')), episode)
            for episode in episodes if episode.get('url')]

        # The series name of the first entry doubles as the playlist title
        program_title = entries[0].get('series') if entries else None

        return self.playlist_result(entries, program_id, program_title)
77cc7c6e
LNO
186
187
class NhkForSchoolBangumiIE(InfoExtractor):
    """Single NHK for School program ('bangumi') or clip played via movie.cgi."""

    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
    _TESTS = [{
        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
        'info_dict': {
            'id': 'D0005150191_00003',
            'title': 'にている かな',
            'duration': 599.999,
            'timestamp': 1396414800,

            'upload_date': '20140402',
            'ext': 'mp4',

            'chapters': 'count:12'
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        program_type, video_id = self._match_valid_url(url).groups()

        webpage = self._download_webpage(
            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)

        # searches all variables
        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
        # and programObj values too
        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
        # extract all chapters
        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]

        # this is how player_core.js is actually doing (!): the real media id
        # replaces the suffix of das_id with the page-declared version number
        version = base_values.get('r_version') or program_values.get('version')
        if version:
            video_id = f'{video_id.split("_")[0]}_{version}'

        formats = self._extract_m3u8_formats(
            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
            video_id, ext='mp4', m3u8_id='hls')
        self._sort_formats(formats)

        duration = parse_duration(base_values.get('r_duration'))

        chapters = None
        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
            # each chapter runs from its own timestamp to the next one;
            # the final chapter ends at the total duration (may be None)
            start_time = chapter_durations
            end_time = chapter_durations[1:] + [duration]
            chapters = [{
                'start_time': s,
                'end_time': e,
                'title': t,
            } for s, e, t in zip(start_time, end_time, chapter_titles)]

        return {
            'id': video_id,
            'title': program_values.get('name'),
            # reuse the value computed above instead of re-parsing r_duration
            'duration': duration,
            'timestamp': unified_timestamp(base_values['r_upload']),
            'formats': formats,
            'chapters': chapters,
        }
253
254
class NhkForSchoolSubjectIE(InfoExtractor):
    IE_DESC = 'Portal page for each school subjects, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
    KNOWN_SUBJECTS = (
        'rika', 'syakai', 'kokugo',
        'sansuu', 'seikatsu', 'doutoku',
        'ongaku', 'taiiku', 'zukou',
        'gijutsu', 'katei', 'sougou',
        'eigo', 'tokkatsu',
        'tokushi', 'sonota',
    )
    # Only match the known subject slugs, anchored at the end of the path
    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)

    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/',
        'info_dict': {
            'id': 'sougou',
            'title': '総合的な学習の時間',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.nhk.or.jp/school/rika/',
        'info_dict': {
            'id': 'rika',
            'title': '理科',
        },
        'playlist_mincount': 15,
    }]

    def _real_extract(self, url):
        subject_id = self._match_id(url)
        webpage = self._download_webpage(url, subject_id)

        # Playlist title comes from the subject-name banner on the portal page
        playlist_title = self._html_search_regex(
            r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>',
            webpage, 'title', fatal=False)

        # Every program link under this subject becomes a playlist entry
        return self.playlist_from_matches(
            re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
            subject_id, playlist_title,
            lambda g: urljoin(url, g.group(1)))
292
293
class NhkForSchoolProgramListIE(InfoExtractor):
    """Playlist of every bangumi belonging to one NHK for School program."""

    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)
    )
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/school/sougou/q/',
        'info_dict': {
            'id': 'sougou/q',
            'title': 'Q~こどものための哲学',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        program_id = self._match_id(url)

        webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)

        # Try OpenGraph first, then <title>, then the page heading
        title = (self._og_search_title(webpage)
                 or self._html_extract_title(webpage)
                 or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
        # Strip the trailing site-name suffix if a title was found at all
        if title:
            title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title)
        else:
            title = None
        description = self._html_search_regex(
            r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
            webpage, 'description', fatal=False, group=0)

        bangumi_list = self._download_json(
            f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
        # they're always bangumi
        das_ids = traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []
        bangumis = [
            self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={das_id}')
            for das_id in das_ids]

        return self.playlist_result(bangumis, program_id, title, description)