]>
Commit | Line | Data |
---|---|---|
f853f859 PH |
1 | from __future__ import unicode_literals |
2 | ||
9fd5ce0c PH |
3 | import json |
4 | import re | |
5 | ||
a504ced0 | 6 | from .common import InfoExtractor |
9fd5ce0c | 7 | |
c2ee6fa6 | 8 | from ..compat import ( |
9 | compat_str, | |
10 | compat_urlparse | |
11 | ) | |
49174788 | 12 | from ..utils import ( |
c2ee6fa6 | 13 | extract_attributes, |
cd3a3ff9 | 14 | float_or_none, |
49174788 S |
15 | int_or_none, |
16 | try_get, | |
cd3a3ff9 | 17 | url_or_none, |
49174788 | 18 | ) |
4ed3e510 | 19 | |
f853f859 | 20 | |
class TEDIE(InfoExtractor):
    """Extractor for ted.com talk pages, playlists and "watch" pages.

    ``_real_extract`` dispatches on the named groups of ``_VALID_URL``:
    embed URLs are redirected to their ``www`` counterpart, talks go to
    ``_talk_info``, watch pages to ``_watch_info`` and everything else is
    handled as a playlist.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'b0ce2b05ca215042124fbc9e3886493a',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # missing HTTP bitrates
        'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
        'info_dict': {
            'id': '6069',
            'ext': 'mp4',
            'title': 'The beauty and power of algorithms',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:734e352710fb00d840ab87ae31aaf688',
            'uploader': 'Vishal Sikka',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
            'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # no nativeDownloads
        'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
        'info_dict': {
            'id': '1792',
            'ext': 'mp4',
            'title': 'The orchestra in my mouth',
            'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
            'uploader': 'Tom Thum',
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Dimensions merged into every format whose format_id starts with one of
    # these quality names (see _talk_info).
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object passed to ``q("<name>.init", {...})``.

        The JSON text is located with a regex on *webpage* and parsed with
        ``json.loads``.
        """
        info_json = self._search_regex(
            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch or playlist handler."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed hosts are rewritten to the www host and re-dispatched.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Returns the videos of the playlist

        Every tag carrying ``data-ga-context="playlist"`` contributes one
        entry built from its ``href``. The playlist id is taken from the
        page's og:url when that URL matches ``_VALID_URL``; otherwise it is
        None.
        """
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')

        playlist_entries = []
        for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
            attrs = extract_attributes(entry)
            href = attrs.get('href')
            if not href:
                # A tagged element without a link cannot yield an entry.
                continue
            entry_url = compat_urlparse.urljoin(url, href)
            playlist_entries.append(self.url_result(entry_url, self.ie_key()))

        final_url = self._og_search_url(webpage, fatal=False)
        # re.match returns None for a non-matching og:url; guard before
        # reading the group so such pages do not abort the extraction.
        final_match = re.match(self._VALID_URL, final_url) if final_url else None
        playlist_id = final_match.group('playlist_id') if final_match else None

        return self.playlist_result(
            playlist_entries, playlist_id=playlist_id,
            playlist_title=self._og_search_title(webpage, fatal=False),
            playlist_description=self._og_search_description(webpage))

    def _talk_info(self, url, video_name):
        """Build the info dict for a single talk page.

        Formats are collected from the native and subtitled downloads, the
        player resources ('hls', 'h264', 'rtmp') and the audio download.
        When the player data names an external service, a ``url_result``
        pointing at the external code/URI is returned instead.

        Required keys ('talks', 'title', 'player_talks', 'id') are read by
        direct subscripting, so their absence raises KeyError/IndexError.
        """
        webpage = self._download_webpage(url, video_name)

        info = self._extract_info(webpage)

        # The talk data is either nested under '__INITIAL_DATA__' or is the
        # top-level object itself.
        data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
        talk_info = data['talks'][0]

        title = talk_info['title'].strip()

        downloads = talk_info.get('downloads') or {}
        native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}

        formats = [{
            'url': format_url,
            'format_id': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]

        subtitled_downloads = downloads.get('subtitledDownloads') or {}
        for lang, subtitled_download in subtitled_downloads.items():
            for q in self._NATIVE_FORMATS:
                q_url = subtitled_download.get(q)
                if not q_url:
                    continue
                formats.append({
                    'url': q_url,
                    'format_id': '%s-%s' % (q, lang),
                    'language': lang,
                })

        # format_id is either '<quality>' or '<quality>-<lang>'; the quality
        # prefix selects the known dimensions.
        for f in formats:
            finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
            if finfo:
                f.update(finfo)

        player_talk = talk_info['player_talks'][0]

        external = player_talk.get('external')
        if isinstance(external, dict):
            service = external.get('service')
            if isinstance(service, compat_str):
                ext_url = None
                if service.lower() == 'youtube':
                    ext_url = external.get('code')

                return self.url_result(ext_url or external['uri'])

        # Neither source may provide resources; fall back to an empty dict
        # so the loop below is simply skipped.
        resources_ = player_talk.get('resources') or talk_info.get('resources') or {}

        http_url = None
        for format_id, resources in resources_.items():
            if format_id == 'hls':
                if not isinstance(resources, dict):
                    continue
                stream_url = url_or_none(resources.get('stream'))
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id,
                    fatal=False))
            else:
                if not isinstance(resources, list):
                    continue
                if format_id == 'h264':
                    for resource in resources:
                        h264_url = resource.get('file')
                        if not h264_url:
                            continue
                        bitrate = int_or_none(resource.get('bitrate'))
                        formats.append({
                            'url': h264_url,
                            'format_id': '%s-%sk' % (format_id, bitrate),
                            'tbr': bitrate,
                        })
                        # Remember an HTTP URL containing a '<digits>k' token;
                        # it serves as the template for the per-bitrate HTTP
                        # URLs derived from the HLS formats below.
                        if re.search(r'\d+k', h264_url):
                            http_url = h264_url
                elif format_id == 'rtmp':
                    streamer = talk_info.get('streamer')
                    if not streamer:
                        continue
                    for resource in resources:
                        formats.append({
                            'format_id': '%s-%s' % (format_id, resource.get('name')),
                            'url': streamer,
                            'play_path': resource['file'],
                            'ext': 'flv',
                            'width': int_or_none(resource.get('width')),
                            'height': int_or_none(resource.get('height')),
                            'tbr': int_or_none(resource.get('bitrate')),
                        })

        # Snapshot the video-carrying HLS formats first: the loop below
        # appends to `formats`, which must not be iterated while it grows.
        m3u8_formats = [
            f for f in formats
            if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
        if http_url:
            for m3u8_format in m3u8_formats:
                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
                bitrate_url = re.sub(r'\d+k', bitrate, http_url)
                # NOTE(review): _is_valid_url presumably probes the URL over
                # the network - confirm against InfoExtractor.
                if not self._is_valid_url(
                        bitrate_url, video_name, '%s bitrate' % bitrate):
                    continue
                f = m3u8_format.copy()
                f.update({
                    'url': bitrate_url,
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
                if f.get('acodec') == 'none':
                    del f['acodec']
                formats.append(f)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        return {
            'id': video_id,
            'title': title,
            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': float_or_none(talk_info.get('duration')),
            'view_count': int_or_none(data.get('viewed_count')),
            'comment_count': int_or_none(
                try_get(data, lambda x: x['comments']['count'])),
            'tags': try_get(talk_info, lambda x: x['tags'], list),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Map each language code in *talk_info* to its subtitle URLs.

        Languages are read from ``talk_info['downloads']['languages']`` or,
        failing that, ``talk_info['languages']``. Each language gets one
        entry per extension ('ted' and 'srt'). Returns an empty dict when no
        language list is available.
        """
        # try_get may return a falsy value (it is used with an `or` fallback
        # elsewhere in this class); iterate an empty list in that case.
        languages = try_get(
            talk_info,
            (lambda x: x['downloads']['languages'],
             lambda x: x['languages']), list) or []

        sub_lang_list = {}
        for language in languages:
            lang_code = language.get('languageCode') or language.get('ianaCode')
            if not lang_code:
                continue
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Build the info dict for a "watch" page.

        The video URL and thumbnail come from the ``pages.jwplayer`` config
        embedded in the page. When that config is absent, the embedded
        iframe's ``src`` is returned as a ``url_result`` instead.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config', default=None)
        if not config_json:
            embed_url = self._search_regex(
                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
            return self.url_result(self._proto_relative_url(embed_url))
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }