]>
Commit | Line | Data |
---|---|---|
f853f859 PH |
1 | from __future__ import unicode_literals |
2 | ||
9fd5ce0c PH |
3 | import json |
4 | import re | |
5 | ||
a504ced0 | 6 | from .common import InfoExtractor |
9fd5ce0c | 7 | |
c2ee6fa6 | 8 | from ..compat import ( |
9 | compat_str, | |
10 | compat_urlparse | |
11 | ) | |
49174788 | 12 | from ..utils import ( |
c2ee6fa6 | 13 | extract_attributes, |
cd3a3ff9 | 14 | float_or_none, |
49174788 S |
15 | int_or_none, |
16 | try_get, | |
cd3a3ff9 | 17 | url_or_none, |
49174788 | 18 | ) |
4ed3e510 | 19 | |
f853f859 | 20 | |
class TEDIE(InfoExtractor):
    """Extractor for ted.com talks, playlists and "watch" pages.

    The URL is dispatched in ``_real_extract`` according to which of the
    ``type_*`` groups of ``_VALID_URL`` matched; ``embed``/``embed-ssl``
    hosts are first redirected to the equivalent ``www`` URL.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
        |
            ((?P<type_talk>talks)) # We have a simple talk
        |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'b0ce2b05ca215042124fbc9e3886493a',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # missing HTTP bitrates
        'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
        'info_dict': {
            'id': '6069',
            'ext': 'mp4',
            'title': 'The beauty and power of algorithms',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:734e352710fb00d840ab87ae31aaf688',
            'uploader': 'Vishal Sikka',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
            'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # no nativeDownloads
        'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
        'info_dict': {
            'id': '1792',
            'ext': 'mp4',
            'title': 'The orchestra in my mouth',
            'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
            'uploader': 'Tom Thum',
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # with own formats and private Youtube external
        'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
        'only_matching': True,
    }]

    # Frame sizes applied to formats whose format_id starts with one of
    # these quality names (native and subtitled downloads).
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object passed to ``q("<name>.init", {...})``.

        The object is located with a regular expression in the inline
        script of *webpage*; the search is fatal when nothing matches.
        """
        info_json = self._search_regex(
            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch or playlist handler."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed pages are handled by re-queuing the regular www URL.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one url entry per playlist link.

        Entries are the tags carrying ``data-ga-context="playlist"``; tags
        without an ``href`` attribute are skipped.  The playlist id is
        taken from the page's og:url when that URL matches ``_VALID_URL``
        and is None otherwise.
        """

        webpage = self._download_webpage(url, name,
                                         'Downloading playlist webpage')

        playlist_entries = []
        for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
            attrs = extract_attributes(entry)
            href = attrs.get('href')
            if not href:
                # Not a link (or malformed markup): nothing to queue.
                continue
            entry_url = compat_urlparse.urljoin(url, href)
            playlist_entries.append(self.url_result(entry_url, self.ie_key()))

        final_url = self._og_search_url(webpage, fatal=False)
        playlist_id = None
        if final_url:
            # og:url is page-provided data and may not match our pattern.
            mobj = re.match(self._VALID_URL, final_url)
            if mobj:
                playlist_id = mobj.group('playlist_id')

        return self.playlist_result(
            playlist_entries, playlist_id=playlist_id,
            playlist_title=self._og_search_title(webpage, fatal=False),
            playlist_description=self._og_search_description(webpage))

    def _talk_info(self, url, video_name):
        """Extract a single talk page.

        Formats are collected from native downloads, subtitled downloads,
        the player resources (hls / h264 / rtmp) and the audio download.
        When no format is found and the player references an external
        service, a url result pointing at that service is returned
        instead of an info dict.
        """
        webpage = self._download_webpage(url, video_name)

        info = self._extract_info(webpage)

        # The payload is either wrapped in '__INITIAL_DATA__' or flat.
        data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
        talk_info = data['talks'][0]

        title = talk_info['title'].strip()

        downloads = talk_info.get('downloads') or {}
        native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}

        formats = [{
            'url': format_url,
            'format_id': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]

        subtitled_downloads = downloads.get('subtitledDownloads') or {}
        for lang, subtitled_download in subtitled_downloads.items():
            for q in self._NATIVE_FORMATS:
                q_url = subtitled_download.get(q)
                if not q_url:
                    continue
                formats.append({
                    'url': q_url,
                    'format_id': '%s-%s' % (q, lang),
                    'language': lang,
                })

        # Attach the known frame size; the quality name is the part of the
        # format_id before the first '-' ('high' or 'high-<lang>').
        for f in formats:
            finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
            if finfo:
                f.update(finfo)

        player_talk = talk_info['player_talks'][0]

        # Talks hosted externally may carry no resources at all; fall back
        # to an empty mapping so the external fallback below is reached.
        resources_ = player_talk.get('resources') or talk_info.get('resources') or {}

        http_url = None
        for format_id, resources in resources_.items():
            if format_id == 'hls':
                if not isinstance(resources, dict):
                    continue
                stream_url = url_or_none(resources.get('stream'))
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id,
                    fatal=False))
            else:
                if not isinstance(resources, list):
                    continue
                if format_id == 'h264':
                    for resource in resources:
                        h264_url = resource.get('file')
                        if not h264_url:
                            continue
                        bitrate = int_or_none(resource.get('bitrate'))
                        formats.append({
                            'url': h264_url,
                            'format_id': '%s-%sk' % (format_id, bitrate),
                            'tbr': bitrate,
                        })
                        # Remember a URL containing '<digits>k' as a template
                        # for deriving further HTTP URLs below.
                        if re.search(r'\d+k', h264_url):
                            http_url = h264_url
                elif format_id == 'rtmp':
                    streamer = talk_info.get('streamer')
                    if not streamer:
                        continue
                    for resource in resources:
                        play_path = resource.get('file')
                        if not play_path:
                            # Skip incomplete entries, as the h264 branch does.
                            continue
                        formats.append({
                            'format_id': '%s-%s' % (format_id, resource.get('name')),
                            'url': streamer,
                            'play_path': play_path,
                            'ext': 'flv',
                            'width': int_or_none(resource.get('width')),
                            'height': int_or_none(resource.get('height')),
                            'tbr': int_or_none(resource.get('bitrate')),
                        })

        # Snapshot the HLS video variants first: the loop below appends to
        # ``formats`` and must not iterate over its own additions.
        m3u8_formats = [
            f for f in formats
            if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
        if http_url:
            for m3u8_format in m3u8_formats:
                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
                # Substitute the HLS variant's bitrate token into the HTTP
                # template and keep the result only if the URL is valid.
                bitrate_url = re.sub(r'\d+k', bitrate, http_url)
                if not self._is_valid_url(
                        bitrate_url, video_name, '%s bitrate' % bitrate):
                    continue
                f = m3u8_format.copy()
                f.update({
                    'url': bitrate_url,
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
                if f.get('acodec') == 'none':
                    del f['acodec']
                formats.append(f)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
            })

        if not formats:
            # Nothing hosted by TED itself: delegate to the external service.
            external = player_talk.get('external')
            if isinstance(external, dict):
                service = external.get('service')
                if isinstance(service, compat_str):
                    ext_url = None
                    if service.lower() == 'youtube':
                        ext_url = external.get('code')
                    return self.url_result(ext_url or external['uri'])

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        return {
            'id': video_id,
            'title': title,
            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': float_or_none(talk_info.get('duration')),
            'view_count': int_or_none(data.get('viewed_count')),
            'comment_count': int_or_none(
                try_get(data, lambda x: x['comments']['count'])),
            'tags': try_get(talk_info, lambda x: x['tags'], list),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Return a ``{lang_code: [subtitle dicts]}`` mapping for a talk.

        Languages are read from ``talk_info['downloads']['languages']`` or
        ``talk_info['languages']``.  Each language gets one entry per
        format ('ted' and 'srt').  An empty dict is returned when the
        talk lists no languages.
        """
        sub_lang_list = {}
        # try_get returns None when neither path yields a list.
        languages = try_get(
            talk_info,
            (lambda x: x['downloads']['languages'],
             lambda x: x['languages']), list) or []
        for language in languages:
            lang_code = language.get('languageCode') or language.get('ianaCode')
            if not lang_code:
                continue
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Extract a ted.com "watch" page.

        When the page has no jwplayer configuration, the embedded iframe
        URL is returned as a url result; otherwise an info dict built from
        the configuration and the page markup is returned.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config', default=None)
        if not config_json:
            embed_url = self._search_regex(
                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
            return self.url_result(self._proto_relative_url(embed_url))
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # 'image' may be absent or null; the thumbnail is optional.
        thumbnail = try_get(config, lambda x: x['image']['url'])

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }