]>
Commit | Line | Data |
---|---|---|
f853f859 PH |
1 | from __future__ import unicode_literals |
2 | ||
9fd5ce0c PH |
3 | import json |
4 | import re | |
5 | ||
a504ced0 | 6 | from .common import InfoExtractor |
9fd5ce0c | 7 | |
66ee7b32 | 8 | from ..compat import compat_str |
49174788 S |
9 | from ..utils import ( |
10 | int_or_none, | |
11 | try_get, | |
12 | ) | |
4ed3e510 | 13 | |
f853f859 | 14 | |
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talk, playlist and watch pages.

    Embed URLs (``embed.ted.com`` / ``embed-ssl.ted.com``) are redirected to
    the equivalent ``www.ted.com`` URL and handled by this same extractor.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
        |
            ((?P<type_talk>talks)) # We have a simple talk
        |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': 'b899ac15e345fb39534d913f7606082b',
        'info_dict': {
            'id': 'tSVI8ta_P4w',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
            'upload_date': '20140122',
            'uploader_id': 'TEDInstitute',
            'uploader': 'TED Institute',
        },
        'add_ie': ['Youtube'],
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': '71b3ab2f4233012dce09d515c9c39ce2',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Width/height merged into a native download format whose format id
    # matches one of these keys (see _talk_info).
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the JSON object passed to the page's ``q("<x>.init", {...})`` call.

        The object literal is located with a regex in ``webpage`` and decoded
        with ``json.loads``.
        """
        info_json = self._search_regex(
            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch ``url`` to the talk, watch or playlist handler.

        Embed URLs are rewritten to their ``www`` counterpart and handed back
        as a URL result for this extractor.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk of the playlist."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)

        # The data may be nested under '__INITIAL_DATA__' or sit at top level.
        playlist_info = try_get(
            info, lambda x: x['__INITIAL_DATA__']['playlist'],
            dict) or info['playlist']

        # 'talks' is iterated as a sequence of talk dicts, so the expected
        # type must be list; with dict the nested lookup could never succeed
        # and the code always fell through to info['talks'].
        talks = try_get(
            info, lambda x: x['__INITIAL_DATA__']['talks'],
            list) or info['talks']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in talks
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Return the info dict for a single talk page.

        If the talk metadata has an ``external`` entry, a plain URL result
        pointing at the external video is returned instead. Otherwise formats
        are collected from the native downloads, the h264/rtmp/hls resources
        and the audio download.
        """
        webpage = self._download_webpage(url, video_name)

        info = self._extract_info(webpage)

        talk_info = try_get(
            info, lambda x: x['__INITIAL_DATA__']['talks'][0],
            dict) or info['talks'][0]

        title = talk_info['title'].strip()

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        # Native downloads are optional: other format sources below may still
        # provide formats, so a missing key must not abort extraction.
        native_downloads = try_get(
            talk_info, lambda x: x['downloads']['nativeDownloads'],
            dict) or talk_info.get('nativeDownloads') or {}

        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
        for f in formats:
            finfo = self._NATIVE_FORMATS.get(f['format_id'])
            if finfo:
                f.update(finfo)

        player_talk = talk_info['player_talks'][0]

        # Either lookup may yield None; fall back to an empty mapping so the
        # loop below is simply skipped.
        resources_ = player_talk.get('resources') or talk_info.get('resources') or {}

        http_url = None
        for format_id, resources in resources_.items():
            if not resources:
                continue
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
                    # Remember an HTTP URL that embeds a bitrate token; it is
                    # used as a template for extra HTTP formats below.
                    if re.search(r'\d+k', h264_url):
                        http_url = h264_url
            elif format_id == 'rtmp':
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': resource['file'],
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                if not isinstance(resources, dict):
                    continue
                stream_url = resources.get('stream')
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id, fatal=False))

        # For every HLS video format whose URL carries a bitrate token, add an
        # HTTP variant by substituting that token into the h264 template URL.
        m3u8_formats = [
            f for f in formats
            if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
        if http_url:
            for m3u8_format in m3u8_formats:
                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
                f = m3u8_format.copy()
                f.update({
                    'url': re.sub(r'\d+k', bitrate, http_url),
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
                formats.append(f)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        return {
            'id': video_id,
            'title': title,
            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Return a dict mapping language code to 'ted' and 'srt' subtitle entries.

        Returns an empty dict when ``talk_info`` carries no languages list.
        """
        # try_get returns None when neither getter yields a list; iterate an
        # empty list in that case instead of raising TypeError.
        languages = try_get(
            talk_info,
            (lambda x: x['downloads']['languages'],
             lambda x: x['languages']), list) or []

        sub_lang_list = {}
        for language in languages:
            lang_code = language.get('languageCode') or language.get('ianaCode')
            if not lang_code:
                continue
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Return the info dict (or an embed URL result) for a /watch/ page.

        When the page has no ``pages.jwplayer`` config, the embedded iframe
        URL is returned as a URL result instead.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config', default=None)
        if not config_json:
            embed_url = self._search_regex(
                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
            return self.url_result(self._proto_relative_url(embed_url))
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }