]>
Commit | Line | Data |
---|---|---|
f853f859 PH |
1 | from __future__ import unicode_literals |
2 | ||
9fd5ce0c PH |
3 | import json |
4 | import re | |
5 | ||
a504ced0 | 6 | from .common import InfoExtractor |
9fd5ce0c | 7 | |
66ee7b32 S |
8 | from ..compat import compat_str |
9 | from ..utils import int_or_none | |
4ed3e510 | 10 | |
f853f859 | 11 | |
class TEDIE(InfoExtractor):
    """Extractor for ted.com talk, playlist and "watch" pages.

    ``_VALID_URL`` accepts the ``www`` host as well as the ``embed`` /
    ``embed-ssl`` hosts; ``_real_extract`` redirects embed URLs to their
    ``www`` equivalent.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
        |
            ((?P<type_talk>talks)) # We have a simple talk
        |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': 'b899ac15e345fb39534d913f7606082b',
        'info_dict': {
            'id': 'tSVI8ta_P4w',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a normal string literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
            'upload_date': '20140122',
            'uploader_id': 'TEDInstitute',
            'uploader': 'TED Institute',
        },
        'add_ie': ['Youtube'],
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': '71b3ab2f4233012dce09d515c9c39ce2',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Frame dimensions merged into the native download formats, keyed by
    # the format id found in the talk's ``nativeDownloads`` mapping.
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }
9fd5ce0c | 114 | |
ca1fee34 | 115 | def _extract_info(self, webpage): |
bacac173 | 116 | info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>', |
9e1a5b84 | 117 | webpage, 'info json') |
ca1fee34 JMF |
118 | return json.loads(info_json) |
119 | ||
9fd5ce0c | 120 | def _real_extract(self, url): |
bacac173 | 121 | m = re.match(self._VALID_URL, url, re.VERBOSE) |
cd791a5e | 122 | if m.group('type').startswith('embed'): |
aab74fa1 PH |
123 | desktop_url = m.group('proto') + 'www' + m.group('urlmain') |
124 | return self.url_result(desktop_url, 'TED') | |
bacac173 | 125 | name = m.group('name') |
9fd5ce0c | 126 | if m.group('type_talk'): |
bacac173 | 127 | return self._talk_info(url, name) |
ac6c1048 PH |
128 | elif m.group('type_watch'): |
129 | return self._watch_info(url, name) | |
bacac173 | 130 | else: |
ca1fee34 | 131 | return self._playlist_videos_info(url, name) |
9fd5ce0c | 132 | |
ca1fee34 | 133 | def _playlist_videos_info(self, url, name): |
9fd5ce0c | 134 | '''Returns the videos of the playlist''' |
fc2ef392 | 135 | |
ca1fee34 | 136 | webpage = self._download_webpage(url, name, |
9e1a5b84 | 137 | 'Downloading playlist webpage') |
ca1fee34 JMF |
138 | info = self._extract_info(webpage) |
139 | playlist_info = info['playlist'] | |
9fd5ce0c | 140 | |
fc2ef392 | 141 | playlist_entries = [ |
f07a9f6f | 142 | self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) |
ca1fee34 | 143 | for talk in info['talks'] |
fc2ef392 PH |
144 | ] |
145 | return self.playlist_result( | |
ca1fee34 JMF |
146 | playlist_entries, |
147 | playlist_id=compat_str(playlist_info['id']), | |
148 | playlist_title=playlist_info['title']) | |
9fd5ce0c | 149 | |
bacac173 JMF |
150 | def _talk_info(self, url, video_name): |
151 | webpage = self._download_webpage(url, video_name) | |
9fd5ce0c | 152 | self.report_extraction(video_name) |
a9a3876d | 153 | |
ca1fee34 | 154 | talk_info = self._extract_info(webpage)['talks'][0] |
a9a3876d | 155 | |
a461a119 S |
156 | external = talk_info.get('external') |
157 | if external: | |
158 | service = external['service'] | |
159 | self.to_screen('Found video from %s' % service) | |
160 | ext_url = None | |
161 | if service.lower() == 'youtube': | |
162 | ext_url = external.get('code') | |
a72cbfac JMF |
163 | return { |
164 | '_type': 'url', | |
a461a119 | 165 | 'url': ext_url or external['uri'], |
a72cbfac JMF |
166 | } |
167 | ||
652bee05 | 168 | formats = [{ |
652bee05 JMF |
169 | 'url': format_url, |
170 | 'format_id': format_id, | |
171 | 'format': format_id, | |
2d4c98db JMF |
172 | } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None] |
173 | if formats: | |
174 | for f in formats: | |
175 | finfo = self._NATIVE_FORMATS.get(f['format_id']) | |
176 | if finfo: | |
177 | f.update(finfo) | |
66ee7b32 | 178 | |
11fa3d7f | 179 | http_url = None |
66ee7b32 S |
180 | for format_id, resources in talk_info['resources'].items(): |
181 | if format_id == 'h264': | |
182 | for resource in resources: | |
11fa3d7f | 183 | h264_url = resource.get('file') |
184 | if not h264_url: | |
185 | continue | |
66ee7b32 S |
186 | bitrate = int_or_none(resource.get('bitrate')) |
187 | formats.append({ | |
11fa3d7f | 188 | 'url': h264_url, |
66ee7b32 S |
189 | 'format_id': '%s-%sk' % (format_id, bitrate), |
190 | 'tbr': bitrate, | |
191 | }) | |
11fa3d7f | 192 | if re.search('\d+k', h264_url): |
193 | http_url = h264_url | |
66ee7b32 S |
194 | elif format_id == 'rtmp': |
195 | streamer = talk_info.get('streamer') | |
196 | if not streamer: | |
197 | continue | |
198 | for resource in resources: | |
199 | formats.append({ | |
200 | 'format_id': '%s-%s' % (format_id, resource.get('name')), | |
201 | 'url': streamer, | |
202 | 'play_path': resource['file'], | |
203 | 'ext': 'flv', | |
204 | 'width': int_or_none(resource.get('width')), | |
205 | 'height': int_or_none(resource.get('height')), | |
206 | 'tbr': int_or_none(resource.get('bitrate')), | |
207 | }) | |
208 | elif format_id == 'hls': | |
11fa3d7f | 209 | formats.extend(self._extract_m3u8_formats( |
210 | resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) | |
211 | ||
212 | m3u8_formats = list(filter( | |
213 | lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', | |
214 | formats)) | |
215 | if http_url: | |
216 | for m3u8_format in m3u8_formats: | |
217 | bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) | |
218 | if not bitrate: | |
219 | continue | |
220 | f = m3u8_format.copy() | |
221 | f.update({ | |
222 | 'url': re.sub(r'\d+k', bitrate, http_url), | |
223 | 'format_id': m3u8_format['format_id'].replace('hls', 'http'), | |
224 | 'protocol': 'http', | |
225 | }) | |
226 | formats.append(f) | |
66ee7b32 S |
227 | |
228 | audio_download = talk_info.get('audioDownload') | |
229 | if audio_download: | |
230 | formats.append({ | |
231 | 'url': audio_download, | |
232 | 'format_id': 'audio', | |
736785ab | 233 | 'vcodec': 'none', |
66ee7b32 S |
234 | }) |
235 | ||
f628d800 | 236 | self._sort_formats(formats) |
652bee05 | 237 | |
7b9965ea | 238 | video_id = compat_str(talk_info['id']) |
a9a3876d | 239 | |
b6c1cecc JMF |
240 | thumbnail = talk_info['thumb'] |
241 | if not thumbnail.startswith('http'): | |
242 | thumbnail = 'http://' + thumbnail | |
463a9087 | 243 | return { |
a9a3876d | 244 | 'id': video_id, |
a8eb5a8e | 245 | 'title': talk_info['title'].strip(), |
652bee05 | 246 | 'uploader': talk_info['speaker'], |
b6c1cecc | 247 | 'thumbnail': thumbnail, |
652bee05 | 248 | 'description': self._og_search_description(webpage), |
03091e37 | 249 | 'subtitles': self._get_subtitles(video_id, talk_info), |
0d8cb1cc | 250 | 'formats': formats, |
eb4cb42a | 251 | 'duration': talk_info.get('duration'), |
0d8cb1cc PH |
252 | } |
253 | ||
a504ced0 | 254 | def _get_subtitles(self, video_id, talk_info): |
652bee05 JMF |
255 | languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] |
256 | if languages: | |
257 | sub_lang_list = {} | |
258 | for l in languages: | |
a504ced0 JMF |
259 | sub_lang_list[l] = [ |
260 | { | |
261 | 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), | |
262 | 'ext': ext, | |
263 | } | |
264 | for ext in ['ted', 'srt'] | |
265 | ] | |
652bee05 JMF |
266 | return sub_lang_list |
267 | else: | |
652bee05 | 268 | return {} |
ac6c1048 PH |
269 | |
270 | def _watch_info(self, url, name): | |
271 | webpage = self._download_webpage(url, name) | |
272 | ||
273 | config_json = self._html_search_regex( | |
de9bd74b | 274 | r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>', |
f628d800 | 275 | webpage, 'config', default=None) |
276 | if not config_json: | |
277 | embed_url = self._search_regex( | |
278 | r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url') | |
279 | return self.url_result(self._proto_relative_url(embed_url)) | |
de9bd74b | 280 | config = json.loads(config_json)['config'] |
ac6c1048 PH |
281 | video_url = config['video']['url'] |
282 | thumbnail = config.get('image', {}).get('url') | |
283 | ||
284 | title = self._html_search_regex( | |
285 | r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title') | |
286 | description = self._html_search_regex( | |
621f33c9 PH |
287 | [ |
288 | r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>', | |
289 | r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>', | |
290 | ], | |
ac6c1048 PH |
291 | webpage, 'description', fatal=False) |
292 | ||
293 | return { | |
294 | 'id': name, | |
295 | 'url': video_url, | |
296 | 'title': title, | |
297 | 'thumbnail': thumbnail, | |
298 | 'description': description, | |
299 | } |