]>
Commit | Line | Data |
---|---|---|
1 | from __future__ import unicode_literals | |
2 | ||
3 | import json | |
4 | import re | |
5 | ||
6 | from .common import InfoExtractor | |
7 | ||
8 | from ..compat import ( | |
9 | compat_str, | |
10 | compat_urlparse | |
11 | ) | |
12 | from ..utils import ( | |
13 | extract_attributes, | |
14 | float_or_none, | |
15 | int_or_none, | |
16 | try_get, | |
17 | url_or_none, | |
18 | ) | |
19 | ||
20 | ||
class TEDIE(InfoExtractor):
    '''Extractor for ted.com talk, playlist and "watch" pages.

    URLs on the embed / embed-ssl hosts are rewritten to the www host and
    handed back to this extractor (see _real_extract).
    '''

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
        |
            ((?P<type_talk>talks)) # We have a simple talk
        |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'b0ce2b05ca215042124fbc9e3886493a',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # missing HTTP bitrates
        'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
        'info_dict': {
            'id': '6069',
            'ext': 'mp4',
            'title': 'The beauty and power of algorithms',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:734e352710fb00d840ab87ae31aaf688',
            'uploader': 'Vishal Sikka',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
            'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # no nativeDownloads
        'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
        'info_dict': {
            'id': '1792',
            'ext': 'mp4',
            'title': 'The orchestra in my mouth',
            'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
            'uploader': 'Tom Thum',
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Dimensions merged into "nativeDownloads" formats, keyed by format id.
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON argument of the page's q("<name>.init", {...}) call.

        The lookup goes through _search_regex without a default, and the
        captured text is decoded with json.loads.
        '''
        return json.loads(self._search_regex(
            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
            webpage, 'info json'))

    def _real_extract(self, url):
        '''Dispatch *url* to the handler for its page type.

        Embed URLs are rebuilt on the www host and re-submitted to this
        extractor; "talks" URLs go to _talk_info, "watch" URLs to
        _watch_info, and everything else is treated as a playlist.
        '''
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist

        Entries are the elements tagged data-ga-context="playlist"; each
        href is resolved against *url* and delegated back to this
        extractor. The playlist id is read from the page's og:url when that
        URL matches _VALID_URL, and is None otherwise.
        '''
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')

        playlist_entries = []
        for entry in re.findall(r'(?s)<[^>]+data-ga-context="playlist"[^>]*>', webpage):
            href = extract_attributes(entry).get('href')
            if not href:
                # A tagged element without a link cannot become an entry.
                continue
            entry_url = compat_urlparse.urljoin(url, href)
            playlist_entries.append(self.url_result(entry_url, self.ie_key()))

        final_url = self._og_search_url(webpage)
        # The id is optional metadata: do not fail the whole playlist when
        # og:url has a shape _VALID_URL does not recognise.
        mobj = re.match(self._VALID_URL, final_url, re.VERBOSE)
        playlist_id = mobj.group('playlist_id') if mobj else None

        return self.playlist_result(
            playlist_entries,
            playlist_id=playlist_id,
            playlist_title=self._og_search_title(webpage),
            playlist_description=self._og_search_description(webpage))

    def _talk_info(self, url, video_name):
        '''Extract a single talk page.

        Formats are gathered from, in order:
          * the "nativeDownloads" mapping (format id -> URL),
          * the "resources" mapping ("h264", "rtmp" and "hls" entries),
          * HTTP variants derived from the HLS formats by substituting
            their "<digits>k" token into an h264 URL (each candidate is
            checked with _is_valid_url),
          * the "audioDownload" URL.

        When the player data names an external service, the talk is
        delegated to that URL instead and no formats are returned.
        '''
        webpage = self._download_webpage(url, video_name)

        info = self._extract_info(webpage)

        # The talk data is either nested under __INITIAL_DATA__ or sits at
        # the top level of the init JSON.
        data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
        talk_info = data['talks'][0]

        title = talk_info['title'].strip()

        native_downloads = try_get(
            talk_info,
            (lambda x: x['downloads']['nativeDownloads'],
             lambda x: x['nativeDownloads']),
            dict) or {}

        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            f = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            finfo = self._NATIVE_FORMATS.get(format_id)
            if finfo:
                f.update(finfo)
            formats.append(f)

        player_talk = talk_info['player_talks'][0]

        external = player_talk.get('external')
        if isinstance(external, dict):
            service = external.get('service')
            if isinstance(service, compat_str):
                ext_url = None
                if service.lower() == 'youtube':
                    ext_url = external.get('code')

                return self.url_result(ext_url or external['uri'])

        # Fall back to an empty mapping so that a talk without resources
        # reaches _sort_formats instead of failing on None.items().
        resources_ = player_talk.get('resources') or talk_info.get('resources') or {}

        http_url = None
        for format_id, resources in resources_.items():
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
                    # Remember an h264 URL carrying a "<digits>k" token; it
                    # is the template for the HTTP variants built below.
                    if re.search(r'\d+k', h264_url):
                        http_url = h264_url
            elif format_id == 'rtmp':
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': resource['file'],
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                if not isinstance(resources, dict):
                    continue
                stream_url = url_or_none(resources.get('stream'))
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id,
                    fatal=False))

        m3u8_formats = [
            fmt for fmt in formats
            if fmt.get('protocol') == 'm3u8' and fmt.get('vcodec') != 'none']
        if http_url:
            for m3u8_format in m3u8_formats:
                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
                bitrate_url = re.sub(r'\d+k', bitrate, http_url)
                if not self._is_valid_url(
                        bitrate_url, video_name, '%s bitrate' % bitrate):
                    continue
                f = m3u8_format.copy()
                f.update({
                    'url': bitrate_url,
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
                if f.get('acodec') == 'none':
                    del f['acodec']
                formats.append(f)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        return {
            'id': video_id,
            'title': title,
            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': float_or_none(talk_info.get('duration')),
            'view_count': int_or_none(data.get('viewed_count')),
            'comment_count': int_or_none(
                try_get(data, lambda x: x['comments']['count'])),
            'tags': try_get(talk_info, lambda x: x['tags'], list),
        }

    def _get_subtitles(self, video_id, talk_info):
        '''Build the subtitles mapping for a talk.

        Returns a dict mapping each language code to a list of two entries
        (ext 'ted' and 'srt') pointing at the ted.com subtitles endpoint.
        The dict is empty when *talk_info* carries no language list.
        '''
        # try_get yields None when neither location holds a list; iterate
        # an empty list in that case instead of failing on None.
        languages = try_get(
            talk_info,
            (lambda x: x['downloads']['languages'],
             lambda x: x['languages']), list) or []

        sub_lang_list = {}
        for language in languages:
            lang_code = language.get('languageCode') or language.get('ianaCode')
            if not lang_code:
                continue
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        '''Extract a "watch" page.

        When the page carries a "pages.jwplayer" config, the video URL and
        thumbnail come from it and the title/description from the HTML.
        Otherwise the page's embed iframe URL is delegated via url_result.
        '''
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config', default=None)
        if not config_json:
            embed_url = self._search_regex(
                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
            return self.url_result(self._proto_relative_url(embed_url))
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # try_get also covers an "image" entry that is present but null.
        thumbnail = try_get(config, lambda x: x['image']['url'])

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }