]>
Commit | Line | Data |
---|---|---|
f853f859 PH |
1 | from __future__ import unicode_literals |
2 | ||
9fd5ce0c PH |
3 | import json |
4 | import re | |
5 | ||
a504ced0 | 6 | from .common import InfoExtractor |
9fd5ce0c | 7 | |
66ee7b32 S |
8 | from ..compat import compat_str |
9 | from ..utils import int_or_none | |
4ed3e510 | 10 | |
f853f859 | 11 | |
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks, playlists and "watch" pages.

    ``embed`` / ``embed-ssl`` URLs are redirected to their ``www`` equivalent
    and handled by this same extractor.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a plain string literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra metadata merged into the "nativeDownloads" formats, keyed by the
    # format id used in the page JSON.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object embedded in the page's ``q("<name>.init", {...})`` script call."""
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch ``url`` to the talk, watch or playlist extraction routine."""
        # The pattern already starts with (?x), so no extra flag is required.
        m = re.match(self._VALID_URL, url)
        if m.group('type').startswith('embed'):
            # Rewrite embed URLs to the regular site and re-enter this extractor.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk of the playlist."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Extract the info dict of a single talk page.

        When the page JSON carries an ``external`` entry, a plain ``url``
        result pointing at the external video is returned instead.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        formats = []

        # Direct downloads; the key may be absent or null, and individual
        # entries may have no URL.
        native_downloads = talk_info.get('nativeDownloads') or {}
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)

        for format_id, resources in (talk_info.get('resources') or {}).items():
            if not resources:
                continue
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
            elif format_id == 'rtmp':
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': resource['file'],
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                if not isinstance(resources, dict):
                    continue
                stream_url = resources.get('stream')
                if not stream_url:
                    # Nothing to download the manifest from.
                    continue
                hls_formats = self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id)
                for f in hls_formats:
                    # 'hls-meta' is left untouched (presumably the combined
                    # master entry); it is still added to the formats below.
                    if f.get('format_id') == 'hls-meta':
                        continue
                    # Entries without a height are flagged as having no video,
                    # the others as having no audio.
                    if not f.get('height'):
                        f['vcodec'] = 'none'
                    else:
                        f['acodec'] = 'none'
                formats.extend(hls_formats)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
                'preference': -0.5,
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        thumbnail = talk_info.get('thumb')
        if thumbnail:
            if thumbnail.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Build the subtitles dict for a talk.

        Returns a dict mapping each language code listed in
        ``talk_info['languages']`` to a list with one entry per subtitle
        format ('ted' and 'srt'); an empty dict when no language is listed.
        """
        sub_lang_list = {}
        # "languages" may be missing or null in the page JSON.
        for lang in talk_info.get('languages') or []:
            lang_code = lang['languageCode']
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Extract the info dict of a ``/watch/...`` page from its jwplayer config."""
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # "image" may be missing or null; the thumbnail is optional.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }