]>
Commit | Line | Data |
---|---|---|
f853f859 PH |
1 | from __future__ import unicode_literals |
2 | ||
9fd5ce0c PH |
3 | import json |
4 | import re | |
5 | ||
a9a3876d | 6 | from .subtitles import SubtitlesInfoExtractor |
9fd5ce0c | 7 | |
1cc79574 | 8 | from ..compat import ( |
ca1fee34 | 9 | compat_str, |
4ed3e510 IM |
10 | ) |
11 | ||
f853f859 | 12 | |
class TEDIE(SubtitlesInfoExtractor):
    """Information extractor for ted.com talks, playlists and "watch" pages.

    URLs on the ``embed`` / ``embed-ssl`` hosts are rewritten to the
    equivalent ``www`` URL and handed back to this extractor.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields merged into a native download format, keyed by format id.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object passed to the page's
        ``q("<name>.init", {...})`` script call."""
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch to the talk, watch or playlist handler for *url*."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed hosts are re-queued as the matching www URL.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one url entry per talk."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Extract a single talk.

        Returns a ``url`` result when the talk is hosted externally, None
        when only the subtitle listing was requested, and otherwise the
        info dict with formats, subtitles and metadata.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external is not None:
            self.to_screen('Found video from %s' % external['service'])
            return {
                '_type': 'url',
                'url': external['uri'],
            }

        # A missing or null 'nativeDownloads' must not raise: it simply
        # means there are no direct downloads and the RTMP fallback applies.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items()
            if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail is optional; when present it may lack a scheme.
        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Map each language code in ``talk_info['languages']`` to its SRT
        subtitle URL; warn and return an empty dict when there are none."""
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return dict(
            (lang_code,
             'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, lang_code))
            for lang_code in languages)

    def _watch_info(self, url, name):
        """Extract a "watch" page from its embedded jwplayer config."""
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        # The description is optional; two page layouts are tried in order.
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }