]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
[cinemassacre] update tests
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a504ced0 6from .common import InfoExtractor
9fd5ce0c 7
66ee7b32
S
8from ..compat import compat_str
9from ..utils import int_or_none
4ed3e510 10
f853f859 11
class TEDIE(InfoExtractor):
    '''Extractor for ted.com talks, playlists and "watch" pages.

    URLs on the embed hosts are redirected to their www.ted.com
    equivalent; every other URL is dispatched on the path component
    matched by _VALID_URL (playlists / talks / watch).
    '''

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields merged into a native download format, keyed by the
    # format name used in the talk's 'nativeDownloads' mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the parsed JSON argument of the page's q("<name>.init", {...}) call.'''
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Dispatch the URL to the talk, watch or playlist handler.'''
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed hosts carry no metadata of their own; re-run the
            # extraction against the equivalent www.ted.com URL.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist'''

        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Each talk is handed back to this extractor through its talk URL.
        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        '''Return the info dict for a single talk page.

        Talks hosted on an external service are returned as a 'url'
        result pointing at that service.  Otherwise formats are collected
        from the talk's native downloads, its 'resources' mapping
        (h264 / rtmp / hls) and its optional audio download.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        # A missing or null mapping is treated as "no formats from this
        # source" so that _sort_formats gets to handle an empty list
        # instead of the extraction dying on a bare KeyError here.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            native_format = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            native_format.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(native_format)

        for format_id, resources in (talk_info.get('resources') or {}).items():
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
            elif format_id == 'rtmp':
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    play_path = resource.get('file')
                    if not play_path:
                        continue
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': play_path,
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                stream_url = resources.get('stream')
                if not stream_url:
                    continue
                hls_formats = self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id)
                for f in hls_formats:
                    # The 'hls-meta' entry is left untagged.
                    if f.get('format_id') == 'hls-meta':
                        continue
                    # Entries without a height are tagged as having no
                    # video codec, the rest as having no audio codec.
                    if not f.get('height'):
                        f['vcodec'] = 'none'
                    else:
                        f['acodec'] = 'none'
                formats.extend(hls_formats)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
                'preference': -0.5,
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            # Protocol-relative URLs only need the scheme; anything else
            # is assumed to be a bare host/path.
            if thumbnail.startswith('//'):
                thumbnail = 'http:' + thumbnail
            else:
                thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        '''Map each language code of the talk to its 'ted' and 'srt' subtitle URLs.

        Returns an empty dict when the talk lists no languages.
        '''
        sub_lang_list = {}
        for language in talk_info.get('languages', []):
            lang_code = language['languageCode']
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        '''Return the info dict for a /watch/ page, read from its jwplayer config.'''
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # 'image' may be absent or null; the thumbnail is optional.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }