]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
release 2014.11.12
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a9a3876d 6from .subtitles import SubtitlesInfoExtractor
9fd5ce0c 7
4ed3e510 8from ..utils import (
ca1fee34 9 compat_str,
4ed3e510
IM
10)
11
f853f859 12
class TEDIE(SubtitlesInfoExtractor):
    """Information extractor for ted.com talks, playlists and "watch" pages.

    ``_real_extract`` matches the URL against ``_VALID_URL`` and dispatches on
    the named groups: ``embed`` hosts are redirected to the ``www`` URL,
    ``talks`` URLs are handled by ``_talk_info``, ``watch`` URLs by
    ``_watch_info`` and everything else (playlists) by
    ``_playlist_videos_info``.
    """

    # Verbose regex; the named groups are read back in _real_extract.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields merged into a native download format in _talk_info,
    # looked up by the format id used as key in the page's download mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON argument of the page's ``q("...init", {...})`` call.

        webpage -- HTML source of a talk or playlist page.

        The JSON text is located with ``_search_regex`` and parsed with
        ``json.loads``.
        """
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the handler matching its type.

        Embed URLs are not extracted directly: they are rewritten to the
        ``www`` host and handed back as a URL result for this extractor.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        # Neither a talk nor a watch page: treat it as a playlist.
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk of the playlist.

        url  -- playlist page URL.
        name -- name taken from the URL, passed to ``_download_webpage``.
        """
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Each talk is delegated back to this extractor through its slug.
        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Build the info dict for a single talk page.

        url        -- talk page URL.
        video_name -- name taken from the URL, used for progress reporting.

        Returns one of:
        * a ``{'_type': 'url', ...}`` dict when the talk declares an
          ``external`` video,
        * ``None`` after listing the subtitles when the ``listsubtitles``
          downloader parameter is set,
        * otherwise a dict with id, title, uploader, thumbnail, description,
          subtitles and formats.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        if talk_info.get('external') is not None:
            self.to_screen('Found video from %s' % talk_info['external']['service'])
            return {
                '_type': 'url',
                'url': talk_info['external']['uri'],
            }

        # A missing or null 'nativeDownloads' entry is treated like an empty
        # one so that the rtmp fallback below is used instead of crashing.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        # Values that do not start with "http" get an explicit scheme.
        if not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Return a dict mapping each subtitle language code to its SRT URL.

        video_id  -- talk id inserted into the subtitle URL.
        talk_info -- talk metadata; its ``languages`` list supplies the
                     ``languageCode`` values.

        When no language is listed a warning is reported and an empty dict
        is returned.
        """
        # "or []" also covers a present-but-null 'languages' value.
        languages = [
            lang['languageCode'] for lang in talk_info.get('languages') or []]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        # dict() over a generator rather than a dict comprehension, as the
        # file still targets Python 2 (see the __future__ import).
        return dict(
            (code, 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, code))
            for code in languages)

    def _watch_info(self, url, name):
        """Build the info dict for a ``/watch/`` page.

        url  -- watch page URL.
        name -- name taken from the URL; also used as the video id.

        The video URL and optional thumbnail come from the JSON stored in the
        page's ``data-config`` attribute; title and description are scraped
        from the HTML (a missing description is not fatal).
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }