]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
prefer 'code' to 'uri' if present
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a9a3876d 6from .subtitles import SubtitlesInfoExtractor
9fd5ce0c 7
1cc79574 8from ..compat import (
ca1fee34 9 compat_str,
4ed3e510
IM
10)
11
f853f859 12
class TEDIE(SubtitlesInfoExtractor):
    """Extractor for ted.com talk, playlist and "watch" pages.

    URLs on the ``embed`` / ``embed-ssl`` hosts are rewritten to the ``www``
    host and handed back to this extractor through ``url_result``.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields merged into a native download format, keyed by the
    # format name used in the page's "nativeDownloads" mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object embedded in ``webpage``.

        The object is the second argument of the inline
        ``q("<name>.init", {...})`` script call, located with
        ``self._search_regex``.
        """
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch ``url`` to the talk, watch or playlist handler."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed pages are handled by re-running on the www page.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk.

        Each entry points at ``http://www.ted.com/talks/<slug>`` and is
        tagged with this extractor's key.
        """
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Return the info dict for a single talk page.

        If the talk is hosted externally, a ``url``-type result pointing
        at the external video is returned instead (preferring the
        ``code`` field over ``uri``).  If the ``listsubtitles`` option is
        set, the available subtitles are listed and ``None`` is returned.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external is not None:
            self.to_screen('Found video from %s' % external['service'])
            ext_url = external['code'] if 'code' in external else external['uri']
            return {
                '_type': 'url',
                'url': ext_url,
            }

        # "nativeDownloads" may be absent or null; treat both as "no HTTP
        # downloads" so that the RTMP fallback below is used instead of
        # crashing on ``None.items()``.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail is optional metadata: tolerate it being missing or
        # null, and add a scheme when the page gives a scheme-less address.
        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Return a ``{language code: srt URL}`` dict for the talk.

        A warning is reported and an empty dict returned when
        ``talk_info`` lists no languages (missing, null or empty).
        """
        languages = [
            lang['languageCode']
            for lang in (talk_info.get('languages') or [])]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

        # dict(generator) rather than a dict comprehension, in keeping with
        # the file's Python 2-compatible style.
        return dict(
            (lang_code,
             'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, lang_code))
            for lang_code in languages)

    def _watch_info(self, url, name):
        """Return the info dict for a ``/watch/...`` page.

        The video URL and thumbnail come from the ``pages.jwplayer``
        config embedded in the page; title and description are scraped
        from the HTML (the description is non-fatal).
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # "image" may be present but null; guard before reading its URL.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }