jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
[extractor/common] Add ability to specify custom field preference for `_sort_formats`
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a504ced0 6from .common import InfoExtractor
9fd5ce0c 7
66ee7b32
S
8from ..compat import compat_str
9from ..utils import int_or_none
4ed3e510 10
f853f859 11
class TEDIE(InfoExtractor):
    """Extractor for ted.com talk, playlist and "watch" pages.

    ``_real_extract`` matches the URL against ``_VALID_URL`` and dispatches
    to ``_talk_info``, ``_watch_info`` or ``_playlist_videos_info``
    depending on which named group matched. ``embed`` / ``embed-ssl`` hosts
    are rewritten to the ``www`` host and handed back to this extractor.
    """

    # The pattern carries its own inline (?x) flag, so whitespace and the
    # trailing "#" comments inside it are ignored by the regex engine.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
        |
            ((?P<type_talk>talks)) # We have a simple talk
        |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a plain string literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra format fields merged into a native download entry whose key
    # ('low' / 'medium' / 'high') matches.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object passed to ``q("<name>.init", ...)``.

        The JSON text is located in *webpage* with ``_search_regex`` and
        parsed with ``json.loads``.
        """
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch-page or playlist code path."""
        # No re.VERBOSE needed here: _VALID_URL starts with an inline (?x).
        mobj = re.match(self._VALID_URL, url)
        if mobj.group('type').startswith('embed'):
            # Rebuild the URL on the www host and delegate back to this IE.
            desktop_url = mobj.group('proto') + 'www' + mobj.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = mobj.group('name')
        if mobj.group('type_talk'):
            return self._talk_info(url, name)
        if mobj.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one url_result entry per talk."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _extract_talk_formats(self, talk_info, video_name):
        """Collect the (unsorted) format dicts described by *talk_info*.

        Sources read, each optional: ``nativeDownloads``, ``resources``
        (``h264`` / ``rtmp`` / ``hls`` entries) and ``audioDownload``.
        Entries lacking the URL they need are skipped instead of raising.
        """
        formats = []

        # Direct downloads keyed by quality name; null URLs are ignored.
        native_downloads = talk_info.get('nativeDownloads') or {}
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)

        resources_by_kind = talk_info.get('resources') or {}
        for format_id, resources in resources_by_kind.items():
            if format_id == 'h264':
                for resource in resources or []:
                    file_url = resource.get('file')
                    if not file_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': file_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
            elif format_id == 'rtmp':
                # RTMP entries share one streamer URL taken from talk_info.
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources or []:
                    play_path = resource.get('file')
                    if not play_path:
                        continue
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': play_path,
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                stream_url = (resources or {}).get('stream')
                if not stream_url:
                    # Nothing to download a manifest from.
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id))

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                # Flag the entry as audio-only (format-dict convention) so
                # it is not ranked as if it were a video format.
                'vcodec': 'none',
            })

        return formats

    def _talk_info(self, url, video_name):
        """Return the info dict for a single talk page.

        When the talk data has an ``external`` entry, a ``url``-type result
        pointing at the external video is returned instead (the ``code``
        value for YouTube, otherwise the ``uri`` value).
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        formats = self._extract_talk_formats(talk_info, video_name)
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        # The thumbnail is optional; a value without a scheme gets "http://".
        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Map each language code in *talk_info* to its subtitle entries.

        Every language gets two entries, one per format in ``ted`` and
        ``srt``. Returns an empty dict when no languages are listed.
        """
        sub_lang_list = {}
        for lang in talk_info.get('languages') or []:
            lang_code = lang['languageCode']
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Return the info dict for a ted.com/watch/... page.

        The video URL and thumbnail come from the ``pages.jwplayer`` JSON
        config embedded in the page; title and description are scraped
        from the HTML (the description is non-fatal if not found).
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # "image" may be absent or null; either way there is no thumbnail.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }