]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
[youtube] Include video Id in common error message (Fixes #2786)
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a9a3876d 6from .subtitles import SubtitlesInfoExtractor
9fd5ce0c 7
4ed3e510 8from ..utils import (
ca1fee34 9 compat_str,
4ed3e510
IM
10)
11
f853f859 12
class TEDIE(SubtitlesInfoExtractor):
    """Information extractor for ted.com talks, playlists and "watch" pages.

    The URL is classified by ``_VALID_URL`` and dispatched in
    ``_real_extract`` to one of ``_talk_info``, ``_watch_info`` or
    ``_playlist_videos_info``.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal string
            # literal and triggers a warning on recent Python versions.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'flv',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:d89e1d8ebafdac8e55df4c219ecdbfe9',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }]

    # Extra metadata merged into a native-download format whose id matches
    # one of these keys.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object embedded in *webpage*.

        The object is the second argument of the inline
        ``q("<name>.init", {...})</script>`` call found in the page source.
        """
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch-page or playlist extractor.

        ``embed.ted.com`` URLs are rewritten to their ``www`` equivalent and
        handed back as a URL result for this same extractor.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        # Neither a talk nor a watch page: the pattern only leaves playlists.
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk in the playlist."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Extract formats, subtitles and metadata for a single talk page.

        Direct ("native") downloads are preferred; when the page offers none,
        the rtmp resources are used instead.  Returns the info dict, or
        ``None`` when the ``listsubtitles`` option is set (in which case the
        subtitle listing helper is invoked and nothing is returned).
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # 'nativeDownloads' may be absent or null; treat both as "no native
        # downloads" so the rtmp fallback below gets a chance to run instead
        # of failing on ``None.items()``.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            known = self._NATIVE_FORMATS.get(format_id)
            if known:
                fmt.update(known)
            formats.append(fmt)

        if not formats:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        # The page may give the thumbnail without a scheme.
        if not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Map each language code listed in *talk_info* to its SRT subtitle URL.

        Emits a warning and returns an empty dict when the talk lists no
        languages.
        """
        languages = [
            lang['languageCode'] for lang in talk_info.get('languages', [])]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

        sub_lang_list = {}
        for lang_code in languages:
            sub_lang_list[lang_code] = (
                'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
                % (video_id, lang_code))
        return sub_lang_list

    def _watch_info(self, url, name):
        """Extract a single video from a ``/watch/`` page.

        The video URL and optional thumbnail come from the JSON stored in the
        page's ``data-config`` attribute; title and description are scraped
        from the HTML.  A missing description is not fatal.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }