]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
[daum] Support non-numeric video IDs (Fixes #3749)
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a9a3876d 6from .subtitles import SubtitlesInfoExtractor
9fd5ce0c 7
4ed3e510 8from ..utils import (
ca1fee34 9 compat_str,
4ed3e510
IM
10)
11
f853f859 12
class TEDIE(SubtitlesInfoExtractor):
    '''Information extractor for ted.com talk, playlist and "watch" pages.

    URLs on the "embed" host are redirected to the equivalent "www" URL
    and handled by this same extractor.
    '''

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }]

    # Extra fields merged into a native-download format, keyed by format id.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON object passed to q("<name>.init", ...).

        webpage is the HTML source of a talk or playlist page.
        '''
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Dispatch url to the talk, watch or playlist handler.'''
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # Rebuild the same path on the "www" host and hand it back to
            # this extractor.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist'''
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Every talk is delegated back to this extractor through its slug.
        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_formats(self, talk_info):
        '''Build the unsorted list of format dicts for one talk.

        Native downloads with a non-null URL are preferred; when there are
        none, the RTMP resources are used instead.
        '''
        # Treat a missing or null 'nativeDownloads' like an empty mapping so
        # the RTMP fallback below is used instead of raising.
        native_downloads = talk_info.get('nativeDownloads') or {}

        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add preference/width/height for the format ids we know about.
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)
        if formats:
            return formats

        # Use rtmp downloads
        return [{
            'format_id': f['name'],
            'url': talk_info['streamer'],
            'play_path': f['file'],
            'ext': 'flv',
            'width': f['width'],
            'height': f['height'],
            'tbr': f['bitrate'],
        } for f in talk_info['resources']['rtmp']]

    def _talk_info(self, url, video_name):
        '''Extract the info dict of a single talk page.

        Returns None after listing the subtitles when the 'listsubtitles'
        option is set.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        formats = self._talk_formats(talk_info)
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
            # The thumbnail is given without a scheme; add one.
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        '''Return a dict mapping language code to SRT subtitle URL.

        Warns and returns an empty dict when talk_info lists no languages.
        '''
        languages = [
            lang['languageCode'] for lang in talk_info.get('languages', [])]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        # dict() over a generator rather than a dict comprehension, to stay
        # usable on interpreters without dict comprehensions.
        return dict(
            (code,
             'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, code))
            for code in languages)

    def _watch_info(self, url, name):
        '''Extract the info dict of a "watch" page; name is used as the id.'''
        webpage = self._download_webpage(url, name)

        # The player configuration is a JSON document stored in a
        # single-quoted data-config attribute.
        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        # The description is optional (fatal=False); two page layouts are
        # tried in order.
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }