]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
[niconico] Add support for channel video
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a9a3876d 6from .subtitles import SubtitlesInfoExtractor
9fd5ce0c 7
4ed3e510 8from ..utils import (
ca1fee34 9 compat_str,
4ed3e510
IM
10)
11
f853f859 12
class TEDIE(SubtitlesInfoExtractor):
    """Extractor for ted.com talks, playlists and "watch" pages.

    Which of the three handlers runs is decided by the named groups of
    ``_VALID_URL`` (``type_talk``, ``type_watch``, otherwise playlist).
    ``embed.ted.com`` URLs are redirected to their ``www.ted.com`` form.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': '49144e345a899b8cb34d315f3b9cfeeb',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }]

    # Extra metadata merged into a native download format whose
    # 'format_id' matches one of these keys.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object embedded in *webpage*.

        The object is the second argument of a ``q("<name>.init", {...})``
        call that is immediately followed by ``</script>``.
        """
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch or playlist handler."""
        # _VALID_URL already enables verbose mode through its inline (?x).
        m = re.match(self._VALID_URL, url)
        if m.group('type') == 'embed':
            # Rebuild the URL on the www host and hand it back to this
            # extractor.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist'''

        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Each talk is delegated back to this extractor through its slug.
        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Build the info dict for a single talk page.

        Native (HTTP) downloads are preferred; when none is usable the
        rtmp resources are used instead.  Returns ``None`` after listing
        the subtitles when the ``listsubtitles`` option is set.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # Tolerate a missing or null 'nativeDownloads' entry so that the
        # rtmp fallback below is still reached instead of raising.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items()
            if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail is optional metadata: do not abort the extraction
        # when it is missing, and only add a scheme when there is a value.
        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Map each language code in *talk_info* to its SRT subtitle URL.

        Emits a warning and returns an empty dict when *talk_info* lists
        no languages.
        """
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for lang_code in languages:
                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, lang_code)
                sub_lang_list[lang_code] = url
            return sub_lang_list
        else:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

    def _watch_info(self, url, name):
        """Build the info dict for a ``/watch/...`` page.

        The video URL and optional thumbnail come from the JSON stored in
        the page's ``data-config`` attribute; title and description are
        scraped from the HTML (the description is non-fatal).
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }