]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
[rts] Update test
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a9a3876d 6from .subtitles import SubtitlesInfoExtractor
9fd5ce0c 7
4ed3e510 8from ..utils import (
ca1fee34 9 compat_str,
4ed3e510
IM
10)
11
f853f859 12
class TEDIE(SubtitlesInfoExtractor):
    """Information extractor for ted.com talk, playlist and "watch" pages.

    ``_real_extract`` matches the URL against ``_VALID_URL`` and dispatches
    to ``_talk_info``, ``_watch_info`` or ``_playlist_videos_info``.  URLs on
    the ``embed`` host are first redirected to the equivalent ``www`` URL.
    """

    # Verbose-mode pattern ((?x)): whitespace and "#" comments are ignored.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is a regex escape, not a Python string escape.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }]

    # Value stored as each format's 'preference' in _talk_info, keyed by the
    # format id found in the talk's 'nativeDownloads'; unknown ids get -1.
    _FORMATS_PREFERENCE = {
        'low': 1,
        'medium': 2,
        'high': 3,
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object embedded in *webpage*.

        The object is the second argument of the inline
        ``q("<name>.init", {...})</script>`` call.  How a missing match is
        reported depends on the inherited ``_search_regex`` (not visible
        here).
        """
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the handler matching its type."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # Rebuild the URL on the "www" host and hand it back to this
            # extractor.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        # Only the playlist alternative of _VALID_URL remains.
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Return the info dict for a single talk page.

        Returns ``None`` (after listing the available subtitles) when the
        downloader's ``listsubtitles`` parameter is set.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        # Only the first entry of 'talks' is used.
        talk_info = self._extract_info(webpage)['talks'][0]

        formats = [{
            'ext': 'mp4',
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
            'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
        } for (format_id, format_url) in talk_info['nativeDownloads'].items()]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
            # The value has no scheme here, so prepend one.
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Return a dict mapping language code to SRT subtitle URL.

        Language codes come from the ``languageCode`` field of each entry in
        ``talk_info['languages']``.  When there are none, a warning is
        reported and an empty dict is returned.
        """
        # "or []" also covers a 'languages' key that is present but None.
        languages = [
            lang['languageCode']
            for lang in talk_info.get('languages') or []
        ]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

        # dict() over a generator rather than a dict comprehension, to avoid
        # introducing syntax the file does not already use.
        return dict(
            (lang_code,
             'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (
                 video_id, lang_code))
            for lang_code in languages)

    def _watch_info(self, url, name):
        """Return the info dict for a ``/watch/...`` page.

        The video URL and optional thumbnail are read from the JSON in the
        page's ``data-config`` attribute; the title and description are
        taken from the HTML.  The description lookup is non-fatal.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        # Thumbnail is optional: None when 'image' or its 'url' is missing.
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }