]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ted.py
[generic] Modernize tests
[yt-dlp.git] / youtube_dl / extractor / ted.py
CommitLineData
f853f859
PH
1from __future__ import unicode_literals
2
9fd5ce0c
PH
3import json
4import re
5
a9a3876d 6from .subtitles import SubtitlesInfoExtractor
9fd5ce0c 7
4ed3e510 8from ..utils import (
ca1fee34 9 compat_str,
4ed3e510
IM
10)
11
f853f859 12
class TEDIE(SubtitlesInfoExtractor):
    """Information extractor for ted.com talks, playlists and "watch" pages.

    The kind of page is decided by which named group of ``_VALID_URL``
    matched: ``type_talk``, ``type_watch`` or ``type_playlist``. URLs on the
    ``embed`` host are redirected to their ``www`` counterpart.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is a regex escape, not a string escape.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }]

    # Extra metadata merged into a format dict when its format_id is one of
    # these keys; other format ids are left without this metadata.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object passed to ``q("<name>.init", ...)``.

        The JSON text is located in ``webpage`` with a regular expression
        and parsed with ``json.loads``.
        """
        # The dot before "init" is escaped so that it only matches a literal
        # "." instead of any character.
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch ``url`` to the talk, watch or playlist extraction."""
        # _VALID_URL already enables verbose mode through its inline (?x).
        m = re.match(self._VALID_URL, url)
        if m.group('type') == 'embed':
            # Rebuild the same URL on the "www" host and hand it back to
            # this extractor.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist'''
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Every talk becomes a URL result handled by this same extractor.
        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Build the info dict of a single talk page.

        Returns None (after listing the available subtitles) when the
        ``listsubtitles`` downloader parameter is set.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        formats = []
        for format_id, format_url in talk_info['nativeDownloads'].items():
            if format_url is None:
                # An entry without a URL cannot be downloaded; skip it.
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add preference/width/height for the known format ids.
            fmt.update(self._NATIVE_FORMATS.get(format_id, {}))
            formats.append(fmt)
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if thumbnail.startswith('//'):
            # Protocol-relative URL: only the scheme is missing.
            thumbnail = 'http:' + thumbnail
        elif not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Map each language code of the talk to its SRT subtitle URL.

        Emits a warning and returns an empty dict when ``talk_info`` lists
        no languages.
        """
        lang_codes = [
            lang['languageCode'] for lang in talk_info.get('languages', [])]
        if not lang_codes:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return dict(
            (code,
             'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
             % (video_id, code))
            for code in lang_codes)

    def _watch_info(self, url, name):
        """Build the info dict of a ted.com/watch/... page.

        The video URL and the optional thumbnail come from the JSON stored
        in the page's ``data-config`` attribute; title and description are
        scraped from the HTML. A missing description is not fatal.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }