1 from __future__
import unicode_literals
6 from .common
import InfoExtractor
13 class TEDIE(InfoExtractor
):
16 (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
18 (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
20 ((?P<type_talk>talks)) # We have a simple talk
22 (?P<type_watch>watch)/[^/]+/[^/]+
24 (/lang/(.*?))? # The url may contain the language
25 /(?P<name>[\w-]+) # Here goes the name and then ".html"
29 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
30 'md5': 'fc94ac279feebbce69f21c0c6ee82810',
34 'title': 'The illusion of consciousness',
35 'description': ('Philosopher Dan Dennett makes a compelling '
36 'argument that not only don\'t we understand our own '
37 'consciousness, but that half the time our brains are '
38 'actively fooling us.'),
39 'uploader': 'Dan Dennett',
44 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
45 'md5': '226f4fb9c62380d11b7995efa4c87994',
47 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
49 'title': 'Vishal Sikka: The beauty and power of algorithms',
50 'thumbnail': 're:^https?://.+\.jpg',
51 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
54 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
58 'title': 'Be passionate. Be courageous. Be your best.',
59 'uploader': 'Gabby Giffords and Mark Kelly',
60 'description': 'md5:5174aed4d0f16021b704120360f72b92',
64 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
67 'title': 'Who are the hackers?',
69 'playlist_mincount': 6,
71 # contains a youtube video
72 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
73 'add_ie': ['Youtube'],
77 'title': 'Douglas Adams: Parrots the Universe and Everything',
78 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
79 'uploader': 'University of California Television (UCTV)',
80 'uploader_id': 'UCtelevision',
81 'upload_date': '20080522',
84 'skip_download': True,
88 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
89 'add_ie': ['Youtube'],
93 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
94 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
95 'uploader': 'TEDx Talks',
96 'uploader_id': 'TEDxTalks',
97 'upload_date': '20111216',
100 'skip_download': True,
105 'low': {'preference': 1, 'width': 320, 'height': 180}
,
106 'medium': {'preference': 2, 'width': 512, 'height': 288}
,
107 'high': {'preference': 3, 'width': 854, 'height': 480}
,
110 def _extract_info(self
, webpage
):
111 info_json
= self
._search
_regex
(r
'q\("\w+.init",({.+})\)</script>',
112 webpage
, 'info json')
113 return json
.loads(info_json
)
115 def _real_extract(self
, url
):
116 m
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
117 if m
.group('type').startswith('embed'):
118 desktop_url
= m
.group('proto') + 'www' + m
.group('urlmain')
119 return self
.url_result(desktop_url
, 'TED')
120 name
= m
.group('name')
121 if m
.group('type_talk'):
122 return self
._talk
_info
(url
, name
)
123 elif m
.group('type_watch'):
124 return self
._watch
_info
(url
, name
)
126 return self
._playlist
_videos
_info
(url
, name
)
def _playlist_videos_info(self, url, name):
    '''Returns the videos of the playlist

    Downloads the playlist page at *url* (using *name* as the identifier
    passed to ``_download_webpage``), decodes its embedded info object with
    ``_extract_info`` and returns ``self.playlist_result(...)`` built from
    one ``url_result`` entry per item in ``info['talks']``. The playlist id
    and title are read from ``info['playlist']``.
    '''
    webpage = self._download_webpage(
        url, name, 'Downloading playlist webpage')
    info = self._extract_info(webpage)
    playlist_info = info['playlist']

    # NOTE(review): the list binding and the positional ``entries`` argument
    # below are reconstructed; the visible text only had the bare
    # comprehension and the two keyword arguments. Confirm against upstream.
    # Each talk is referenced by its /talks/<slug> URL and tagged with this
    # extractor's own ie_key().
    entries = [
        self.url_result(
            'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
        for talk in info['talks']
    ]
    return self.playlist_result(
        entries,
        playlist_id=compat_str(playlist_info['id']),
        playlist_title=playlist_info['title'])
145 def _talk_info(self
, url
, video_name
):
146 webpage
= self
._download
_webpage
(url
, video_name
)
147 self
.report_extraction(video_name
)
149 talk_info
= self
._extract
_info
(webpage
)['talks'][0]
151 external
= talk_info
.get('external')
153 service
= external
['service']
154 self
.to_screen('Found video from %s' % service
)
156 if service
.lower() == 'youtube':
157 ext_url
= external
.get('code')
160 'url': ext_url
or external
['uri'],
165 'format_id': format_id
,
167 } for (format_id
, format_url
) in talk_info
['nativeDownloads'].items() if format_url
is not None]
170 finfo
= self
._NATIVE
_FORMATS
.get(f
['format_id'])
176 'format_id': f
['name'],
177 'url': talk_info
['streamer'],
178 'play_path': f
['file'],
181 'height': f
['height'],
183 } for f
in talk_info
['resources']['rtmp']]
184 self
._sort
_formats
(formats
)
186 video_id
= compat_str(talk_info
['id'])
188 thumbnail
= talk_info
['thumb']
189 if not thumbnail
.startswith('http'):
190 thumbnail
= 'http://' + thumbnail
193 'title': talk_info
['title'].strip(),
194 'uploader': talk_info
['speaker'],
195 'thumbnail': thumbnail
,
196 'description': self
._og
_search
_description
(webpage
),
197 'subtitles': self
._get
_subtitles
(video_id
, talk_info
),
199 'duration': talk_info
.get('duration'),
202 def _get_subtitles(self
, video_id
, talk_info
):
203 languages
= [lang
['languageCode'] for lang
in talk_info
.get('languages', [])]
209 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id
, l
, ext
),
212 for ext
in ['ted', 'srt']
218 def _watch_info(self
, url
, name
):
219 webpage
= self
._download
_webpage
(url
, name
)
221 config_json
= self
._html
_search
_regex
(
222 r
'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
224 config
= json
.loads(config_json
)['config']
225 video_url
= config
['video']['url']
226 thumbnail
= config
.get('image', {}).get('url')
228 title
= self
._html
_search
_regex
(
229 r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage
, 'title')
230 description
= self
._html
_search
_regex
(
232 r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
233 r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
235 webpage
, 'description', fatal
=False)
241 'thumbnail': thumbnail
,
242 'description': description
,