1 from __future__
import unicode_literals
6 from .common
import InfoExtractor
8 from ..compat
import compat_str
15 class TEDIE(InfoExtractor
):
19 (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
21 (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
23 ((?P<type_talk>talks)) # We have a simple talk
25 (?P<type_watch>watch)/[^/]+/[^/]+
27 (/lang/(.*?))? # The url may contain the language
28 /(?P<name>[\w-]+) # Here goes the name and then ".html"
32 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
33 'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
37 'title': 'The illusion of consciousness',
38 'description': ('Philosopher Dan Dennett makes a compelling '
39 'argument that not only don\'t we understand our own '
40 'consciousness, but that half the time our brains are '
41 'actively fooling us.'),
42 'uploader': 'Dan Dennett',
47 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
48 'md5': 'b899ac15e345fb39534d913f7606082b',
52 'title': 'Vishal Sikka: The beauty and power of algorithms',
53 'thumbnail': r
're:^https?://.+\.jpg',
54 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
55 'upload_date': '20140122',
56 'uploader_id': 'TEDInstitute',
57 'uploader': 'TED Institute',
59 'add_ie': ['Youtube'],
61 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
62 'md5': '71b3ab2f4233012dce09d515c9c39ce2',
66 'title': 'Be passionate. Be courageous. Be your best.',
67 'uploader': 'Gabby Giffords and Mark Kelly',
68 'description': 'md5:5174aed4d0f16021b704120360f72b92',
72 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
75 'title': 'Who are the hackers?',
77 'playlist_mincount': 6,
79 # contains a youtube video
80 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
81 'add_ie': ['Youtube'],
85 'title': 'Douglas Adams: Parrots the Universe and Everything',
86 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
87 'uploader': 'University of California Television (UCTV)',
88 'uploader_id': 'UCtelevision',
89 'upload_date': '20080522',
92 'skip_download': True,
96 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
97 'add_ie': ['Youtube'],
101 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
102 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
103 'uploader': 'TEDx Talks',
104 'uploader_id': 'TEDxTalks',
105 'upload_date': '20111216',
108 'skip_download': True,
112 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
116 'title': 'The orchestra in my mouth',
117 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
118 'uploader': 'Tom Thum',
121 'skip_download': True,
126 'low': {'width': 320, 'height': 180}
,
127 'medium': {'width': 512, 'height': 288}
,
128 'high': {'width': 854, 'height': 480}
,
def _extract_info(self, webpage):
    '''Return the decoded JSON object handed to the page's q("<name>.init", {...}) call.

    The page source is searched for an inline script call of the form
    q("<name>.init", {...}) that is immediately followed by </script>.
    The brace-delimited argument is captured and decoded with json.loads.

    webpage -- the page source to search.

    NOTE(review): what happens when the pattern is absent is decided by
    _search_regex, which is defined outside this block -- confirm there.
    '''
    # (?s) lets the greedy {.+} capture span newlines, so a multi-line JSON
    # payload is taken up to the last "})" that precedes </script>.
    # NOTE(review): the "." in front of "init" is unescaped, so it matches
    # any single character rather than only a literal dot.
    init_call_pattern = r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>'
    raw_payload = self._search_regex(init_call_pattern, webpage, 'info json')
    return json.loads(raw_payload)
def _real_extract(self, url):
    '''Route a TED URL to the talk, watch-page or playlist handler.

    The URL is matched against _VALID_URL in verbose mode. Embed URLs are
    not extracted here; they are rewritten onto the "www" host and returned
    through url_result. For every other URL the named groups "type_talk"
    and "type_watch" select the handler; anything else is treated as a
    playlist.

    url -- the URL to process.

    NOTE(review): the match object is used without a None check, so the
    URL is presumably already known to match _VALID_URL -- confirm against
    the caller.
    '''
    match = re.match(self._VALID_URL, url, re.VERBOSE)

    if match.group('type').startswith('embed'):
        # Rebuild the address with the "www" host in place of the embed host
        # and hand it back tagged 'TED'.
        www_url = match.group('proto') + 'www' + match.group('urlmain')
        return self.url_result(www_url, 'TED')

    name = match.group('name')

    # Pick the handler first, then make a single call; all three handlers
    # take the same (url, name) arguments.
    if match.group('type_talk'):
        handler = self._talk_info
    elif match.group('type_watch'):
        handler = self._watch_info
    else:
        handler = self._playlist_videos_info
    return handler(url, name)
150 def _playlist_videos_info(self
, url
, name
):
151 '''Returns the videos of the playlist'''
153 webpage
= self
._download
_webpage
(url
, name
,
154 'Downloading playlist webpage')
155 info
= self
._extract
_info
(webpage
)
157 playlist_info
= try_get(
158 info
, lambda x
: x
['__INITIAL_DATA__']['playlist'],
159 dict) or info
['playlist']
162 self
.url_result('http://www.ted.com/talks/' + talk
['slug'], self
.ie_key())
164 info
, lambda x
: x
['__INITIAL_DATA__']['talks'],
165 dict) or info
['talks']
167 return self
.playlist_result(
169 playlist_id
=compat_str(playlist_info
['id']),
170 playlist_title
=playlist_info
['title'])
172 def _talk_info(self
, url
, video_name
):
173 webpage
= self
._download
_webpage
(url
, video_name
)
175 info
= self
._extract
_info
(webpage
)
178 info
, lambda x
: x
['__INITIAL_DATA__']['talks'][0],
179 dict) or info
['talks'][0]
181 title
= talk_info
['title'].strip()
183 external
= talk_info
.get('external')
185 service
= external
['service']
186 self
.to_screen('Found video from %s' % service
)
188 if service
.lower() == 'youtube':
189 ext_url
= external
.get('code')
192 'url': ext_url
or external
['uri'],
195 native_downloads
= try_get(
197 (lambda x
: x
['downloads']['nativeDownloads'],
198 lambda x
: x
['nativeDownloads']),
203 'format_id': format_id
,
205 } for (format_id
, format_url
) in native_downloads
.items() if format_url
is not None]
208 finfo
= self
._NATIVE
_FORMATS
.get(f
['format_id'])
212 player_talk
= talk_info
['player_talks'][0]
214 resources_
= player_talk
.get('resources') or talk_info
.get('resources')
217 for format_id
, resources
in resources_
.items():
218 if format_id
== 'h264':
219 for resource
in resources
:
220 h264_url
= resource
.get('file')
223 bitrate
= int_or_none(resource
.get('bitrate'))
226 'format_id': '%s-%sk' % (format_id
, bitrate
),
229 if re
.search(r
'\d+k', h264_url
):
231 elif format_id
== 'rtmp':
232 streamer
= talk_info
.get('streamer')
235 for resource
in resources
:
237 'format_id': '%s-%s' % (format_id
, resource
.get('name')),
239 'play_path': resource
['file'],
241 'width': int_or_none(resource
.get('width')),
242 'height': int_or_none(resource
.get('height')),
243 'tbr': int_or_none(resource
.get('bitrate')),
245 elif format_id
== 'hls':
246 formats
.extend(self
._extract
_m
3u8_formats
(
247 resources
.get('stream'), video_name
, 'mp4', m3u8_id
=format_id
, fatal
=False))
249 m3u8_formats
= list(filter(
250 lambda f
: f
.get('protocol') == 'm3u8' and f
.get('vcodec') != 'none',
253 for m3u8_format
in m3u8_formats
:
254 bitrate
= self
._search
_regex
(r
'(\d+k)', m3u8_format
['url'], 'bitrate', default
=None)
257 f
= m3u8_format
.copy()
259 'url': re
.sub(r
'\d+k', bitrate
, http_url
),
260 'format_id': m3u8_format
['format_id'].replace('hls', 'http'),
265 audio_download
= talk_info
.get('audioDownload')
268 'url': audio_download
,
269 'format_id': 'audio',
273 self
._sort
_formats
(formats
)
275 video_id
= compat_str(talk_info
['id'])
280 'uploader': player_talk
.get('speaker') or talk_info
.get('speaker'),
281 'thumbnail': player_talk
.get('thumb') or talk_info
.get('thumb'),
282 'description': self
._og
_search
_description
(webpage
),
283 'subtitles': self
._get
_subtitles
(video_id
, talk_info
),
285 'duration': talk_info
.get('duration'),
288 def _get_subtitles(self
, video_id
, talk_info
):
290 for language
in try_get(
292 (lambda x
: x
['downloads']['languages'],
293 lambda x
: x
['languages']), list):
294 lang_code
= language
.get('languageCode') or language
.get('ianaCode')
297 sub_lang_list
[lang_code
] = [
299 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id
, lang_code
, ext
),
302 for ext
in ['ted', 'srt']
306 def _watch_info(self
, url
, name
):
307 webpage
= self
._download
_webpage
(url
, name
)
309 config_json
= self
._html
_search
_regex
(
310 r
'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
311 webpage
, 'config', default
=None)
313 embed_url
= self
._search
_regex
(
314 r
"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage
, 'embed url')
315 return self
.url_result(self
._proto
_relative
_url
(embed_url
))
316 config
= json
.loads(config_json
)['config']
317 video_url
= config
['video']['url']
318 thumbnail
= config
.get('image', {}).get('url')
320 title
= self
._html
_search
_regex
(
321 r
"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage
, 'title')
322 description
= self
._html
_search
_regex
(
324 r
'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
325 r
'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
327 webpage
, 'description', fatal
=False)
333 'thumbnail': thumbnail
,
334 'description': description
,