]>
Commit | Line | Data |
---|---|---|
9fd5ce0c PH |
1 | import json |
2 | import re | |
3 | ||
4 | from .common import InfoExtractor | |
5 | ||
6 | ||
class TEDIE(InfoExtractor):
    """Extractor for talks and playlists hosted on www.ted.com.

    A URL is classified by ``_VALID_URL`` as either a single talk
    (``/talks/<name>``) or a playlist (``/playlists/<id>/<name>``), with an
    optional ``/lang/<code>`` component in between.
    """

    # Verbose-mode pattern: it contains layout whitespace and inline comments,
    # so it must always be matched with ``re.VERBOSE``.
    # Both http and https URLs are accepted.
    _VALID_URL = r'''https?://www\.ted\.com/
                   (
                    ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                    |
                    ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden here because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch *url* to the talk or the playlist handler.

        Returns a one-element list holding either the info dict built by
        ``_talk_info`` or the result of ``_playlist_videos_info``.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist

        url         -- address of the playlist page to download
        name        -- playlist name taken from the URL (not used in the body)
        playlist_id -- identifier passed to the download helper and to the
                       resulting playlist

        The return value is whatever ``self.playlist_result`` builds from one
        ``self.url_result`` entry per talk link found on the page.
        '''
        # NOTE(review): inside a character class "." is a literal dot, so
        # "[.\s]*?" only spans dots and whitespace, not arbitrary text. It is
        # left as-is because the number of matches bounds the zip() below and
        # the page markup cannot be checked from here.
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # The dot before "html" is escaped so that it only matches a real
        # ".html" suffix instead of any character.
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+)\.html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        playlist_entries = []
        # The <li> matches are only used to limit how many title links are
        # consumed; the talk address itself comes from the title link.
        for _m_video, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(
            playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url

        url      -- address of the talk page; must match ``_VALID_URL``
        video_id -- identifier handed to the download helper

        Returns a dict with the keys 'id', 'url', 'ext', 'title',
        'thumbnail' and 'description'.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(
            url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        # Non-greedy so the capture stops at the first closing </span>.
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*?)</span>',
            webpage, 'title')
        json_data = self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        talk_details = json.loads(json_data)
        desc = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags=re.DOTALL)

        thumbnail = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
        return {
            'id': talk_details['id'],
            # The last entry of 'htmlStreams' is the one that gets downloaded.
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }