]>
Commit | Line | Data |
---|---|---|
9fd5ce0c PH |
1 | import json |
2 | import re | |
3 | ||
4 | from .common import InfoExtractor | |
5 | ||
6 | ||
class TEDIE(InfoExtractor):
    """Extractor for single talks and playlists hosted on www.ted.com.

    A URL is classified by ``_VALID_URL`` as either a talk
    (``/talks/<name>``) or a playlist (``/playlists/<id>/<name>``), with an
    optional ``/lang/<code>`` segment before the name.  Talks yield a single
    info dictionary; playlists yield a playlist result whose entries are URL
    results pointing back at the individual talk pages.

    The downloading, regex-search and result-building helpers used below
    (``_download_webpage``, ``_search_regex``, ``_html_search_regex``,
    ``url_result``, ``playlist_result``, ``to_screen``,
    ``report_extraction``) are not defined in this class; they are expected
    to be provided by ``InfoExtractor``.
    """

    # Written in verbose form: it must always be matched with re.VERBOSE,
    # otherwise the embedded whitespace and "#" comments become part of the
    # pattern.
    _VALID_URL = r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                    |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''
    _TEST = {
        u'url': u'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        u'file': u'102.mp4',
        u'md5': u'2d76ee1576672e0bd8f187513267adf6',
        u'info_dict': {
            u"description": u"md5:c6fa72e6eedbd938c9caf6b2702f5922",
            u"title": u"Dan Dennett: The illusion of consciousness"
        }
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch ``url`` to the talk or the playlist extraction path.

        Returns a one-element list holding either the talk's info dictionary
        or the playlist result.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): inside a character class "." is a literal dot, so
        # "[.\s]*?" only skips dots and whitespace, not arbitrary text.
        # Left unchanged because the page markup is not visible here --
        # confirm whether "any character" was intended.
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # The dot of the ".html" suffix is escaped so that it only matches a
        # literal "." rather than any character.
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+)\.html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        # Only the title match is read; pairing it with the <li> matches
        # caps the number of entries at the shorter of the two sequences.
        for m_video, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading "%s" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        # Kept under its own name so the parsed page data is not confused
        # with the info dictionary returned below.
        talk_details = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags=re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        return {
            'id': talk_details['id'],
            # The last entry of 'htmlStreams' is the one that gets downloaded.
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }