]>
Commit | Line | Data |
---|---|---|
24a267b5 JMF |
1 | # coding: utf-8 |
2 | ||
8bdfddf6 PH |
3 | from __future__ import unicode_literals |
4 | ||
9caa687d | 5 | import re |
24a267b5 | 6 | import json |
9caa687d YK |
7 | |
8 | from .common import InfoExtractor | |
9 | ||
10 | ||
class TudouIE(InfoExtractor):
    """Extractor for video pages on tudou.com.

    A page either embeds a Youku video (signalled by a ``vcode`` field in
    the page source), in which case extraction is delegated to the Youku
    extractor, or it lists its own segments in a ``segs`` JSON blob, in
    which case every segment of the best numeric quality is resolved to a
    direct URL and the result is returned as a ``multi_video`` playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
    _TESTS = [{
        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
        'md5': '140a49ed444bd22f93330985d8475fcb',
        'info_dict': {
            'id': '159448201',
            'ext': 'f4v',
            'title': '卡马乔国足开大脚长传冲吊集锦',
            # Raw string: "\." is a regex escape, not a string escape.
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
        'info_dict': {
            'id': '117049447',
            'ext': 'f4v',
            'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }]

    def _url_for_id(self, id, quality=None):
        """Resolve a segment id to its final media URL.

        Downloads ``http://v2.tudou.com/f?id=<id>`` (with the quality
        suffix appended when ``quality`` is truthy) and returns the text
        found immediately before the closing ``</f>`` tag of the response.

        The parameter name ``id`` shadows the builtin; it is kept so that
        keyword callers keep working.
        """
        # '%s' formatting instead of str(): with unicode_literals, str() on a
        # non-ASCII unicode id raises UnicodeEncodeError under Python 2.
        info_url = 'http://v2.tudou.com/f?id=' + '%s' % id
        if quality:
            # NOTE(review): this yields "&hd<quality>" with no "=" sign;
            # presumably that is what the endpoint expects - confirm.
            info_url += '&hd' + quality
        webpage = self._download_webpage(info_url, id, "Opening the info webpage")
        return self._html_search_regex('>(.+?)</f>', webpage, 'video url')

    def _real_extract(self, url):
        """Extract the video described by ``url``.

        Returns either a ``url`` result pointing at the Youku extractor
        (when the page carries a non-empty ``vcode``) or a ``multi_video``
        result with one entry per segment of the highest numeric quality.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Pages that carry a vcode are handed over to the Youku extractor.
        vcode_match = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
        if vcode_match and vcode_match.group(1):
            return {
                '_type': 'url',
                'url': 'youku:' + vcode_match.group(1),
                'ie_key': 'Youku'
            }

        title = self._search_regex(
            r",kw:\s*['\"](.+?)[\"']", webpage, 'title')
        # The thumbnail is optional: a missing "pic" field is not fatal.
        thumbnail_url = self._search_regex(
            r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False)

        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
        segments = json.loads(segs_json)

        # It looks like the keys are the arguments that have to be passed as
        # the hd field in the request url, so we pick the highest one.
        # Non-numeric qualities are filtered out (see issue #3643).
        numeric_qualities = [key for key in segments if key.isdigit()]
        quality = max(numeric_qualities, key=int)

        parts = segments[quality]
        len_parts = len(parts)
        if len_parts > 1:
            self.to_screen('%s: found %s parts' % (video_id, len_parts))

        entries = []
        for part in parts:
            part_id = part['k']
            final_url = self._url_for_id(part_id, quality)
            # The extension is whatever follows the last dot of the URL once
            # the query string has been stripped.
            ext = final_url.split('?')[0].split('.')[-1]
            entries.append({
                'id': '%s' % part_id,
                'url': final_url,
                'ext': ext,
                'title': title,
                'thumbnail': thumbnail_url,
            })

        return {
            '_type': 'multi_video',
            'entries': entries,
            'id': video_id,
            'title': title,
        }