]>
Commit | Line | Data |
---|---|---|
# encoding: utf-8

import re
import json
import time
import logging
import urllib2

from .common import InfoExtractor
from ..utils import compat_urllib_request
11 | ||
12 | ||
class SohuIE(InfoExtractor):
    """Extractor for videos hosted on tv.sohu.com.

    Downloads the watch page, scrapes the numeric ``vid``, asks the
    vrs_flash JSON API for the highest-definition variant, and returns
    one info dict per physical file part of the video.
    """

    _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'

    _TEST = {
        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
        u'file': u'382479172.flv',
        u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b',
        u'info_dict': {
            u'title': u'The Illest - Far East Movement Riff Raff',
        },
    }

    def _clean_html(self, string):
        """Strip HTML tags from *string* and collapse whitespace runs.

        Tags are replaced with a single space so adjacent words do not
        fuse together; whitespace is then collapsed in two passes (the
        first pass of replacements can create new adjacent runs).
        """
        for tag in re.findall(r'<.+?>', string):
            string = string.replace(tag, ' ')
        for _ in range(2):
            for run in re.findall(r'\s+', string):
                string = string.replace(run, ' ')
        return string.strip()

    # Backward-compatible alias for the original (misspelled) name.
    _clearn_html = _clean_html

    def _real_extract(self, url):
        """Return a list of info dicts, one per part, or None on failure."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # Title lives inside <h1 id="video-title">; strip the markup
        # and surrounding whitespace afterwards.
        pattern = re.compile(r'<h1 id="video-title">\n*?(.+?)\n*?</h1>', re.DOTALL)
        title = self._search_regex(pattern, webpage, u'video title').strip('\t\n')
        title = self._clean_html(title)

        result = re.search(r'var vid="(\d+)"', webpage)
        if not result:
            logging.info('[Sohu] could not get vid')
            return None
        vid = result.group(1)
        logging.info('vid: %s' % vid)

        base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
        url_1 = base_url_1 + vid
        logging.info('json url: %s' % url_1)
        # Use the compat helper already imported by this module instead
        # of calling urllib2 directly (consistency with the file header).
        json_1 = json.loads(compat_urllib_request.urlopen(url_1).read())

        # Pick the vid of the highest available definition; the API
        # reports 0 for definitions that do not exist.
        qualities = ('oriVid', 'superVid', 'highVid', 'norVid')
        clearest_vid = 0
        for quality in qualities:
            candidate = json_1['data'][quality]
            if candidate != 0:
                clearest_vid = candidate
                logging.info('quality definition: %s' % quality[:-3])
                break
        if not clearest_vid:
            logging.warning('could not find valid clearest_vid')
            return None

        # BUG FIX: the original compared the str `vid` against the
        # JSON (int) value, which is always unequal in Python, so the
        # JSON was unconditionally re-downloaded. Compare as strings.
        if vid != str(clearest_vid):
            url_1 = '%s%s' % (base_url_1, clearest_vid)
            logging.info('highest definition json url: %s' % url_1)
            json_1 = json.loads(compat_urllib_request.urlopen(url_1).read())

        allot = json_1['allot']
        prot = json_1['prot']
        clipsURL = json_1['data']['clipsURL']
        su = json_1['data']['su']
        num_of_parts = json_1['data']['totalBlocks']
        logging.info('Total parts: %d' % num_of_parts)

        files_info = []
        for i in range(num_of_parts):
            # The "middle" URL answers with '|'-separated fields:
            # field 0 is the host prefix, field 3 is the access key.
            middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i])
            logging.info('middle url part %d: %s' % (i, middle_url))
            middle_info = compat_urllib_request.urlopen(middle_url).read().split('|')
            download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3])

            files_info.append({
                'id': '%s_part%02d' % (video_id, i + 1),
                'title': title,
                'url': download_url,
                'ext': 'mp4',
            })
            # Be gentle with the server between per-part requests.
            time.sleep(1)

        return files_info