]>
Commit | Line | Data |
---|---|---|
605ec701 | 1 | # coding: utf-8 |
605ec701 P |
2 | from __future__ import unicode_literals |
3 | ||
958d0b65 | 4 | import hashlib |
73f9c286 | 5 | import itertools |
99709cc3 | 6 | import re |
605ec701 | 7 | import time |
958d0b65 YCH |
8 | |
9 | from .common import InfoExtractor | |
8e0548e1 | 10 | from ..compat import ( |
99709cc3 | 11 | compat_str, |
15707c7e | 12 | compat_urllib_parse_urlencode, |
8e0548e1 YCH |
13 | ) |
14 | from ..utils import ( | |
2644e911 | 15 | clean_html, |
f52354a8 | 16 | decode_packed_codes, |
2644e911 YCH |
17 | get_element_by_id, |
18 | get_element_by_attribute, | |
8e0548e1 | 19 | ExtractorError, |
99709cc3 | 20 | ohdave_rsa_encrypt, |
73f9c286 | 21 | remove_start, |
8e0548e1 | 22 | ) |
605ec701 | 23 | |
f1da8610 | 24 | |
99709cc3 YCH |
def md5_text(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()
27 | ||
28 | ||
class IqiyiSDK(object):
    """String-mangling primitives applied to a signing target.

    Every operation rewrites ``self.target`` in place.  Most of them first
    replace it with its MD5 hex digest and then mix in digits derived from
    ``self.ip`` (a dotted string of integers) or ``self.timestamp``.
    """

    def __init__(self, target, ip, timestamp):
        self.target = target
        self.ip = ip
        self.timestamp = timestamp

    @staticmethod
    def split_sum(data):
        """Return, as a string, the sum of the hexadecimal digits of ``data``."""
        return compat_str(sum(int(digit, 16) for digit in data))

    @staticmethod
    def digit_sum(num):
        """Return, as a string, the sum of the decimal digits of ``num``.

        ``num`` may be an int or an already stringified number.
        """
        digits = compat_str(num) if isinstance(num, int) else num
        return compat_str(sum(int(digit) for digit in digits))

    def even_odd(self):
        """Return digit sums of the even- and odd-indexed timestamp digits."""
        stamp = compat_str(self.timestamp)
        return self.digit_sum(stamp[::2]), self.digit_sum(stamp[1::2])

    def preprocess(self, chunksize):
        """Hash the target and cut the digest into ``chunksize``-wide pieces.

        Returns a ``(chunks, ip_parts)`` tuple where ``ip_parts`` is the IP
        address split on dots and converted to integers.
        """
        digest = md5_text(self.target)
        self.target = digest
        whole, leftover = divmod(32, chunksize)
        chunks = [digest[chunksize * n:chunksize * (n + 1)] for n in range(whole)]
        if leftover:
            # Trailing piece when 32 is not a multiple of the chunk size
            chunks.append(digest[32 - leftover:])
        octets = [int(part) for part in self.ip.split('.')]
        return chunks, octets

    def mod(self, modulus):
        """Append each IP part modulo ``modulus`` to the hashed target."""
        chunks, octets = self.preprocess(32)
        remainders = [compat_str(octet % modulus) for octet in octets]
        self.target = chunks[0] + ''.join(remainders)

    def split(self, chunksize):
        """Interleave digest chunks with reduced IP parts.

        For a chunk size of 8 the IP part precedes its chunk, otherwise it
        follows it.  Only the first four chunks get an IP part.
        """
        modulus_map = {
            4: 256,
            5: 10,
            8: 100,
        }

        chunks, octets = self.preprocess(chunksize)
        pieces = []
        for index, chunk in enumerate(chunks):
            if index < 4:
                ip_part = compat_str(octets[index] % modulus_map[chunksize])
            else:
                ip_part = ''
            if chunksize == 8:
                pieces.append(ip_part + chunk)
            else:
                pieces.append(chunk + ip_part)
        self.target = ''.join(pieces)

    def handle_input16(self):
        """Wrap the digest with the hex-digit sums of its two halves."""
        digest = md5_text(self.target)
        self.target = digest
        head_sum = self.split_sum(digest[:16])
        tail_sum = self.split_sum(digest[16:])
        self.target = head_sum + digest + tail_sum

    def handle_input8(self):
        """Prefix each 8-character quarter of the digest with its hex-digit sum."""
        digest = md5_text(self.target)
        self.target = digest
        quarters = [digest[start:start + 8] for start in range(0, 32, 8)]
        self.target = ''.join(
            self.split_sum(quarter) + quarter for quarter in quarters)

    def handleSum(self):
        """Prefix the digest with the sum of all its hex digits."""
        digest = md5_text(self.target)
        self.target = digest
        self.target = self.split_sum(digest) + digest

    def date(self, scheme):
        """Append date fields of the timestamp (local time) to the digest.

        ``scheme`` is a sequence of the letters ``y``, ``m`` and ``d`` giving
        the order in which year, month and day are appended.
        """
        self.target = md5_text(self.target)
        moment = time.localtime(self.timestamp)
        fields = {
            'y': compat_str(moment.tm_year),
            'm': '%02d' % moment.tm_mon,
            'd': '%02d' % moment.tm_mday,
        }
        self.target += ''.join(fields[letter] for letter in scheme)

    def split_time_even_odd(self):
        """Surround the digest with the odd sum (front) and even sum (back)."""
        even_sum, odd_sum = self.even_odd()
        self.target = odd_sum + md5_text(self.target) + even_sum

    def split_time_odd_even(self):
        """Surround the digest with the even sum (front) and odd sum (back)."""
        even_sum, odd_sum = self.even_odd()
        self.target = even_sum + md5_text(self.target) + odd_sum

    def split_ip_time_sum(self):
        """Surround the digest with the IP sum (front) and timestamp digit sum (back)."""
        chunks, octets = self.preprocess(32)
        ip_total = compat_str(sum(octets))
        self.target = ip_total + chunks[0] + self.digit_sum(self.timestamp)

    def split_time_ip_sum(self):
        """Surround the digest with the timestamp digit sum (front) and IP sum (back)."""
        chunks, octets = self.preprocess(32)
        time_total = self.digit_sum(self.timestamp)
        self.target = time_total + chunks[0] + compat_str(sum(octets))
122 | ||
class IqiyiSDKInterpreter(object):
    """Replay the chain of ``input=<name>(input`` calls found in SDK code.

    The SDK code is expected in the packed form understood by
    ``decode_packed_codes``; each call name found in the decoded text is
    mapped onto the matching ``IqiyiSDK`` operation.
    """

    def __init__(self, sdk_code):
        self.sdk_code = sdk_code

    def run(self, target, ip, timestamp):
        """Apply the SDK's transformation chain and return the final target.

        Raises ExtractorError if the decoded code references a function that
        has no counterpart in ``IqiyiSDK``.
        """
        # Decode into a local: overwriting self.sdk_code with the decoded text
        # would make a second run() try to unpack already-unpacked code.
        sdk_code = decode_packed_codes(self.sdk_code)

        functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', sdk_code)

        sdk = IqiyiSDK(target, ip, timestamp)

        # Functions that take no argument encoded in their name
        other_functions = {
            'handleSum': sdk.handleSum,
            'handleInput8': sdk.handle_input8,
            'handleInput16': sdk.handle_input16,
            'splitTimeEvenOdd': sdk.split_time_even_odd,
            'splitTimeOddEven': sdk.split_time_odd_even,
            'splitIpTimeSum': sdk.split_ip_time_sum,
            'splitTimeIpSum': sdk.split_time_ip_sum,
        }
        for function in functions:
            # modN / dateXYZ / splitN carry their argument as a name suffix
            if re.match(r'mod\d+', function):
                sdk.mod(int(function[3:]))
            elif re.match(r'date[ymd]{3}', function):
                sdk.date(function[4:])
            elif re.match(r'split\d+', function):
                sdk.split(int(function[5:]))
            elif function in other_functions:
                other_functions[function]()
            else:
                raise ExtractorError('Unknown function %s' % function)

        return sdk.target
156 | ||
157 | ||
605ec701 P |
class IqiyiIE(InfoExtractor):
    """Extractor for iqiyi.com / www.pps.tv video and album pages."""

    IE_NAME = 'iqiyi'
    IE_DESC = '爱奇艺'

    _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html'

    _NETRC_MACHINE = 'iqiyi'

    _TESTS = [{
        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
        # MD5 checksum differs on my machine and Travis CI
        'info_dict': {
            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
            'ext': 'mp4',
            'title': '美国德州空中惊现奇异云团 酷似UFO',
        }
    }, {
        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
        'md5': '667171934041350c5de3f5015f7f1152',
        'info_dict': {
            'id': 'e3f585b550a280af23c98b6cb2be19fb',
            'ext': 'mp4',
            'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇',
        },
        'skip': 'Geo-restricted to China',
    }, {
        'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',
        'only_matching': True,
    }, {
        'url': 'http://www.iqiyi.com/a_19rrhbc6kt.html',
        'only_matching': True,
    }, {
        'url': 'http://yule.iqiyi.com/pcb.html',
        'only_matching': True,
    }, {
        # VIP-only video. The first 2 parts (6 minutes) are available without login
        # MD5 sums omitted as values are different on Travis CI and my machine
        'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
        'info_dict': {
            'id': 'f3cf468b39dddb30d676f89a91200dc1',
            'ext': 'mp4',
            'title': '泰坦尼克号',
        },
        'skip': 'Geo-restricted to China',
    }, {
        'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
        'info_dict': {
            'id': '202918101',
            'title': '灌篮高手 国语版',
        },
        'playlist_count': 101,
    }, {
        'url': 'http://www.pps.tv/w_19rrbav0ph.html',
        'only_matching': True,
    }]

    # Maps the stream's 'vd' value to a format preference (higher is better)
    _FORMATS_MAP = {
        '96': 1,  # 216p, 240p
        '1': 2,  # 336p, 360p
        '2': 3,  # 480p, 504p
        '21': 4,  # 504p
        '4': 5,  # 720p
        '17': 5,  # 720p
        '5': 6,  # 1072p, 1080p
        '18': 7,  # 1080p
    }

    def _real_initialize(self):
        """Attempt to log in; the result of the attempt is not checked here."""
        self._login()

    @staticmethod
    def _rsa_fun(data):
        """Encrypt ``data`` with the site's RSA public key."""
        # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
        N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
        e = 65537

        return ohdave_rsa_encrypt(data, e, N)

    def _login(self):
        """Validate the configured credentials against kylin.iqiyi.com.

        Returns True when no credentials are configured or validation
        succeeds; otherwise reports a warning and returns False.
        """
        (username, password) = self._get_login_info()

        # No authentication to be performed
        if not username:
            return True

        data = self._download_json(
            'http://kylin.iqiyi.com/get_token', None,
            note='Get token for logging', errnote='Unable to get token for logging')
        sdk = data['sdk']
        timestamp = int(time.time())
        target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % (
            username, self._rsa_fun(password.encode('utf-8')))

        # The signature is computed by replaying the server-supplied SDK code
        interp = IqiyiSDKInterpreter(sdk)
        sign = interp.run(target, data['ip'], timestamp)

        validation_params = {
            'target': target,
            'server': 'BEA3AA1908656AABCCFF76582C4C6660',
            'token': data['token'],
            'bird_src': 'f8d91d57af224da7893dd397d52d811a',
            'sign': sign,
            'bird_t': timestamp,
        }
        validation_result = self._download_json(
            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None,
            note='Validate credentials', errnote='Unable to validate credentials')

        MSG_MAP = {
            'P00107': 'please login via the web interface and enter the CAPTCHA code',
            'P00117': 'bad username or password',
        }

        code = validation_result['code']
        if code != 'A00000':
            msg = MSG_MAP.get(code)
            if not msg:
                msg = 'error %s' % code
                if validation_result.get('msg'):
                    msg += ': ' + validation_result['msg']
            self._downloader.report_warning('unable to log in: ' + msg)
            return False

        return True

    def get_raw_data(self, tvid, video_id):
        """Download and return the stream metadata JSON for a video.

        The request is signed with ``sc``, the MD5 of the millisecond
        timestamp, a fixed key and ``tvid`` concatenated in that order.
        """
        tm = int(time.time() * 1000)

        key = 'd5fb4bd9d50c4be6948c97edd7254b0e'
        sc = md5_text(compat_str(tm) + key + tvid)
        params = {
            'tvid': tvid,
            'vid': video_id,
            'src': '76f90cbd92f94a2e925d83e8ccd22cb7',
            'sc': sc,
            't': tm,
        }

        return self._download_json(
            'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id),
            video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='),
            query=params, headers=self.geo_verification_headers())

    def _extract_playlist(self, webpage):
        """Return a playlist result for an album page, or None otherwise.

        A page counts as an album when it contains at least one
        ``site-piclist_pic_link`` anchor.
        """
        PAGE_SIZE = 50

        links = re.findall(
            r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"',
            webpage)
        if not links:
            return

        album_id = self._search_regex(
            r'albumId\s*:\s*(\d+),', webpage, 'album ID')
        album_title = self._search_regex(
            r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)

        entries = list(map(self.url_result, links))

        # Start from 2 because links in the first page are already on webpage
        for page_num in itertools.count(2):
            pagelist_page = self._download_webpage(
                'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
                album_id,
                note='Download playlist page %d' % page_num,
                errnote='Failed to download playlist page %d' % page_num)
            pagelist = self._parse_json(
                remove_start(pagelist_page, 'var tvInfoJs='), album_id)
            vlist = pagelist['data']['vlist']
            for item in vlist:
                entries.append(self.url_result(item['vurl']))
            # A short page means there are no further pages
            if len(vlist) < PAGE_SIZE:
                break

        return self.playlist_result(entries, album_id, album_title)

    def _real_extract(self, url):
        """Extract a playlist or a single video's info dict from ``url``.

        Raises ExtractorError when the metadata request reports an error
        code other than success.
        """
        webpage = self._download_webpage(
            url, 'temp_id', note='download video page')

        # There's no simple way to determine whether an URL is a playlist or not
        # So detect it
        playlist_result = self._extract_playlist(webpage)
        if playlist_result:
            return playlist_result

        tvid = self._search_regex(
            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
        video_id = self._search_regex(
            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')

        MAX_ATTEMPTS = 5

        formats = []
        for attempt in range(MAX_ATTEMPTS):
            raw_data = self.get_raw_data(tvid, video_id)

            if raw_data['code'] != 'A00000':
                if raw_data['code'] == 'A00111':
                    self.raise_geo_restricted()
                raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])

            data = raw_data['data']

            for stream in data['vidl']:
                # Only streams exposing an m3u8 URL are usable
                if 'm3utx' not in stream:
                    continue
                vd = compat_str(stream['vd'])
                formats.append({
                    'url': stream['m3utx'],
                    'format_id': vd,
                    'ext': 'mp4',
                    'preference': self._FORMATS_MAP.get(vd, -1),
                    'protocol': 'm3u8_native',
                })

            # Stop on success, and do not wait after the last attempt since
            # no further request will follow it
            if formats or attempt == MAX_ATTEMPTS - 1:
                break

            self._sleep(5, video_id)

        self._sort_formats(formats)
        title = (get_element_by_id('widget-videotitle', webpage) or
                 clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)))

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
        }