]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/iqiyi.py
Add support for https for all extractors as preventive and future-proof measure
[yt-dlp.git] / youtube_dl / extractor / iqiyi.py
CommitLineData
605ec701 1# coding: utf-8
605ec701
P
2from __future__ import unicode_literals
3
958d0b65 4import hashlib
73f9c286 5import itertools
958d0b65 6import math
8e0548e1 7import os
958d0b65 8import random
99709cc3 9import re
605ec701 10import time
605ec701 11import uuid
958d0b65
YCH
12
13from .common import InfoExtractor
8e0548e1
YCH
14from ..compat import (
15 compat_parse_qs,
99709cc3 16 compat_str,
8e0548e1
YCH
17 compat_urllib_parse,
18 compat_urllib_parse_urlparse,
19)
20from ..utils import (
f52354a8 21 decode_packed_codes,
8e0548e1 22 ExtractorError,
99709cc3 23 ohdave_rsa_encrypt,
73f9c286 24 remove_start,
8e0548e1
YCH
25 sanitized_Request,
26 urlencode_postdata,
27 url_basename,
28)
605ec701 29
f1da8610 30
99709cc3
YCH
def md5_text(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()
33
34
class IqiyiSDK(object):
    """String transformations applied to a signing target.

    Every transformation rewrites ``self.target`` in place, combining an MD5
    digest of the current target with values derived from ``self.ip`` (a
    dotted string of integers) and ``self.timestamp``.
    """

    def __init__(self, target, ip, timestamp):
        self.target = target
        self.ip = ip
        self.timestamp = timestamp

    @staticmethod
    def split_sum(data):
        """Return, as a string, the sum of the hex digits in ``data``."""
        return compat_str(sum(int(char, 16) for char in data))

    @staticmethod
    def digit_sum(num):
        """Return, as a string, the sum of the decimal digits of ``num``.

        ``num`` may be an int or an iterable of digit characters.
        """
        digits = compat_str(num) if isinstance(num, int) else num
        return compat_str(sum(int(digit) for digit in digits))

    def even_odd(self):
        """Return the digit sums of the even- and odd-indexed timestamp digits."""
        stamp = compat_str(self.timestamp)
        return self.digit_sum(stamp[::2]), self.digit_sum(stamp[1::2])

    def preprocess(self, chunksize):
        """Hash the target and cut the digest into ``chunksize``-wide pieces.

        ``self.target`` is replaced by its MD5 hex digest (32 characters).
        Returns ``(chunks, ip)`` where ``ip`` is the list of integers parsed
        from ``self.ip``; a trailing shorter chunk is appended when
        ``chunksize`` does not divide 32.
        """
        digest = md5_text(self.target)
        self.target = digest
        chunks = [
            digest[chunksize * index:chunksize * (index + 1)]
            for index in range(32 // chunksize)]
        leftover = 32 % chunksize
        if leftover:
            chunks.append(digest[32 - leftover:])
        return chunks, [int(octet) for octet in self.ip.split('.')]

    def mod(self, modulus):
        """Set the target to its digest followed by each IP part mod ``modulus``."""
        chunks, octets = self.preprocess(32)
        suffix = ''.join(compat_str(octet % modulus) for octet in octets)
        self.target = chunks[0] + suffix

    def split(self, chunksize):
        """Interleave digest chunks with reduced IP parts.

        Only chunk sizes 4, 5 and 8 have a modulus defined; any other value
        raises KeyError.  For chunk size 8 the IP part precedes its chunk,
        otherwise it follows it.  Only the first four chunks get an IP part.
        """
        modulus_map = {
            4: 256,
            5: 10,
            8: 100,
        }

        chunks, octets = self.preprocess(chunksize)
        modulus = modulus_map[chunksize]
        ip_first = chunksize == 8
        pieces = []
        for index, chunk in enumerate(chunks):
            if index < 4:
                ip_part = compat_str(octets[index] % modulus)
            else:
                ip_part = ''
            pieces.append(ip_part + chunk if ip_first else chunk + ip_part)
        self.target = ''.join(pieces)

    def handle_input16(self):
        """Wrap the digest with the hex-digit sums of its two 16-char halves."""
        digest = md5_text(self.target)
        head_sum = self.split_sum(digest[:16])
        tail_sum = self.split_sum(digest[16:])
        self.target = head_sum + digest + tail_sum

    def handle_input8(self):
        """Prefix each 8-char quarter of the digest with its hex-digit sum."""
        digest = md5_text(self.target)
        quarters = [digest[8 * index:8 * (index + 1)] for index in range(4)]
        self.target = ''.join(
            self.split_sum(quarter) + quarter for quarter in quarters)

    def handleSum(self):
        """Prefix the digest with the sum of all its hex digits."""
        digest = md5_text(self.target)
        self.target = self.split_sum(digest) + digest

    def date(self, scheme):
        """Append date fields of the timestamp to the digest.

        ``scheme`` is a sequence of the letters ``y``, ``m`` and ``d``; each
        letter appends the year, the zero-padded month or the zero-padded day
        of ``time.localtime(self.timestamp)``.
        """
        self.target = md5_text(self.target)
        moment = time.localtime(self.timestamp)
        fields = {
            'y': compat_str(moment.tm_year),
            'm': '%02d' % moment.tm_mon,
            'd': '%02d' % moment.tm_mday,
        }
        self.target += ''.join(fields[letter] for letter in scheme)

    def split_time_even_odd(self):
        """Set the target to odd digit sum + digest + even digit sum."""
        even, odd = self.even_odd()
        self.target = odd + md5_text(self.target) + even

    def split_time_odd_even(self):
        """Set the target to even digit sum + digest + odd digit sum."""
        even, odd = self.even_odd()
        self.target = even + md5_text(self.target) + odd

    def split_ip_time_sum(self):
        """Set the target to IP sum + digest + timestamp digit sum."""
        chunks, octets = self.preprocess(32)
        ip_total = compat_str(sum(octets))
        self.target = ip_total + chunks[0] + self.digit_sum(self.timestamp)

    def split_time_ip_sum(self):
        """Set the target to timestamp digit sum + digest + IP sum."""
        chunks, octets = self.preprocess(32)
        ip_total = compat_str(sum(octets))
        self.target = self.digit_sum(self.timestamp) + chunks[0] + ip_total
127
128
class IqiyiSDKInterpreter(object):
    """Replay the transformation chain described by a packed SDK script.

    The script is unpacked with ``decode_packed_codes`` and scanned for
    ``input=<name>(input`` calls; each name is mapped onto the matching
    ``IqiyiSDK`` method and applied in order.
    """

    def __init__(self, sdk_code):
        self.sdk_code = sdk_code

    def run(self, target, ip, timestamp):
        """Apply the script's transformations and return the resulting string.

        ``target``, ``ip`` and ``timestamp`` are passed unchanged to
        ``IqiyiSDK``.  Raises ExtractorError when the script names a function
        that has no known counterpart.
        """
        # Decode into a local so the packed script kept on the instance is not
        # clobbered; otherwise a second run() would try to unpack plain code.
        decoded_code = decode_packed_codes(self.sdk_code)

        functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', decoded_code)

        sdk = IqiyiSDK(target, ip, timestamp)

        other_functions = {
            'handleSum': sdk.handleSum,
            'handleInput8': sdk.handle_input8,
            'handleInput16': sdk.handle_input16,
            'splitTimeEvenOdd': sdk.split_time_even_odd,
            'splitTimeOddEven': sdk.split_time_odd_even,
            'splitIpTimeSum': sdk.split_ip_time_sum,
            'splitTimeIpSum': sdk.split_time_ip_sum,
        }
        for function in functions:
            # The patterns are anchored at the end so that a malformed name
            # such as "mod10x" is reported as unknown instead of failing
            # later inside int() or the date field lookup.
            if re.match(r'mod\d+$', function):
                sdk.mod(int(function[3:]))
            elif re.match(r'date[ymd]{3}$', function):
                sdk.date(function[4:])
            elif re.match(r'split\d+$', function):
                sdk.split(int(function[5:]))
            elif function in other_functions:
                other_functions[function]()
            else:
                raise ExtractorError('Unknown function %s' % function)

        return sdk.target
162
163
605ec701
P
class IqiyiIE(InfoExtractor):
    """Extractor for iqiyi.com video pages and album (playlist) pages."""

    IE_NAME = 'iqiyi'
    IE_DESC = '爱奇艺'

    _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html'

    _NETRC_MACHINE = 'iqiyi'

    _TESTS = [{
        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
        'md5': '2cb594dc2781e6c941a110d8f358118b',
        'info_dict': {
            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
            'title': '美国德州空中惊现奇异云团 酷似UFO',
            'ext': 'f4v',
        }
    }, {
        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
        'info_dict': {
            'id': 'e3f585b550a280af23c98b6cb2be19fb',
            'title': '名侦探柯南第752集',
        },
        'playlist': [{
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }],
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',
        'only_matching': True,
    }, {
        'url': 'http://www.iqiyi.com/a_19rrhbc6kt.html',
        'only_matching': True,
    }, {
        'url': 'http://yule.iqiyi.com/pcb.html',
        'only_matching': True,
    }, {
        # VIP-only video. The first 2 parts (6 minutes) are available without login
        # MD5 sums omitted as values are different on Travis CI and my machine
        'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
        'info_dict': {
            'id': 'f3cf468b39dddb30d676f89a91200dc1',
            'title': '泰坦尼克号',
        },
        'playlist': [{
            'info_dict': {
                'id': 'f3cf468b39dddb30d676f89a91200dc1_part1',
                'ext': 'f4v',
                'title': '泰坦尼克号',
            },
        }, {
            'info_dict': {
                'id': 'f3cf468b39dddb30d676f89a91200dc1_part2',
                'ext': 'f4v',
                'title': '泰坦尼克号',
            },
        }],
        'expected_warnings': ['Needs a VIP account for full video'],
    }, {
        'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
        'info_dict': {
            'id': '202918101',
            'title': '灌篮高手 国语版',
        },
        'playlist_count': 101,
    }]

    # (bid, format id) pairs; bids 6-9 intentionally have no entry here.
    _FORMATS_MAP = [
        ('1', 'h6'),
        ('2', 'h5'),
        ('3', 'h4'),
        ('4', 'h3'),
        ('5', 'h2'),
        ('10', 'h1'),
    ]

    def _real_initialize(self):
        self._login()

    @staticmethod
    def _rsa_fun(data):
        """Encrypt ``data`` with the site's RSA public key."""
        # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
        N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
        e = 65537

        return ohdave_rsa_encrypt(data, e, N)

    def _login(self):
        """Log in with the configured credentials, if any.

        Returns True when no credentials are configured or the login was
        accepted, and False (after reporting a warning) when the validation
        endpoint answers with a code other than ``A00000``.
        """
        (username, password) = self._get_login_info()

        # No authentication to be performed
        if not username:
            return True

        data = self._download_json(
            'http://kylin.iqiyi.com/get_token', None,
            note='Get token for logging', errnote='Unable to get token for logging')
        sdk = data['sdk']
        timestamp = int(time.time())
        target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % (
            username, self._rsa_fun(password.encode('utf-8')))

        # The signature is produced by replaying the server-supplied script.
        interp = IqiyiSDKInterpreter(sdk)
        sign = interp.run(target, data['ip'], timestamp)

        validation_params = {
            'target': target,
            'server': 'BEA3AA1908656AABCCFF76582C4C6660',
            'token': data['token'],
            'bird_src': 'f8d91d57af224da7893dd397d52d811a',
            'sign': sign,
            'bird_t': timestamp,
        }
        validation_result = self._download_json(
            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
            note='Validate credentials', errnote='Unable to validate credentials')

        MSG_MAP = {
            'P00107': 'please login via the web interface and enter the CAPTCHA code',
            'P00117': 'bad username or password',
        }

        code = validation_result['code']
        if code != 'A00000':
            msg = MSG_MAP.get(code)
            if not msg:
                msg = 'error %s' % code
                if validation_result.get('msg'):
                    msg += ': ' + validation_result['msg']
            self._downloader.report_warning('unable to log in: ' + msg)
            return False

        return True

    def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
        """Request playback authorisation for a VIP segment.

        Returns the decoded JSON answer, or False when the server answers
        with code ``Q00506`` (a VIP account is required).  The warning for
        that case is only emitted when ``do_report_warning`` is true.
        """
        auth_params = {
            # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as
            'version': '2.0',
            'platform': 'b6c13e26323c537d',
            'aid': tvid,
            'tvid': tvid,
            'uid': '',
            'deviceId': _uuid,
            'playType': 'main',  # XXX: always main?
            'filename': os.path.splitext(url_basename(api_video_url))[0],
        }

        # Forward every query parameter of the segment URL (first value only).
        qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query)
        for key, val in qd_items.items():
            auth_params[key] = val[0]

        auth_req = sanitized_Request(
            'http://api.vip.iqiyi.com/services/ckn.action',
            urlencode_postdata(auth_params))
        # iQiyi server throws HTTP 405 error without the following header
        auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
        auth_result = self._download_json(
            auth_req, video_id,
            note='Downloading video authentication JSON',
            errnote='Unable to download video authentication JSON')
        if auth_result['code'] == 'Q00506':  # requires a VIP account
            if do_report_warning:
                self.report_warning('Needs a VIP account for full video')
            return False

        return auth_result

    def construct_video_urls(self, data, video_id, _uuid, tvid):
        """Resolve the download URL of every segment of every known format.

        Returns a dict mapping a format id to a list of
        ``(video_url, filesize)`` tuples, one per segment.  A format whose VIP
        authentication fails keeps only the segments resolved before the
        failure.
        """
        def do_xor(x, y):
            a = y % 3
            if a == 1:
                return x ^ 121
            if a == 2:
                return x ^ 72
            return x ^ 103

        def get_encode_code(encoded):
            # ``encoded`` is a '-' separated list of hex numbers; each number
            # is XOR-ed with a key chosen by its distance from the end, and
            # the resulting characters are reversed.
            parts = encoded.split('-')
            count = len(parts)
            decoded = ''.join(
                chr(do_xor(int(part, 16), count - 1 - position))
                for position, part in enumerate(parts))
            return decoded[::-1]

        def get_path_key(x, format_id, segment_index):
            mg = ')(*&^flash@#$%a'
            tm = self._download_json(
                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
            )['t']
            t = str(int(math.floor(int(tm) / (600.0))))
            return md5_text(t + mg + x)

        video_urls_dict = {}
        need_vip_warning_report = True
        for format_item in data['vp']['tkl'][0]['vs']:
            if not 0 < int(format_item['bid']) <= 10:
                continue
            format_id = self.get_format(format_item['bid'])
            if format_id is None:
                # The bid is inside the accepted range but has no entry in
                # _FORMATS_MAP; keeping it would later break the lookup of
                # its preference.
                continue

            video_urls = []

            video_urls_info = format_item['fs']
            if not format_item['fs'][0]['l'].startswith('/'):
                t = get_encode_code(format_item['fs'][0]['l'])
                if t.endswith('mp4'):
                    video_urls_info = format_item['flvs']

            for segment_index, segment in enumerate(video_urls_info):
                vl = segment['l']
                if not vl.startswith('/'):
                    vl = get_encode_code(vl)
                is_vip_video = '/vip/' in vl
                filesize = segment['b']
                base_url = data['vp']['du'].split('/')
                if not is_vip_video:
                    key = get_path_key(
                        vl.split('/')[-1].split('.')[0], format_id, segment_index)
                    base_url.insert(-1, key)
                base_url = '/'.join(base_url)
                param = {
                    'su': _uuid,
                    'qyid': uuid.uuid4().hex,
                    'client': '',
                    'z': '',
                    'bt': '',
                    'ct': '',
                    'tn': str(int(time.time()))
                }
                api_video_url = base_url + vl
                if is_vip_video:
                    api_video_url = api_video_url.replace('.f4v', '.hml')
                    auth_result = self._authenticate_vip_video(
                        api_video_url, video_id, tvid, _uuid, need_vip_warning_report)
                    if auth_result is False:
                        # Warn only once, then give up on this format.
                        need_vip_warning_report = False
                        break
                    param.update({
                        't': auth_result['data']['t'],
                        # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
                        'cid': 'afbe8fd3d73448c9',
                        'vid': video_id,
                        'QY00001': auth_result['data']['u'],
                    })
                api_video_url += '?' if '?' not in api_video_url else '&'
                api_video_url += compat_urllib_parse.urlencode(param)
                js = self._download_json(
                    api_video_url, video_id,
                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
                video_url = js['l']
                video_urls.append(
                    (video_url, filesize))

            video_urls_dict[format_id] = video_urls
        return video_urls_dict

    def get_format(self, bid):
        """Return the format id mapped to ``bid`` in _FORMATS_MAP, or None."""
        wanted = str(bid)
        for _bid, _format_id in self._FORMATS_MAP:
            if _bid == wanted:
                return _format_id
        return None

    def get_bid(self, format_id):
        """Return the bid mapped to ``format_id`` in _FORMATS_MAP, or None."""
        for _bid, _format_id in self._FORMATS_MAP:
            if _format_id == format_id:
                return _bid
        return None

    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
        """Download and return the JSON answer of the ``vms`` endpoint."""
        tm = str(int(time.time()))
        tail = tm + tvid
        param = {
            'key': 'fvip',
            'src': md5_text('youtube-dl'),
            'tvId': tvid,
            'vid': video_id,
            'vinfo': 1,
            'tm': tm,
            'enc': md5_text(enc_key + tail),
            'qyid': _uuid,
            'tn': random.random(),
            'um': 0,
            'authkey': md5_text(md5_text('') + tail),
            'k_tag': 1,
        }

        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
            compat_urllib_parse.urlencode(param)
        raw_data = self._download_json(api_url, video_id)
        return raw_data

    def get_enc_key(self, video_id):
        """Return the hard-coded key used to build the ``enc`` parameter."""
        # TODO: automatic key extraction
        # last update at 2016-01-22 for Zombie::bite
        enc_key = '8ed797d224d043e7ac23d95b70227d32'
        return enc_key

    def _extract_playlist(self, webpage):
        """Return a playlist result for an album page, or None.

        None is returned when ``webpage`` contains no album picture links.
        Further pages of the album are fetched until one holds fewer than
        PAGE_SIZE items.
        """
        PAGE_SIZE = 50

        # Accept both schemes, like _VALID_URL, and stop at the closing quote
        # so that one match cannot swallow several links on the same line.
        links = re.findall(
            r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(https?://www\.iqiyi\.com/[^"]+\.html)"',
            webpage)
        if not links:
            return

        album_id = self._search_regex(
            r'albumId\s*:\s*(\d+),', webpage, 'album ID')
        album_title = self._search_regex(
            r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)

        entries = list(map(self.url_result, links))

        # Start from 2 because links in the first page are already on webpage
        for page_num in itertools.count(2):
            pagelist_page = self._download_webpage(
                'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
                album_id,
                note='Download playlist page %d' % page_num,
                errnote='Failed to download playlist page %d' % page_num)
            pagelist = self._parse_json(
                remove_start(pagelist_page, 'var tvInfoJs='), album_id)
            vlist = pagelist['data']['vlist']
            for item in vlist:
                entries.append(self.url_result(item['vurl']))
            if len(vlist) < PAGE_SIZE:
                break

        return self.playlist_result(entries, album_id, album_title)

    def _real_extract(self, url):
        """Extract a playlist, a single video or a multi-part video from ``url``.

        Raises ExtractorError when the ``vms`` endpoint reports a code other
        than ``A000000`` or when no segment could be resolved at all.
        """
        webpage = self._download_webpage(
            url, 'temp_id', note='download video page')

        # There's no simple way to determine whether an URL is a playlist or not
        # So detect it
        playlist_result = self._extract_playlist(webpage)
        if playlist_result:
            return playlist_result

        tvid = self._search_regex(
            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
        video_id = self._search_regex(
            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
        _uuid = uuid.uuid4().hex

        enc_key = self.get_enc_key(video_id)

        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)

        if raw_data['code'] != 'A000000':
            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])

        data = raw_data['data']

        title = data['vi']['vn']

        # generate video_urls_dict
        video_urls_dict = self.construct_video_urls(
            data, video_id, _uuid, tvid)

        # construct info: entry i collects segment i of every format
        entries = []
        for format_id, video_urls in video_urls_dict.items():
            preference = int(self.get_bid(format_id))
            for i, (video_url, filesize) in enumerate(video_urls):
                if len(entries) < i + 1:
                    entries.append({'formats': []})
                entries[i]['formats'].append({
                    'url': video_url,
                    'filesize': filesize,
                    'format_id': format_id,
                    'preference': preference,
                })

        if not entries:
            # Happens e.g. when every format stops at its first VIP segment;
            # fail with a clear message instead of an IndexError below.
            raise ExtractorError('No video segments could be extracted')

        for i, entry in enumerate(entries):
            self._sort_formats(entry['formats'])
            entry.update({
                'id': '%s_part%d' % (video_id, i + 1),
                'title': title,
            })

        if len(entries) > 1:
            info = {
                '_type': 'multi_video',
                'id': video_id,
                'title': title,
                'entries': entries,
            }
        else:
            info = entries[0]
            info['id'] = video_id
            info['title'] = title

        return info