[yt-dlp.git] / youtube_dl / extractor / iqiyi.py

# coding: utf-8
from __future__ import unicode_literals

import hashlib
import math
import random
import time
import uuid

from .common import InfoExtractor
from ..compat import compat_urllib_parse
from ..utils import ExtractorError


class IqiyiIE(InfoExtractor):
    """Information extractor for video pages on iqiyi.com.

    Extraction steps, as implemented below:

    1. Download the video page and read the ``tvid``, the video id and the
       SWF player URL from it.
    2. Query the ``cache.video.qiyi.com/vms`` API with a signed request to
       obtain the video metadata (title, formats and their segments).
    3. For every segment of every known format, request the final download
       URL from a path built out of ``data['vp']['du']`` and a time-based
       key.
    4. Return a single video, or a ``multi_video`` result when the video
       consists of several segments.
    """

    IE_NAME = 'iqiyi'
    IE_DESC = '爱奇艺'

    # "www." is optional and the dot of the host name is escaped so that it
    # matches a literal dot only.
    _VALID_URL = r'http://(?:www\.)?iqiyi\.com/v_.+?\.html'

    _TESTS = [{
        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
        'md5': '2cb594dc2781e6c941a110d8f358118b',
        'info_dict': {
            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
            'title': '美国德州空中惊现奇异云团 酷似UFO',
            'ext': 'f4v',
        }
    }, {
        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
        'info_dict': {
            'id': 'e3f585b550a280af23c98b6cb2be19fb',
            'title': '名侦探柯南第752集',
        },
        'playlist': [{
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
                'ext': 'f4v',
                'title': '名侦探柯南第752集',
            },
        }, {
            'info_dict': {
                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
                'ext': 'f4v',
                'title': '名侦探柯南第752집'.replace('집', '集'),
            },
        }],
        'params': {
            'skip_download': True,
        },
    }]

    # Pairs of (bid reported by the API, format_id exposed by the extractor).
    # The bid is also used as the format preference in _real_extract.
    _FORMATS_MAP = [
        ('1', 'h6'),
        ('2', 'h5'),
        ('3', 'h4'),
        ('4', 'h3'),
        ('5', 'h2'),
        ('10', 'h1'),
    ]

    @staticmethod
    def md5_text(text):
        """Return the hexadecimal MD5 digest of *text* encoded as UTF-8."""
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def construct_video_urls(self, data, video_id, _uuid):
        """Resolve the download URL of every segment of every known format.

        data     -- the 'data' object of the VMS API response; the keys read
                    here are data['vp']['tkl'][0]['vs'] and data['vp']['du']
        video_id -- id passed to the download helpers for progress messages
        _uuid    -- session identifier sent as the 'su' query parameter

        Returns a dict mapping each format_id to a list of
        (video_url, filesize) tuples, in segment order.  Formats whose bid
        is outside 1..10, is not listed in _FORMATS_MAP, or that carry no
        segments are left out.
        """
        def do_xor(x, y):
            # The XOR mask is selected by y modulo 3:
            # 0 -> 103, 1 -> 121, 2 -> 72.
            return x ^ (103, 121, 72)[y % 3]

        def get_encode_code(encoded):
            # 'encoded' is a '-'-separated list of hexadecimal numbers.  Each
            # number is XOR-ed with a mask that depends on its distance from
            # the end of the list, and the resulting characters are returned
            # in reverse order.
            pieces = encoded.split('-')
            last = len(pieces) - 1
            decoded = ''.join(
                chr(do_xor(int(piece, 16), last - pos))
                for pos, piece in enumerate(pieces))
            return decoded[::-1]

        def get_path_key(x, format_id, segment_index):
            mg = ')(*&^flash@#$%a'
            tm = self._download_json(
                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
            )['t']
            # The key only changes when the server-reported 't' value moves
            # to another bucket of 600 (presumably seconds -- not verifiable
            # from this file).
            t = str(int(math.floor(int(tm) / (600.0))))
            return self.md5_text(t + mg + x)

        video_urls_dict = {}
        for format_item in data['vp']['tkl'][0]['vs']:
            bid = format_item['bid']
            if not 0 < int(bid) <= 10:
                continue
            format_id = self.get_format(bid)
            if format_id is None:
                # The bid is in range but has no entry in _FORMATS_MAP.
                # Keeping it would store the URLs under the key None and
                # make the preference lookup in _real_extract fail.
                continue

            segments = format_item['fs']
            if not segments:
                # Nothing to download for this format.
                continue

            # When the first (encoded) path decodes to an mp4 file, the
            # segment list under 'flvs' is used instead of 'fs'.
            first_path = segments[0]['l']
            if not first_path.startswith('/'):
                if get_encode_code(first_path).endswith('mp4'):
                    segments = format_item['flvs']

            video_urls = []
            for segment_index, segment in enumerate(segments):
                path = segment['l']
                if not path.startswith('/'):
                    # Paths that do not start with '/' are obfuscated.
                    path = get_encode_code(path)
                # The key is derived from the file name without extension.
                key = get_path_key(
                    path.split('/')[-1].split('.')[0], format_id, segment_index)
                filesize = segment['b']
                # Insert the key just before the last component of 'du'.
                url_parts = data['vp']['du'].split('/')
                url_parts.insert(-1, key)
                query = {
                    'su': _uuid,
                    'qyid': uuid.uuid4().hex,
                    'client': '',
                    'z': '',
                    'bt': '',
                    'ct': '',
                    'tn': str(int(time.time()))
                }
                api_video_url = '%s%s?%s' % (
                    '/'.join(url_parts), path,
                    compat_urllib_parse.urlencode(query))
                js = self._download_json(
                    api_video_url, video_id,
                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
                video_urls.append((js['l'], filesize))

            video_urls_dict[format_id] = video_urls
        return video_urls_dict

    def get_format(self, bid):
        """Return the format_id mapped to *bid* in _FORMATS_MAP, or None."""
        wanted = str(bid)
        return next(
            (_format_id for _bid, _format_id in self._FORMATS_MAP
             if _bid == wanted),
            None)

    def get_bid(self, format_id):
        """Return the bid mapped to *format_id* in _FORMATS_MAP, or None."""
        return next(
            (_bid for _bid, _format_id in self._FORMATS_MAP
             if _format_id == format_id),
            None)

    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
        """Query the VMS API and return the decoded JSON response.

        tvid     -- value of the page's data-player-tvid attribute (a string)
        video_id -- value of the page's data-player-videoid attribute
        enc_key  -- key mixed into the 'enc' signature
        _uuid    -- session identifier sent as 'qyid'
        """
        tm = str(int(time.time()))
        # Both signatures below are MD5 digests ending in timestamp + tvid.
        tail = tm + tvid
        param = {
            'key': 'fvip',
            'src': self.md5_text('youtube-dl'),
            'tvId': tvid,
            'vid': video_id,
            'vinfo': 1,
            'tm': tm,
            'enc': self.md5_text(enc_key + tail),
            'qyid': _uuid,
            'tn': random.random(),
            'um': 0,
            'authkey': self.md5_text(self.md5_text('') + tail),
        }

        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
            compat_urllib_parse.urlencode(param)
        return self._download_json(api_url, video_id)

    def get_enc_key(self, swf_url, video_id):
        """Return the key used to sign the VMS API request.

        The key is hard-coded (stored reversed in the source); *swf_url* and
        *video_id* are currently unused.
        """
        # TODO: automatic key extraction
        # last update at 2015-12-18 for Zombie::bite
        enc_key = '8b6b683780897eb8d9a48a02ccc4817d'[::-1]
        return enc_key

    def _real_extract(self, url):
        """Extract the video at *url*.

        Returns the info dict of a single video, or a 'multi_video' result
        whose entries are the segments when there is more than one.

        Raises ExtractorError when the API reports an error code, when the
        API returns no 'tkl' data, or when no downloadable format is found.
        """
        webpage = self._download_webpage(
            url, 'temp_id', note='download video page')
        tvid = self._search_regex(
            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
        video_id = self._search_regex(
            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
        swf_url = self._search_regex(
            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
        _uuid = uuid.uuid4().hex

        enc_key = self.get_enc_key(swf_url, video_id)

        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)

        if raw_data['code'] != 'A000000':
            raise ExtractorError(
                'Unable to load data. Error code: %s' % raw_data['code'])

        if not raw_data['data']['vp']['tkl']:
            # A site restriction rather than an extractor bug.
            raise ExtractorError(
                'iQiyi VIP videos are not supported', expected=True)

        data = raw_data['data']

        title = data['vi']['vn']

        # generate video_urls_dict
        video_urls_dict = self.construct_video_urls(
            data, video_id, _uuid)

        # construct info: entry i collects segment i of every format
        entries = []
        for format_id, video_urls in video_urls_dict.items():
            preference = int(self.get_bid(format_id))
            for i, (video_url, filesize) in enumerate(video_urls):
                if len(entries) < i + 1:
                    entries.append({'formats': []})
                entries[i]['formats'].append({
                    'url': video_url,
                    'filesize': filesize,
                    'format_id': format_id,
                    'preference': preference,
                })

        if not entries:
            raise ExtractorError('No downloadable formats found')

        for i, entry in enumerate(entries):
            self._sort_formats(entry['formats'])
            entry.update({
                'id': '%s_part%d' % (video_id, i + 1),
                'title': title,
            })

        if len(entries) > 1:
            return {
                '_type': 'multi_video',
                'id': video_id,
                'title': title,
                'entries': entries,
            }

        # A single segment is returned as a plain video carrying the real id.
        info = entries[0]
        info['id'] = video_id
        info['title'] = title
        return info
Commit	Line	Data
605ec701	1	# coding: utf-8
605ec701 P	2	from __future__ import unicode_literals
605ec701 P	3
958d0b65 YCH	4	import hashlib
	5	import math
	6	import random
605ec701	7	import time
605ec701	8	import uuid
958d0b65 YCH	9
	10	from .common import InfoExtractor
	11	from ..compat import compat_urllib_parse
761ee0d8	12	from ..utils import ExtractorError
605ec701	13
f1da8610	14
605ec701 P	15	class IqiyiIE(InfoExtractor):
605ec701 P	16	IE_NAME = 'iqiyi'
44c514eb	17	IE_DESC = '爱奇艺'
605ec701	18
865ab62f	19	_VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html'
605ec701	20
99481135	21	_TESTS = [{
f1da8610 YCH	22	'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
	23	'md5': '2cb594dc2781e6c941a110d8f358118b',
	24	'info_dict': {
	25	'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
	26	'title': '美国德州空中惊现奇异云团酷似UFO',
	27	'ext': 'f4v',
	28	}
99481135 YCH	29	}, {
	30	'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
	31	'info_dict': {
	32	'id': 'e3f585b550a280af23c98b6cb2be19fb',
	33	'title': '名侦探柯南第752集',
	34	},
	35	'playlist': [{
99481135 YCH	36	'info_dict': {
	37	'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
	38	'ext': 'f4v',
	39	'title': '名侦探柯南第752集',
	40	},
	41	}, {
99481135 YCH	42	'info_dict': {
	43	'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
	44	'ext': 'f4v',
	45	'title': '名侦探柯南第752集',
	46	},
	47	}, {
99481135 YCH	48	'info_dict': {
	49	'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
	50	'ext': 'f4v',
	51	'title': '名侦探柯南第752集',
	52	},
	53	}, {
99481135 YCH	54	'info_dict': {
	55	'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
	56	'ext': 'f4v',
	57	'title': '名侦探柯南第752集',
	58	},
	59	}, {
99481135 YCH	60	'info_dict': {
	61	'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
	62	'ext': 'f4v',
	63	'title': '名侦探柯南第752集',
	64	},
	65	}, {
99481135 YCH	66	'info_dict': {
	67	'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
	68	'ext': 'f4v',
	69	'title': '名侦探柯南第752集',
	70	},
	71	}, {
99481135 YCH	72	'info_dict': {
	73	'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
	74	'ext': 'f4v',
	75	'title': '名侦探柯南第752集',
	76	},
	77	}, {
99481135 YCH	78	'info_dict': {
	79	'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
	80	'ext': 'f4v',
	81	'title': '名侦探柯南第752集',
	82	},
	83	}],
c2d1be89 YCH	84	'params': {
	85	'skip_download': True,
	86	},
99481135	87	}]
605ec701	88
08bb8ef2 YCH	89	_FORMATS_MAP = [
	90	('1', 'h6'),
	91	('2', 'h5'),
	92	('3', 'h4'),
	93	('4', 'h3'),
	94	('5', 'h2'),
	95	('10', 'h1'),
	96	]
	97
57565375 YCH	98	@staticmethod
	99	def md5_text(text):
	100	return hashlib.md5(text.encode('utf-8')).hexdigest()
	101
7012620e	102	def construct_video_urls(self, data, video_id, _uuid):
605ec701 P	103	def do_xor(x, y):
	104	a = y % 3
	105	if a == 1:
	106	return x ^ 121
	107	if a == 2:
	108	return x ^ 72
	109	return x ^ 103
	110
	111	def get_encode_code(l):
	112	a = 0
	113	b = l.split('-')
	114	c = len(b)
	115	s = ''
	116	for i in range(c - 1, -1, -1):
f1da8610	117	a = do_xor(int(b[c - i - 1], 16), i)
605ec701 P	118	s += chr(a)
	119	return s[::-1]
	120
ffba4edb	121	def get_path_key(x, format_id, segment_index):
605ec701 P	122	mg = ')(*&^flash@#$%a'
605ec701 P	123	tm = self._download_json(
ffba4edb YCH	124	'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
	125	note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
	126	)['t']
f1da8610	127	t = str(int(math.floor(int(tm) / (600.0))))
19f93d90	128	return self.md5_text(t + mg + x)
605ec701 P	129
605ec701 P	130	video_urls_dict = {}
ffba4edb YCH	131	for format_item in data['vp']['tkl'][0]['vs']:
	132	if 0 < int(format_item['bid']) <= 10:
	133	format_id = self.get_format(format_item['bid'])
670861bd P	134	else:
	135	continue
	136
	137	video_urls = []
605ec701	138
ffba4edb YCH	139	video_urls_info = format_item['fs']
	140	if not format_item['fs'][0]['l'].startswith('/'):
	141	t = get_encode_code(format_item['fs'][0]['l'])
605ec701	142	if t.endswith('mp4'):
ffba4edb	143	video_urls_info = format_item['flvs']
605ec701	144
ffba4edb YCH	145	for segment_index, segment in enumerate(video_urls_info):
ffba4edb YCH	146	vl = segment['l']
605ec701 P	147	if not vl.startswith('/'):
	148	vl = get_encode_code(vl)
	149	key = get_path_key(
ffba4edb YCH	150	vl.split('/')[-1].split('.')[0], format_id, segment_index)
ffba4edb YCH	151	filesize = segment['b']
605ec701 P	152	base_url = data['vp']['du'].split('/')
	153	base_url.insert(-1, key)
	154	base_url = '/'.join(base_url)
	155	param = {
	156	'su': _uuid,
	157	'qyid': uuid.uuid4().hex,
	158	'client': '',
	159	'z': '',
	160	'bt': '',
	161	'ct': '',
	162	'tn': str(int(time.time()))
	163	}
	164	api_video_url = base_url + vl + '?' + \
	165	compat_urllib_parse.urlencode(param)
ffba4edb YCH	166	js = self._download_json(
	167	api_video_url, video_id,
	168	note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
605ec701 P	169	video_url = js['l']
	170	video_urls.append(
	171	(video_url, filesize))
	172
	173	video_urls_dict[format_id] = video_urls
	174	return video_urls_dict
	175
	176	def get_format(self, bid):
08bb8ef2 YCH	177	matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
08bb8ef2 YCH	178	return matched_format_ids[0] if len(matched_format_ids) else None
670861bd P	179
670861bd P	180	def get_bid(self, format_id):
08bb8ef2 YCH	181	matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
08bb8ef2 YCH	182	return matched_bids[0] if len(matched_bids) else None
605ec701 P	183
	184	def get_raw_data(self, tvid, video_id, enc_key, _uuid):
	185	tm = str(int(time.time()))
57565375	186	tail = tm + tvid
605ec701 P	187	param = {
605ec701 P	188	'key': 'fvip',
19f93d90	189	'src': self.md5_text('youtube-dl'),
605ec701 P	190	'tvId': tvid,
	191	'vid': video_id,
	192	'vinfo': 1,
	193	'tm': tm,
6a959f2e	194	'enc': self.md5_text(enc_key + tail),
605ec701 P	195	'qyid': _uuid,
	196	'tn': random.random(),
	197	'um': 0,
57565375	198	'authkey': self.md5_text(self.md5_text('') + tail),
605ec701 P	199	}
	200
	201	api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
	202	compat_urllib_parse.urlencode(param)
	203	raw_data = self._download_json(api_url, video_id)
	204	return raw_data
	205
	206	def get_enc_key(self, swf_url, video_id):
57565375	207	# TODO: automatic key extraction
10171468 YCH	208	# last update at 2015-12-18 for Zombie::bite
10171468 YCH	209	enc_key = '8b6b683780897eb8d9a48a02ccc4817d'[::-1]
605ec701 P	210	return enc_key
	211
	212	def _real_extract(self, url):
	213	webpage = self._download_webpage(
	214	url, 'temp_id', note='download video page')
	215	tvid = self._search_regex(
29e7e078	216	r'data-player-tvid\s=\s[\'"](\d+)', webpage, 'tvid')
605ec701	217	video_id = self._search_regex(
29e7e078	218	r'data-player-videoid\s=\s[\'"]([a-f\d]+)', webpage, 'video_id')
605ec701	219	swf_url = self._search_regex(
9c5f685e	220	r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
605ec701 P	221	_uuid = uuid.uuid4().hex
	222
	223	enc_key = self.get_enc_key(swf_url, video_id)
	224
	225	raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
aacda28b YCH	226
	227	if raw_data['code'] != 'A000000':
	228	raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
	229
605ec701 P	230	if not raw_data['data']['vp']['tkl']:
	231	raise ExtractorError('No support iQiqy VIP video')
	232
	233	data = raw_data['data']
	234
	235	title = data['vi']['vn']
	236
	237	# generate video_urls_dict
670861bd	238	video_urls_dict = self.construct_video_urls(
7012620e	239	data, video_id, _uuid)
605ec701 P	240
	241	# construct info
	242	entries = []
	243	for format_id in video_urls_dict:
	244	video_urls = video_urls_dict[format_id]
	245	for i, video_url_info in enumerate(video_urls):
f1da8610	246	if len(entries) < i + 1:
605ec701 P	247	entries.append({'formats': []})
	248	entries[i]['formats'].append(
	249	{
	250	'url': video_url_info[0],
	251	'filesize': video_url_info[-1],
	252	'format_id': format_id,
670861bd	253	'preference': int(self.get_bid(format_id))
605ec701 P	254	}
	255	)
	256
	257	for i in range(len(entries)):
670861bd	258	self._sort_formats(entries[i]['formats'])
605ec701 P	259	entries[i].update(
605ec701 P	260	{
c4ee8702	261	'id': '%s_part%d' % (video_id, i + 1),
605ec701 P	262	'title': title,
	263	}
	264	)
	265
	266	if len(entries) > 1:
	267	info = {
	268	'_type': 'multi_video',
	269	'id': video_id,
	270	'title': title,
	271	'entries': entries,
	272	}
	273	else:
	274	info = entries[0]
	275	info['id'] = video_id
	276	info['title'] = title
	277
	278	return info