jfr.im git - yt-dlp.git/blame_incremental - youtube

... / ...

Commit	Line	Data
	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	import base64
	5	import datetime
	6	import hashlib
	7	import re
	8	import time
	9
	10	from .common import InfoExtractor
	11	from ..compat import (
	12	compat_ord,
	13	compat_str,
	14	compat_urllib_parse_urlencode,
	15	)
	16	from ..utils import (
	17	determine_ext,
	18	encode_data_uri,
	19	ExtractorError,
	20	int_or_none,
	21	orderedSet,
	22	parse_iso8601,
	23	str_or_none,
	24	url_basename,
	25	urshift,
	26	)
	27
	28
	29	class LeIE(InfoExtractor):
	30	IE_DESC = '乐视网'
	31	_VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay\|(?:sports\.le\|(?:www\.)?lesports)\.com/(?:match\|video))/(?P<id>\d+)\.html'
	32	_GEO_COUNTRIES = ['CN']
	33	_URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
	34
	35	_TESTS = [{
	36	'url': 'http://www.le.com/ptv/vplay/22005890.html',
	37	'md5': 'edadcfe5406976f42f9f266057ee5e40',
	38	'info_dict': {
	39	'id': '22005890',
	40	'ext': 'mp4',
	41	'title': '第87届奥斯卡颁奖礼完美落幕《鸟人》成最大赢家',
	42	'description': 'md5:a9cb175fd753e2962176b7beca21a47c',
	43	},
	44	'params': {
	45	'hls_prefer_native': True,
	46	},
	47	}, {
	48	'url': 'http://www.le.com/ptv/vplay/1415246.html',
	49	'info_dict': {
	50	'id': '1415246',
	51	'ext': 'mp4',
	52	'title': '美人天下01',
	53	'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
	54	},
	55	'params': {
	56	'hls_prefer_native': True,
	57	},
	58	}, {
	59	'note': 'This video is available only in Mainland China, thus a proxy is needed',
	60	'url': 'http://www.le.com/ptv/vplay/1118082.html',
	61	'md5': '2424c74948a62e5f31988438979c5ad1',
	62	'info_dict': {
	63	'id': '1118082',
	64	'ext': 'mp4',
	65	'title': '与龙共舞完整版',
	66	'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
	67	},
	68	'params': {
	69	'hls_prefer_native': True,
	70	},
	71	}, {
	72	'url': 'http://sports.le.com/video/25737697.html',
	73	'only_matching': True,
	74	}, {
	75	'url': 'http://www.lesports.com/match/1023203003.html',
	76	'only_matching': True,
	77	}, {
	78	'url': 'http://sports.le.com/match/1023203003.html',
	79	'only_matching': True,
	80	}]
	81
	82	# ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf
	83	def ror(self, param1, param2):
	84	_loc3_ = 0
	85	while _loc3_ < param2:
	86	param1 = urshift(param1, 1) + ((param1 & 1) << 31)
	87	_loc3_ += 1
	88	return param1
	89
	90	def calc_time_key(self, param1):
	91	_loc2_ = 185025305
	92	return self.ror(param1, _loc2_ % 17) ^ _loc2_
	93
	94	# see M3U8Encryption class in KLetvPlayer.swf
	95	@staticmethod
	96	def decrypt_m3u8(encrypted_data):
	97	if encrypted_data[:5].decode('utf-8').lower() != 'vc_01':
	98	return encrypted_data
	99	encrypted_data = encrypted_data[5:]
	100
	101	_loc4_ = bytearray(2 * len(encrypted_data))
	102	for idx, val in enumerate(encrypted_data):
	103	b = compat_ord(val)
	104	_loc4_[2 * idx] = b // 16
	105	_loc4_[2 * idx + 1] = b % 16
	106	idx = len(_loc4_) - 11
	107	_loc4_ = _loc4_[idx:] + _loc4_[:idx]
	108	_loc7_ = bytearray(len(encrypted_data))
	109	for i in range(len(encrypted_data)):
	110	_loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1]
	111
	112	return bytes(_loc7_)
	113
	114	def _check_errors(self, play_json):
	115	# Check for errors
	116	playstatus = play_json['msgs']['playstatus']
	117	if playstatus['status'] == 0:
	118	flag = playstatus['flag']
	119	if flag == 1:
	120	self.raise_geo_restricted()
	121	else:
	122	raise ExtractorError('Generic error. flag = %d' % flag, expected=True)
	123
	124	def _real_extract(self, url):
	125	media_id = self._match_id(url)
	126	page = self._download_webpage(url, media_id)
	127
	128	play_json_flash = self._download_json(
	129	'http://player-pc.le.com/mms/out/video/playJson',
	130	media_id, 'Downloading flash playJson data', query={
	131	'id': media_id,
	132	'platid': 1,
	133	'splatid': 101,
	134	'format': 1,
	135	'source': 1000,
	136	'tkey': self.calc_time_key(int(time.time())),
	137	'domain': 'www.le.com',
	138	'region': 'cn',
	139	},
	140	headers=self.geo_verification_headers())
	141	self._check_errors(play_json_flash)
	142
	143	def get_flash_urls(media_url, format_id):
	144	nodes_data = self._download_json(
	145	media_url, media_id,
	146	'Download JSON metadata for format %s' % format_id,
	147	query={
	148	'm3v': 1,
	149	'format': 1,
	150	'expect': 3,
	151	'tss': 'ios',
	152	})
	153
	154	req = self._request_webpage(
	155	nodes_data['nodelist'][0]['location'], media_id,
	156	note='Downloading m3u8 information for format %s' % format_id)
	157
	158	m3u8_data = self.decrypt_m3u8(req.read())
	159
	160	return {
	161	'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
	162	}
	163
	164	extracted_formats = []
	165	formats = []
	166	playurl = play_json_flash['msgs']['playurl']
	167	play_domain = playurl['domain'][0]
	168
	169	for format_id, format_data in playurl.get('dispatch', []).items():
	170	if format_id in extracted_formats:
	171	continue
	172	extracted_formats.append(format_id)
	173
	174	media_url = play_domain + format_data[0]
	175	for protocol, format_url in get_flash_urls(media_url, format_id).items():
	176	f = {
	177	'url': format_url,
	178	'ext': determine_ext(format_data[1]),
	179	'format_id': '%s-%s' % (protocol, format_id),
	180	'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
	181	'quality': int_or_none(format_id),
	182	}
	183
	184	if format_id[-1:] == 'p':
	185	f['height'] = int_or_none(format_id[:-1])
	186
	187	formats.append(f)
	188	self._sort_formats(formats, ('height', 'quality', 'format_id'))
	189
	190	publish_time = parse_iso8601(self._html_search_regex(
	191	r'发布时间 ([^<>]+) ', page, 'publish time', default=None),
	192	delimiter=' ', timezone=datetime.timedelta(hours=8))
	193	description = self._html_search_meta('description', page, fatal=False)
	194
	195	return {
	196	'id': media_id,
	197	'formats': formats,
	198	'title': playurl['title'],
	199	'thumbnail': playurl['pic'],
	200	'description': description,
	201	'timestamp': publish_time,
	202	}
	203
	204
	205	class LePlaylistIE(InfoExtractor):
	206	_VALID_URL = r'https?://[a-z]+\.le\.com/(?!video)[a-z]+/(?P<id>[a-z0-9_]+)'
	207
	208	_TESTS = [{
	209	'url': 'http://www.le.com/tv/46177.html',
	210	'info_dict': {
	211	'id': '46177',
	212	'title': '美人天下',
	213	'description': 'md5:395666ff41b44080396e59570dbac01c'
	214	},
	215	'playlist_count': 35
	216	}, {
	217	'url': 'http://tv.le.com/izt/wuzetian/index.html',
	218	'info_dict': {
	219	'id': 'wuzetian',
	220	'title': '武媚娘传奇',
	221	'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
	222	},
	223	# This playlist contains some extra videos other than the drama itself
	224	'playlist_mincount': 96
	225	}, {
	226	'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml',
	227	# This series is moved to http://www.le.com/tv/10005297.html
	228	'only_matching': True,
	229	}, {
	230	'url': 'http://www.le.com/comic/92063.html',
	231	'only_matching': True,
	232	}, {
	233	'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html',
	234	'only_matching': True,
	235	}]
	236
	237	@classmethod
	238	def suitable(cls, url):
	239	return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url)
	240
	241	def _real_extract(self, url):
	242	playlist_id = self._match_id(url)
	243	page = self._download_webpage(url, playlist_id)
	244
	245	# Currently old domain names are still used in playlists
	246	media_ids = orderedSet(re.findall(
	247	r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page))
	248	entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le')
	249	for media_id in media_ids]
	250
	251	title = self._html_search_meta('keywords', page,
	252	fatal=False).split('，')[0]
	253	description = self._html_search_meta('description', page, fatal=False)
	254
	255	return self.playlist_result(entries, playlist_id, playlist_title=title,
	256	playlist_description=description)
	257
	258
	259	class LetvCloudIE(InfoExtractor):
	260	# Most of .letv.com is changed to .le.com on 2016/01/02
	261	# but yuntv.letv.com is kept, so also keep the extractor name
	262	IE_DESC = '乐视云'
	263	_VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
	264
	265	_TESTS = [{
	266	'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf',
	267	'md5': '26450599afd64c513bc77030ad15db44',
	268	'info_dict': {
	269	'id': 'p7jnfw5hw9_467623dedf',
	270	'ext': 'mp4',
	271	'title': 'Video p7jnfw5hw9_467623dedf',
	272	},
	273	}, {
	274	'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360',
	275	'md5': 'e03d9cc8d9c13191e1caf277e42dbd31',
	276	'info_dict': {
	277	'id': 'p7jnfw5hw9_ec93197892',
	278	'ext': 'mp4',
	279	'title': 'Video p7jnfw5hw9_ec93197892',
	280	},
	281	}, {
	282	'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd',
	283	'md5': 'cb988699a776b22d4a41b9d43acfb3ac',
	284	'info_dict': {
	285	'id': 'p7jnfw5hw9_187060b6fd',
	286	'ext': 'mp4',
	287	'title': 'Video p7jnfw5hw9_187060b6fd',
	288	},
	289	}]
	290
	291	@staticmethod
	292	def sign_data(obj):
	293	if obj['cf'] == 'flash':
	294	salt = '2f9d6924b33a165a6d8b5d3d42f4f987'
	295	items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu']
	296	elif obj['cf'] == 'html5':
	297	salt = 'fbeh5player12c43eccf2bec3300344'
	298	items = ['cf', 'ran', 'uu', 'bver', 'vu']
	299	input_data = ''.join([item + obj[item] for item in items]) + salt
	300	obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest()
	301
	302	def _get_formats(self, cf, uu, vu, media_id):
	303	def get_play_json(cf, timestamp):
	304	data = {
	305	'cf': cf,
	306	'ver': '2.2',
	307	'bver': 'firefox44.0',
	308	'format': 'json',
	309	'uu': uu,
	310	'vu': vu,
	311	'ran': compat_str(timestamp),
	312	}
	313	self.sign_data(data)
	314	return self._download_json(
	315	'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse_urlencode(data),
	316	media_id, 'Downloading playJson data for type %s' % cf)
	317
	318	play_json = get_play_json(cf, time.time())
	319	# The server time may be different from local time
	320	if play_json.get('code') == 10071:
	321	play_json = get_play_json(cf, play_json['timestamp'])
	322
	323	if not play_json.get('data'):
	324	if play_json.get('message'):
	325	raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True)
	326	elif play_json.get('code'):
	327	raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True)
	328	else:
	329	raise ExtractorError('Letv cloud returned an unknwon error')
	330
	331	def b64decode(s):
	332	return base64.b64decode(s.encode('utf-8')).decode('utf-8')
	333
	334	formats = []
	335	for media in play_json['data']['video_info']['media'].values():
	336	play_url = media['play_url']
	337	url = b64decode(play_url['main_url'])
	338	decoded_url = b64decode(url_basename(url))
	339	formats.append({
	340	'url': url,
	341	'ext': determine_ext(decoded_url),
	342	'format_id': str_or_none(play_url.get('vtype')),
	343	'format_note': str_or_none(play_url.get('definition')),
	344	'width': int_or_none(play_url.get('vwidth')),
	345	'height': int_or_none(play_url.get('vheight')),
	346	})
	347
	348	return formats
	349
	350	def _real_extract(self, url):
	351	uu_mobj = re.search(r'uu=([\w]+)', url)
	352	vu_mobj = re.search(r'vu=([\w]+)', url)
	353
	354	if not uu_mobj or not vu_mobj:
	355	raise ExtractorError('Invalid URL: %s' % url, expected=True)
	356
	357	uu = uu_mobj.group(1)
	358	vu = vu_mobj.group(1)
	359	media_id = uu + '_' + vu
	360
	361	formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id)
	362	self._sort_formats(formats)
	363
	364	return {
	365	'id': media_id,
	366	'title': 'Video %s' % media_id,
	367	'formats': formats,
	368	}