jfr.im git - yt-dlp.git/blame_incremental - youtube

... / ...

Commit	Line	Data
	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	import collections
	5	import re
	6	import sys
	7
	8	from .common import InfoExtractor
	9	from ..compat import (
	10	compat_str,
	11	compat_urlparse,
	12	)
	13	from ..utils import (
	14	clean_html,
	15	ExtractorError,
	16	get_element_by_class,
	17	int_or_none,
	18	orderedSet,
	19	remove_start,
	20	str_to_int,
	21	unescapeHTML,
	22	unified_timestamp,
	23	urlencode_postdata,
	24	)
	25	from .dailymotion import DailymotionIE
	26	from .pladform import PladformIE
	27	from .vimeo import VimeoIE
	28	from .youtube import YoutubeIE
	29
	30
	31	class VKBaseIE(InfoExtractor):
	32	_NETRC_MACHINE = 'vk'
	33
	34	def _login(self):
	35	(username, password) = self._get_login_info()
	36	if username is None:
	37	return
	38
	39	login_page, url_handle = self._download_webpage_handle(
	40	'https://vk.com', None, 'Downloading login page')
	41
	42	login_form = self._hidden_inputs(login_page)
	43
	44	login_form.update({
	45	'email': username.encode('cp1251'),
	46	'pass': password.encode('cp1251'),
	47	})
	48
	49	# https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header
	50	# and expects the first one to be set rather than second (see
	51	# https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
	52	# As of RFC6265 the newer one cookie should be set into cookie store
	53	# what actually happens.
	54	# We will workaround this VK issue by resetting the remixlhk cookie to
	55	# the first one manually.
	56	for header, cookies in url_handle.headers.items():
	57	if header.lower() != 'set-cookie':
	58	continue
	59	if sys.version_info[0] >= 3:
	60	cookies = cookies.encode('iso-8859-1')
	61	cookies = cookies.decode('utf-8')
	62	remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]\|$)', cookies)
	63	if remixlhk:
	64	value, domain = remixlhk.groups()
	65	self._set_cookie(domain, 'remixlhk', value)
	66	break
	67
	68	login_page = self._download_webpage(
	69	'https://login.vk.com/?act=login', None,
	70	note='Logging in',
	71	data=urlencode_postdata(login_form))
	72
	73	if re.search(r'onLoginFailed', login_page):
	74	raise ExtractorError(
	75	'Unable to login, incorrect username and/or password', expected=True)
	76
	77	def _real_initialize(self):
	78	self._login()
	79
	80
	81	class VKIE(VKBaseIE):
	82	IE_NAME = 'vk'
	83	IE_DESC = 'VK'
	84	_VALID_URL = r'''(?x)
	85	https?://
	86	(?:
	87	(?:
	88	(?:(?:m\|new)\.)?vk\.com/video_\|
	89	(?:www\.)?daxab.com/
	90	)
	91	ext\.php\?(?P<embed_query>.?\boid=(?P<oid>-?\d+).?\bid=(?P<id>\d+).*)\|
	92	(?:
	93	(?:(?:m\|new)\.)?vk\.com/(?:.+?\?.*?z=)?video\|
	94	(?:www\.)?daxab.com/embed/
	95	)
	96	(?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
	97	)
	98	'''
	99	_TESTS = [
	100	{
	101	'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
	102	'md5': '0deae91935c54e00003c2a00646315f0',
	103	'info_dict': {
	104	'id': '162222515',
	105	'ext': 'flv',
	106	'title': 'ProtivoGunz - Хуёвая песня',
	107	'uploader': 're:(?:Noize MC\|Alexander Ilyashenko).*',
	108	'duration': 195,
	109	'timestamp': 1329060660,
	110	'upload_date': '20120212',
	111	'view_count': int,
	112	},
	113	},
	114	{
	115	'url': 'http://vk.com/video205387401_165548505',
	116	'md5': '6c0aeb2e90396ba97035b9cbde548700',
	117	'info_dict': {
	118	'id': '165548505',
	119	'ext': 'mp4',
	120	'uploader': 'Tom Cruise',
	121	'title': 'No name',
	122	'duration': 9,
	123	'timestamp': 1374374880,
	124	'upload_date': '20130721',
	125	'view_count': int,
	126	}
	127	},
	128	{
	129	'note': 'Embedded video',
	130	'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
	131	'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
	132	'info_dict': {
	133	'id': '162925554',
	134	'ext': 'mp4',
	135	'uploader': 'Vladimir Gavrin',
	136	'title': 'Lin Dan',
	137	'duration': 101,
	138	'upload_date': '20120730',
	139	'view_count': int,
	140	},
	141	'skip': 'This video has been removed from public access.',
	142	},
	143	{
	144	# VIDEO NOW REMOVED
	145	# please update if you find a video whose URL follows the same pattern
	146	'url': 'http://vk.com/video-8871596_164049491',
	147	'md5': 'a590bcaf3d543576c9bd162812387666',
	148	'note': 'Only available for registered users',
	149	'info_dict': {
	150	'id': '164049491',
	151	'ext': 'mp4',
	152	'uploader': 'Триллеры',
	153	'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
	154	'duration': 8352,
	155	'upload_date': '20121218',
	156	'view_count': int,
	157	},
	158	'skip': 'Requires vk account credentials',
	159	},
	160	{
	161	'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
	162	'md5': '4d7a5ef8cf114dfa09577e57b2993202',
	163	'info_dict': {
	164	'id': '168067957',
	165	'ext': 'mp4',
	166	'uploader': 'Киномания - лучшее из мира кино',
	167	'title': ' ',
	168	'duration': 7291,
	169	'upload_date': '20140328',
	170	},
	171	'skip': 'Requires vk account credentials',
	172	},
	173	{
	174	'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540',
	175	'md5': '0c45586baa71b7cb1d0784ee3f4e00a6',
	176	'note': 'ivi.ru embed',
	177	'info_dict': {
	178	'id': '60690',
	179	'ext': 'mp4',
	180	'title': 'Книга Илая',
	181	'duration': 6771,
	182	'upload_date': '20140626',
	183	'view_count': int,
	184	},
	185	'skip': 'Only works from Russia',
	186	},
	187	{
	188	# video (removed?) only available with list id
	189	'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
	190	'md5': '091287af5402239a1051c37ec7b92913',
	191	'info_dict': {
	192	'id': '171201961',
	193	'ext': 'mp4',
	194	'title': 'ТюменцевВВ_09.07.2015',
	195	'uploader': 'Anton Ivanov',
	196	'duration': 109,
	197	'upload_date': '20150709',
	198	'view_count': int,
	199	},
	200	'skip': 'Removed',
	201	},
	202	{
	203	# youtube embed
	204	'url': 'https://vk.com/video276849682_170681728',
	205	'info_dict': {
	206	'id': 'V3K4mi0SYkc',
	207	'ext': 'webm',
	208	'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
	209	'description': 'md5:d9903938abdc74c738af77f527ca0596',
	210	'duration': 178,
	211	'upload_date': '20130116',
	212	'uploader': "Children's Joy Foundation",
	213	'uploader_id': 'thecjf',
	214	'view_count': int,
	215	},
	216	},
	217	{
	218	# dailymotion embed
	219	'url': 'https://vk.com/video-37468416_456239855',
	220	'info_dict': {
	221	'id': 'k3lz2cmXyRuJQSjGHUv',
	222	'ext': 'mp4',
	223	'title': 'md5:d52606645c20b0ddbb21655adaa4f56f',
	224	'description': 'md5:c651358f03c56f1150b555c26d90a0fd',
	225	'uploader': 'AniLibria.Tv',
	226	'upload_date': '20160914',
	227	'uploader_id': 'x1p5vl5',
	228	'timestamp': 1473877246,
	229	},
	230	'params': {
	231	'skip_download': True,
	232	},
	233	},
	234	{
	235	# video key is extra_data not url\d+
	236	'url': 'http://vk.com/video-110305615_171782105',
	237	'md5': 'e13fcda136f99764872e739d13fac1d1',
	238	'info_dict': {
	239	'id': '171782105',
	240	'ext': 'mp4',
	241	'title': 'S-Dance, репетиции к The way show',
	242	'uploader': 'THE WAY SHOW \| 17 апреля',
	243	'timestamp': 1454870100,
	244	'upload_date': '20160207',
	245	'view_count': int,
	246	},
	247	},
	248	{
	249	# finished live stream, postlive_mp4
	250	'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2',
	251	'md5': '90d22d051fccbbe9becfccc615be6791',
	252	'info_dict': {
	253	'id': '456242764',
	254	'ext': 'mp4',
	255	'title': 'ИгроМир 2016 — день 1',
	256	'uploader': 'Игромания',
	257	'duration': 5239,
	258	'view_count': int,
	259	},
	260	},
	261	{
	262	# live stream, hls and rtmp links, most likely already finished live
	263	# stream by the time you are reading this comment
	264	'url': 'https://vk.com/video-140332_456239111',
	265	'only_matching': True,
	266	},
	267	{
	268	# removed video, just testing that we match the pattern
	269	'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
	270	'only_matching': True,
	271	},
	272	{
	273	# age restricted video, requires vk account credentials
	274	'url': 'https://vk.com/video205387401_164765225',
	275	'only_matching': True,
	276	},
	277	{
	278	# pladform embed
	279	'url': 'https://vk.com/video-76116461_171554880',
	280	'only_matching': True,
	281	},
	282	{
	283	'url': 'http://new.vk.com/video205387401_165548505',
	284	'only_matching': True,
	285	},
	286	{
	287	# This video is no longer available, because its author has been blocked.
	288	'url': 'https://vk.com/video-10639516_456240611',
	289	'only_matching': True,
	290	}
	291	]
	292
	293	def _real_extract(self, url):
	294	mobj = re.match(self._VALID_URL, url)
	295	video_id = mobj.group('videoid')
	296
	297	if video_id:
	298	info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
	299	# Some videos (removed?) can only be downloaded with list id specified
	300	list_id = mobj.group('list_id')
	301	if list_id:
	302	info_url += '&list=%s' % list_id
	303	else:
	304	info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
	305	video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
	306
	307	info_page = self._download_webpage(info_url, video_id)
	308
	309	error_message = self._html_search_regex(
	310	[r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
	311	r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
	312	info_page, 'error message', default=None)
	313	if error_message:
	314	raise ExtractorError(error_message, expected=True)
	315
	316	if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
	317	raise ExtractorError(
	318	'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
	319	expected=True)
	320
	321	ERRORS = {
	322	r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
	323	'Video %s has been removed from public access due to rightholder complaint.',
	324
	325	r'<!>Please log in or <':
	326	'Video %s is only available for registered users, '
	327	'use --username and --password options to provide account credentials.',
	328
	329	r'<!>Unknown error':
	330	'Video %s does not exist.',
	331
	332	r'<!>Видео временно недоступно':
	333	'Video %s is temporarily unavailable.',
	334
	335	r'<!>Access denied':
	336	'Access denied to video %s.',
	337
	338	r'<!>Видеозапись недоступна, так как её автор был заблокирован.':
	339	'Video %s is no longer available, because its author has been blocked.',
	340
	341	r'<!>This video is no longer available, because its author has been blocked.':
	342	'Video %s is no longer available, because its author has been blocked.',
	343	}
	344
	345	for error_re, error_msg in ERRORS.items():
	346	if re.search(error_re, info_page):
	347	raise ExtractorError(error_msg % video_id, expected=True)
	348
	349	youtube_url = YoutubeIE._extract_url(info_page)
	350	if youtube_url:
	351	return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
	352
	353	vimeo_url = VimeoIE._extract_url(url, info_page)
	354	if vimeo_url is not None:
	355	return self.url_result(vimeo_url)
	356
	357	pladform_url = PladformIE._extract_url(info_page)
	358	if pladform_url:
	359	return self.url_result(pladform_url)
	360
	361	m_rutube = re.search(
	362	r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video\|play)\\?/embed(?:.*?))\\?"', info_page)
	363	if m_rutube is not None:
	364	rutube_url = self._proto_relative_url(
	365	m_rutube.group(1).replace('\\', ''))
	366	return self.url_result(rutube_url)
	367
	368	dailymotion_urls = DailymotionIE._extract_urls(info_page)
	369	if dailymotion_urls:
	370	return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key())
	371
	372	m_opts = re.search(r'(?s)var\s+opts\s=\s({.+?});', info_page)
	373	if m_opts:
	374	m_opts_url = re.search(r"url\s:\s'((?!/\b)[^']+)", m_opts.group(1))
	375	if m_opts_url:
	376	opts_url = m_opts_url.group(1)
	377	if opts_url.startswith('//'):
	378	opts_url = 'http:' + opts_url
	379	return self.url_result(opts_url)
	380
	381	# vars does not look to be served anymore since 24.10.2016
	382	data = self._parse_json(
	383	self._search_regex(
	384	r'var\s+vars\s=\s({.+?});', info_page, 'vars', default='{}'),
	385	video_id, fatal=False)
	386
	387	# <!json> is served instead
	388	if not data:
	389	data = self._parse_json(
	390	self._search_regex(
	391	r'<!json>\s({.+?})\s<!>', info_page, 'json', default='{}'),
	392	video_id)
	393	if data:
	394	data = data['player']['params'][0]
	395
	396	if not data:
	397	data = self._parse_json(
	398	self._search_regex(
	399	r'var\s+playerParams\s=\s({.+?})\s;\s\n', info_page,
	400	'player params'),
	401	video_id)['params'][0]
	402
	403	title = unescapeHTML(data['md_title'])
	404
	405	# 2 = live
	406	# 3 = post live (finished live)
	407	is_live = data.get('live') == 2
	408	if is_live:
	409	title = self._live_title(title)
	410
	411	timestamp = unified_timestamp(self._html_search_regex(
	412	r'class=["\']mv_info_date[^>]+>([^<]+)(?:<\|from)', info_page,
	413	'upload date', fatal=False))
	414
	415	view_count = str_to_int(self._search_regex(
	416	r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)',
	417	info_page, 'view count', default=None))
	418
	419	formats = []
	420	for format_id, format_url in data.items():
	421	if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')):
	422	continue
	423	if (format_id.startswith(('url', 'cache')) or
	424	format_id in ('extra_data', 'live_mp4', 'postlive_mp4')):
	425	height = int_or_none(self._search_regex(
	426	r'^(?:url\|cache)(\d+)', format_id, 'height', default=None))
	427	formats.append({
	428	'format_id': format_id,
	429	'url': format_url,
	430	'height': height,
	431	})
	432	elif format_id == 'hls':
	433	formats.extend(self._extract_m3u8_formats(
	434	format_url, video_id, 'mp4', 'm3u8_native',
	435	m3u8_id=format_id, fatal=False, live=is_live))
	436	elif format_id == 'rtmp':
	437	formats.append({
	438	'format_id': format_id,
	439	'url': format_url,
	440	'ext': 'flv',
	441	})
	442	self._sort_formats(formats)
	443
	444	return {
	445	'id': compat_str(data.get('vid') or video_id),
	446	'formats': formats,
	447	'title': title,
	448	'thumbnail': data.get('jpg'),
	449	'uploader': data.get('md_author'),
	450	'duration': data.get('duration'),
	451	'timestamp': timestamp,
	452	'view_count': view_count,
	453	'is_live': is_live,
	454	}
	455
	456
	457	class VKUserVideosIE(VKBaseIE):
	458	IE_NAME = 'vk:uservideos'
	459	IE_DESC = "VK - User's Videos"
	460	_VALID_URL = r'https?://(?:(?:m\|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]\|$)'
	461	_TEMPLATE_URL = 'https://vk.com/videos'
	462	_TESTS = [{
	463	'url': 'http://vk.com/videos205387401',
	464	'info_dict': {
	465	'id': '205387401',
	466	'title': "Tom Cruise's Videos",
	467	},
	468	'playlist_mincount': 4,
	469	}, {
	470	'url': 'http://vk.com/videos-77521',
	471	'only_matching': True,
	472	}, {
	473	'url': 'http://vk.com/videos-97664626?section=all',
	474	'only_matching': True,
	475	}, {
	476	'url': 'http://m.vk.com/videos205387401',
	477	'only_matching': True,
	478	}, {
	479	'url': 'http://new.vk.com/videos205387401',
	480	'only_matching': True,
	481	}]
	482
	483	def _real_extract(self, url):
	484	page_id = self._match_id(url)
	485
	486	webpage = self._download_webpage(url, page_id)
	487
	488	entries = [
	489	self.url_result(
	490	'http://vk.com/video' + video_id, 'VK', video_id=video_id)
	491	for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))]
	492
	493	title = unescapeHTML(self._search_regex(
	494	r'<title>\s*([^<]+?)\s+\\|\s+\d+\s+videos',
	495	webpage, 'title', default=page_id))
	496
	497	return self.playlist_result(entries, page_id, title)
	498
	499
	500	class VKWallPostIE(VKBaseIE):
	501	IE_NAME = 'vk:wallpost'
	502	_VALID_URL = r'https?://(?:(?:(?:(?:m\|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))'
	503	_TESTS = [{
	504	# public page URL, audio playlist
	505	'url': 'https://vk.com/bs.official?w=wall-23538238_35',
	506	'info_dict': {
	507	'id': '23538238_35',
	508	'title': 'Black Shadow - Wall post 23538238_35',
	509	'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
	510	},
	511	'playlist': [{
	512	'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
	513	'info_dict': {
	514	'id': '135220665_111806521',
	515	'ext': 'mp3',
	516	'title': 'Black Shadow - Слепое Верование',
	517	'duration': 370,
	518	'uploader': 'Black Shadow',
	519	'artist': 'Black Shadow',
	520	'track': 'Слепое Верование',
	521	},
	522	}, {
	523	'md5': '4cc7e804579122b17ea95af7834c9233',
	524	'info_dict': {
	525	'id': '135220665_111802303',
	526	'ext': 'mp3',
	527	'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
	528	'duration': 423,
	529	'uploader': 'Black Shadow',
	530	'artist': 'Black Shadow',
	531	'track': 'Война - Негасимое Бездны Пламя!',
	532	},
	533	'params': {
	534	'skip_download': True,
	535	},
	536	}],
	537	'params': {
	538	'usenetrc': True,
	539	},
	540	'skip': 'Requires vk account credentials',
	541	}, {
	542	# single YouTube embed, no leading -
	543	'url': 'https://vk.com/wall85155021_6319',
	544	'info_dict': {
	545	'id': '85155021_6319',
	546	'title': 'Sergey Gorbunov - Wall post 85155021_6319',
	547	},
	548	'playlist_count': 1,
	549	'params': {
	550	'usenetrc': True,
	551	},
	552	'skip': 'Requires vk account credentials',
	553	}, {
	554	# wall page URL
	555	'url': 'https://vk.com/wall-23538238_35',
	556	'only_matching': True,
	557	}, {
	558	# mobile wall page URL
	559	'url': 'https://m.vk.com/wall-23538238_35',
	560	'only_matching': True,
	561	}]
	562
	563	def _real_extract(self, url):
	564	post_id = self._match_id(url)
	565
	566	wall_url = 'https://vk.com/wall%s' % post_id
	567
	568	post_id = remove_start(post_id, '-')
	569
	570	webpage = self._download_webpage(wall_url, post_id)
	571
	572	error = self._html_search_regex(
	573	r'>Error</div>\s<div[^>]+class=["\']body["\'][^>]>([^<]+)',
	574	webpage, 'error', default=None)
	575	if error:
	576	raise ExtractorError('VK said: %s' % error, expected=True)
	577
	578	description = clean_html(get_element_by_class('wall_post_text', webpage))
	579	uploader = clean_html(get_element_by_class('author', webpage))
	580	thumbnail = self._og_search_thumbnail(webpage)
	581
	582	entries = []
	583
	584	audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage)
	585	if audio_ids:
	586	al_audio = self._download_webpage(
	587	'https://vk.com/al_audio.php', post_id,
	588	note='Downloading audio info', fatal=False,
	589	data=urlencode_postdata({
	590	'act': 'reload_audio',
	591	'al': '1',
	592	'ids': ','.join(audio_ids)
	593	}))
	594	if al_audio:
	595	Audio = collections.namedtuple(
	596	'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration'])
	597	audios = self._parse_json(
	598	self._search_regex(
	599	r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'),
	600	post_id, fatal=False, transform_source=unescapeHTML)
	601	if isinstance(audios, list):
	602	for audio in audios:
	603	a = Audio._make(audio[:6])
	604	entries.append({
	605	'id': '%s_%s' % (a.user_id, a.id),
	606	'url': a.url,
	607	'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id,
	608	'thumbnail': thumbnail,
	609	'duration': a.duration,
	610	'uploader': uploader,
	611	'artist': a.artist,
	612	'track': a.track,
	613	})
	614
	615	for video in re.finditer(
	616	r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):
	617	entries.append(self.url_result(
	618	compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key()))
	619
	620	title = 'Wall post %s' % post_id
	621
	622	return self.playlist_result(
	623	orderedSet(entries), post_id,
	624	'%s - %s' % (uploader, title) if uploader else title,
	625	description)