]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
Merge branch 'naglis-izlesene'
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import errno
4import io
5import itertools
6import json
7import os.path
8import re
9import traceback
10
11from .common import InfoExtractor, SearchInfoExtractor
12from .subtitles import SubtitlesInfoExtractor
13from ..jsinterp import JSInterpreter
14from ..swfinterp import SWFInterpreter
15from ..utils import (
16 compat_chr,
17 compat_parse_qs,
18 compat_urllib_parse,
19 compat_urllib_request,
20 compat_urlparse,
21 compat_str,
22
23 clean_html,
24 get_cachedir,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 PagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 write_json_file,
34 uppercase_escape,
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Pin the site language to English so page scraping regexps match.

        Returns True if the request succeeded, False otherwise.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log into YouTube using the credentials from --username/--password
        or .netrc.

        Returns True on success, False on failure or when no credentials
        were supplied.  Raises ExtractorError when _LOGIN_REQUIRED is set
        but no login info is available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Consistency fix: every other failure path returns False
            # explicitly; a bare "return" here yielded None.
            return False

        # The GALX hidden field must be echoed back in the login POST.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-verification confirmation form.  Always returns True;
        a download failure raises from _download_webpage."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set language, log in and confirm age before any extraction.
        Each step short-circuits the rest on failure."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
128
129
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Accepts full watch URLs, embed/e/v paths, youtu.be short links, a
    # handful of proxy/mirror hostnames, protocol-relative URLs and bare
    # 11-character IDs.  Group 1 captures the optional URL prefix; group 2
    # captures the video ID (relied on by extract_id below).
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)? # handle anchor (#/) redirect urls
                         (?: # the various things that can precede the ID:
                             (?:(?:v|embed|e)/) # v/ or embed/ or e/
                             |(?: # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?) # the params delimiter ? or # or #!
                                 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/ # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
                     $"""
    # Pulls the real target out of redirect-style URLs (e.g. age
    # verification pages carrying a next_url query parameter).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format properties, merged into each extracted format
    # dict.  Negative 'preference' values rank special-purpose variants
    # (3D, HLS, DASH) below the plain muxed formats.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
234
    IE_NAME = u'youtube'
    # Integration tests consumed by the test harness; "md5:..." description
    # values are checksums of the full expected text.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            # Mixed-case hostname and protocol-relative URL on purpose.
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
319
320
321 @classmethod
322 def suitable(cls, url):
323 """Receives a URL and returns True if suitable for this IE."""
324 if YoutubePlaylistIE.suitable(url): return False
325 return re.match(cls._VALID_URL, url) is not None
326
327 def __init__(self, *args, **kwargs):
328 super(YoutubeIE, self).__init__(*args, **kwargs)
329 self._player_cache = {}
330
331 def report_video_info_webpage_download(self, video_id):
332 """Report attempt to download video info webpage."""
333 self.to_screen(u'%s: Downloading video info webpage' % video_id)
334
335 def report_information_extraction(self, video_id):
336 """Report attempt to extract video information."""
337 self.to_screen(u'%s: Extracting video information' % video_id)
338
339 def report_unavailable_format(self, video_id, format):
340 """Report extracted video URL."""
341 self.to_screen(u'%s: Format %s not available' % (video_id, format))
342
343 def report_rtmp_download(self):
344 """Indicate the download will use the RTMP protocol."""
345 self.to_screen(u'RTMP download detected')
346
347 def _extract_signature_function(self, video_id, player_url, slen):
348 id_m = re.match(
349 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
350 player_url)
351 if not id_m:
352 raise ExtractorError('Cannot identify player %r' % player_url)
353 player_type = id_m.group('ext')
354 player_id = id_m.group('id')
355
356 # Read from filesystem cache
357 func_id = '%s_%s_%d' % (player_type, player_id, slen)
358 assert os.path.basename(func_id) == func_id
359 cache_dir = get_cachedir(self._downloader.params)
360
361 cache_enabled = cache_dir is not None
362 if cache_enabled:
363 cache_fn = os.path.join(os.path.expanduser(cache_dir),
364 u'youtube-sigfuncs',
365 func_id + '.json')
366 try:
367 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
368 cache_spec = json.load(cachef)
369 return lambda s: u''.join(s[i] for i in cache_spec)
370 except IOError:
371 pass # No cache available
372
373 if player_type == 'js':
374 code = self._download_webpage(
375 player_url, video_id,
376 note=u'Downloading %s player %s' % (player_type, player_id),
377 errnote=u'Download of %s failed' % player_url)
378 res = self._parse_sig_js(code)
379 elif player_type == 'swf':
380 urlh = self._request_webpage(
381 player_url, video_id,
382 note=u'Downloading %s player %s' % (player_type, player_id),
383 errnote=u'Download of %s failed' % player_url)
384 code = urlh.read()
385 res = self._parse_sig_swf(code)
386 else:
387 assert False, 'Invalid player type %r' % player_type
388
389 if cache_enabled:
390 try:
391 test_string = u''.join(map(compat_chr, range(slen)))
392 cache_res = res(test_string)
393 cache_spec = [ord(c) for c in cache_res]
394 try:
395 os.makedirs(os.path.dirname(cache_fn))
396 except OSError as ose:
397 if ose.errno != errno.EEXIST:
398 raise
399 write_json_file(cache_spec, cache_fn)
400 except Exception:
401 tb = traceback.format_exc()
402 self._downloader.report_warning(
403 u'Writing cache to %r failed: %s' % (cache_fn, tb))
404
405 return res
406
    def _print_sig_code(self, func, slen):
        """Print a Python snippet equivalent to the signature function
        *func* for signatures of length *slen* (supports the
        --youtube-print-sig-code debugging option)."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a run of indices with constant step as one slice.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            # Walk consecutive index pairs, coalescing +/-1 runs into slices
            # and emitting lone indices as s[i].
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it or close it out.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices begin a new run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element or the run still in progress.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe func with distinct characters to recover its permutation.
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
442
443 def _parse_sig_js(self, jscode):
444 funcname = self._search_regex(
445 r'signature=([$a-zA-Z]+)', jscode,
446 u'Initial JS player signature function name')
447
448 jsi = JSInterpreter(jscode)
449 initial_function = jsi.extract_function(funcname)
450 return lambda s: initial_function([s])
451
452 def _parse_sig_swf(self, file_contents):
453 swfi = SWFInterpreter(file_contents)
454 TARGET_CLASSNAME = u'SignatureDecipher'
455 searched_class = swfi.extract_class(TARGET_CLASSNAME)
456 initial_function = swfi.extract_function(searched_class, u'decipher')
457 return lambda s: initial_function([s])
458
459 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
460 """Turn the encrypted s field into a working signature"""
461
462 if player_url is None:
463 raise ExtractorError(u'Cannot decrypt signature without player_url')
464
465 if player_url.startswith(u'//'):
466 player_url = u'https:' + player_url
467 try:
468 player_id = (player_url, len(s))
469 if player_id not in self._player_cache:
470 func = self._extract_signature_function(
471 video_id, player_url, len(s)
472 )
473 self._player_cache[player_id] = func
474 func = self._player_cache[player_id]
475 if self._downloader.params.get('youtube_print_sig_code'):
476 self._print_sig_code(func, len(s))
477 return func(s)
478 except Exception as e:
479 tb = traceback.format_exc()
480 raise ExtractorError(
481 u'Automatic signature extraction failed: ' + tb, cause=e)
482
483 def _get_available_subtitles(self, video_id, webpage):
484 try:
485 sub_list = self._download_webpage(
486 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
487 video_id, note=False)
488 except ExtractorError as err:
489 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
490 return {}
491 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
492
493 sub_lang_list = {}
494 for l in lang_list:
495 lang = l[1]
496 params = compat_urllib_parse.urlencode({
497 'lang': lang,
498 'v': video_id,
499 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
500 'name': unescapeHTML(l[0]).encode('utf-8'),
501 })
502 url = u'https://www.youtube.com/api/timedtext?' + params
503 sub_lang_list[lang] = url
504 if not sub_lang_list:
505 self._downloader.report_warning(u'video doesn\'t have subtitles')
506 return {}
507 return sub_lang_list
508
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language code -> caption URL for the
        machine-translated ASR captions, or {} when none are available."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL lives in the embedded player configuration.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # Only an ASR-kind <track> means automatic captions exist.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            # Each <target> is a language the ASR track can be translated to.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
555
556 @classmethod
557 def extract_id(cls, url):
558 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
559 if mobj is None:
560 raise ExtractorError(u'Invalid URL: %s' % url)
561 video_id = mobj.group(2)
562 return video_id
563
564 def _extract_from_m3u8(self, manifest_url, video_id):
565 url_map = {}
566 def _get_urls(_manifest):
567 lines = _manifest.split('\n')
568 urls = filter(lambda l: l and not l.startswith('#'),
569 lines)
570 return urls
571 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
572 formats_urls = _get_urls(manifest)
573 for format_url in formats_urls:
574 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
575 url_map[itag] = format_url
576 return url_map
577
578 def _extract_annotations(self, video_id):
579 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
580 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
581
582 def _real_extract(self, url):
583 proto = (
584 u'http' if self._downloader.params.get('prefer_insecure', False)
585 else u'https')
586
587 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
588 mobj = re.search(self._NEXT_URL_RE, url)
589 if mobj:
590 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
591 video_id = self.extract_id(url)
592
593 # Get video webpage
594 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
595 video_webpage = self._download_webpage(url, video_id)
596
597 # Attempt to extract SWF player URL
598 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
599 if mobj is not None:
600 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
601 else:
602 player_url = None
603
604 # Get video info
605 self.report_video_info_webpage_download(video_id)
606 if re.search(r'player-age-gate-content">', video_webpage) is not None:
607 self.report_age_confirmation()
608 age_gate = True
609 # We simulate the access to the video from www.youtube.com/v/{video_id}
610 # this can be viewed without login into Youtube
611 data = compat_urllib_parse.urlencode({
612 'video_id': video_id,
613 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
614 'sts': self._search_regex(
615 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
616 })
617 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
618 video_info_webpage = self._download_webpage(video_info_url, video_id,
619 note=False,
620 errnote='unable to download video info webpage')
621 video_info = compat_parse_qs(video_info_webpage)
622 else:
623 age_gate = False
624 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
625 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
626 % (video_id, el_type))
627 video_info_webpage = self._download_webpage(video_info_url, video_id,
628 note=False,
629 errnote='unable to download video info webpage')
630 video_info = compat_parse_qs(video_info_webpage)
631 if 'token' in video_info:
632 break
633 if 'token' not in video_info:
634 if 'reason' in video_info:
635 raise ExtractorError(
636 u'YouTube said: %s' % video_info['reason'][0],
637 expected=True, video_id=video_id)
638 else:
639 raise ExtractorError(
640 u'"token" parameter not in video info for unknown reason',
641 video_id=video_id)
642
643 if 'view_count' in video_info:
644 view_count = int(video_info['view_count'][0])
645 else:
646 view_count = None
647
648 # Check for "rental" videos
649 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
650 raise ExtractorError(u'"rental" videos not supported')
651
652 # Start extracting information
653 self.report_information_extraction(video_id)
654
655 # uploader
656 if 'author' not in video_info:
657 raise ExtractorError(u'Unable to extract uploader name')
658 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
659
660 # uploader_id
661 video_uploader_id = None
662 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
663 if mobj is not None:
664 video_uploader_id = mobj.group(1)
665 else:
666 self._downloader.report_warning(u'unable to extract uploader nickname')
667
668 # title
669 if 'title' in video_info:
670 video_title = video_info['title'][0]
671 else:
672 self._downloader.report_warning(u'Unable to extract video title')
673 video_title = u'_'
674
675 # thumbnail image
676 # We try first to get a high quality image:
677 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
678 video_webpage, re.DOTALL)
679 if m_thumb is not None:
680 video_thumbnail = m_thumb.group(1)
681 elif 'thumbnail_url' not in video_info:
682 self._downloader.report_warning(u'unable to extract video thumbnail')
683 video_thumbnail = None
684 else: # don't panic if we can't find it
685 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
686
687 # upload date
688 upload_date = None
689 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
690 if mobj is None:
691 mobj = re.search(
692 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
693 video_webpage)
694 if mobj is not None:
695 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
696 upload_date = unified_strdate(upload_date)
697
698 m_cat_container = get_element_by_id("eow-category", video_webpage)
699 if m_cat_container:
700 category = self._html_search_regex(
701 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
702 default=None)
703 video_categories = None if category is None else [category]
704 else:
705 video_categories = None
706
707 # description
708 video_description = get_element_by_id("eow-description", video_webpage)
709 if video_description:
710 video_description = re.sub(r'''(?x)
711 <a\s+
712 (?:[a-zA-Z-]+="[^"]+"\s+)*?
713 title="([^"]+)"\s+
714 (?:[a-zA-Z-]+="[^"]+"\s+)*?
715 class="yt-uix-redirect-link"\s*>
716 [^<]+
717 </a>
718 ''', r'\1', video_description)
719 video_description = clean_html(video_description)
720 else:
721 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
722 if fd_mobj:
723 video_description = unescapeHTML(fd_mobj.group(1))
724 else:
725 video_description = u''
726
727 def _extract_count(klass):
728 count = self._search_regex(
729 r'class="%s">([\d,]+)</span>' % re.escape(klass),
730 video_webpage, klass, default=None)
731 if count is not None:
732 return int(count.replace(',', ''))
733 return None
734 like_count = _extract_count(u'likes-count')
735 dislike_count = _extract_count(u'dislikes-count')
736
737 # subtitles
738 video_subtitles = self.extract_subtitles(video_id, video_webpage)
739
740 if self._downloader.params.get('listsubtitles', False):
741 self._list_available_subtitles(video_id, video_webpage)
742 return
743
744 if 'length_seconds' not in video_info:
745 self._downloader.report_warning(u'unable to extract video duration')
746 video_duration = None
747 else:
748 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
749
750 # annotations
751 video_annotations = None
752 if self._downloader.params.get('writeannotations', False):
753 video_annotations = self._extract_annotations(video_id)
754
755 # Decide which formats to download
756 try:
757 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
758 if not mobj:
759 raise ValueError('Could not find vevo ID')
760 json_code = uppercase_escape(mobj.group(1))
761 ytplayer_config = json.loads(json_code)
762 args = ytplayer_config['args']
763 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
764 # this signatures are encrypted
765 if 'url_encoded_fmt_stream_map' not in args:
766 raise ValueError(u'No stream_map present') # caught below
767 re_signature = re.compile(r'[&,]s=')
768 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
769 if m_s is not None:
770 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
771 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
772 m_s = re_signature.search(args.get('adaptive_fmts', u''))
773 if m_s is not None:
774 if 'adaptive_fmts' in video_info:
775 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
776 else:
777 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
778 except ValueError:
779 pass
780
781 def _map_to_format_list(urlmap):
782 formats = []
783 for itag, video_real_url in urlmap.items():
784 dct = {
785 'format_id': itag,
786 'url': video_real_url,
787 'player_url': player_url,
788 }
789 if itag in self._formats:
790 dct.update(self._formats[itag])
791 formats.append(dct)
792 return formats
793
794 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
795 self.report_rtmp_download()
796 formats = [{
797 'format_id': '_rtmp',
798 'protocol': 'rtmp',
799 'url': video_info['conn'][0],
800 'player_url': player_url,
801 }]
802 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
803 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
804 if 'rtmpe%3Dyes' in encoded_url_map:
805 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
806 url_map = {}
807 for url_data_str in encoded_url_map.split(','):
808 url_data = compat_parse_qs(url_data_str)
809 if 'itag' in url_data and 'url' in url_data:
810 url = url_data['url'][0]
811 if 'sig' in url_data:
812 url += '&signature=' + url_data['sig'][0]
813 elif 's' in url_data:
814 encrypted_sig = url_data['s'][0]
815
816 if not age_gate:
817 jsplayer_url_json = self._search_regex(
818 r'"assets":.+?"js":\s*("[^"]+")',
819 video_webpage, u'JS player URL')
820 player_url = json.loads(jsplayer_url_json)
821 if player_url is None:
822 player_url_json = self._search_regex(
823 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
824 video_webpage, u'age gate player URL')
825 player_url = json.loads(player_url_json)
826
827 if self._downloader.params.get('verbose'):
828 if player_url is None:
829 player_version = 'unknown'
830 player_desc = 'unknown'
831 else:
832 if player_url.endswith('swf'):
833 player_version = self._search_regex(
834 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
835 u'flash player', fatal=False)
836 player_desc = 'flash player %s' % player_version
837 else:
838 player_version = self._search_regex(
839 r'html5player-([^/]+?)(?:/html5player)?\.js',
840 player_url,
841 'html5 player', fatal=False)
842 player_desc = u'html5 player %s' % player_version
843
844 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
845 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
846 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
847
848 signature = self._decrypt_signature(
849 encrypted_sig, video_id, player_url, age_gate)
850 url += '&signature=' + signature
851 if 'ratebypass' not in url:
852 url += '&ratebypass=yes'
853 url_map[url_data['itag'][0]] = url
854 formats = _map_to_format_list(url_map)
855 elif video_info.get('hlsvp'):
856 manifest_url = video_info['hlsvp'][0]
857 url_map = self._extract_from_m3u8(manifest_url, video_id)
858 formats = _map_to_format_list(url_map)
859 else:
860 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
861
862 # Look for the DASH manifest
863 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
864 try:
865 # The DASH manifest used needs to be the one from the original video_webpage.
866 # The one found in get_video_info seems to be using different signatures.
867 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
868 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
869 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
870 if age_gate:
871 dash_manifest_url = video_info.get('dashmpd')[0]
872 else:
873 dash_manifest_url = ytplayer_config['args']['dashmpd']
874 def decrypt_sig(mobj):
875 s = mobj.group(1)
876 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
877 return '/signature/%s' % dec_s
878 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
879 dash_doc = self._download_xml(
880 dash_manifest_url, video_id,
881 note=u'Downloading DASH manifest',
882 errnote=u'Could not download DASH manifest')
883 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
884 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
885 if url_el is None:
886 continue
887 format_id = r.attrib['id']
888 video_url = url_el.text
889 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
890 f = {
891 'format_id': format_id,
892 'url': video_url,
893 'width': int_or_none(r.attrib.get('width')),
894 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
895 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
896 'filesize': filesize,
897 }
898 try:
899 existing_format = next(
900 fo for fo in formats
901 if fo['format_id'] == format_id)
902 except StopIteration:
903 f.update(self._formats.get(format_id, {}))
904 formats.append(f)
905 else:
906 existing_format.update(f)
907
908 except (ExtractorError, KeyError) as e:
909 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
910
911 self._sort_formats(formats)
912
913 return {
914 'id': video_id,
915 'uploader': video_uploader,
916 'uploader_id': video_uploader_id,
917 'upload_date': upload_date,
918 'title': video_title,
919 'thumbnail': video_thumbnail,
920 'description': video_description,
921 'categories': video_categories,
922 'subtitles': video_subtitles,
923 'duration': video_duration,
924 'age_limit': 18 if age_gate else 0,
925 'annotations': video_annotations,
926 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
927 'view_count': view_count,
928 'like_count': like_count,
929 'dislike_count': dislike_count,
930 'formats': formats,
931 }
932
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist, including mixes and 'p/' URLs."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    # Canonical playlist page URL; %s is the playlist id.
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Substring present in the "load more" widget while further pages exist.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    # One playlist entry; captures the video id and its position in the list.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Private playlists need authentication; _login is a no-op without
        # credentials (see YoutubeBaseInfoExtractor).
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each plain video id into an url_result handled by YoutubeIE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist; ids are scraped from the watch page since
        mixes have no regular playlist page."""
        # The mixes are generated from a a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title markup varies between layouts; try known class names in order.
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # The first page doubles as both content and "load more" widget.
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Follow the "load more" AJAX link until it disappears.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1043
1044
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract YouTube "top list" charts addressed via the yttoplist keyword."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        # Find the playlist link on the channel page via its title query string.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        ids = []
        attempt = 0
        while not ids:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            attempt += 1
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1075
1076
class YoutubeChannelIE(InfoExtractor):
    """Extract every video uploaded to a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids found in page, deduplicated, in page order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = match.group(1)
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Autogenerated channels list every video on this single page; their
        # ajax pagination endpoints come back empty.
        autogenerated = re.search(r'''(?x)
            class="[^"]*?(?:
                channel-header-autogenerated-label|
                yt-channel-title-autogenerated
            )[^"]*"''', channel_page) is not None

        if autogenerated:
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Page through the json-based channel_ajax query until the
            # "load more" widget disappears.
            video_ids = []
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                   for video_id in video_ids]
        return self.playlist_result(entries, channel_id)
1131
1132
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Our regex is too permissive, so let every other youtube extractor
        # claim the URL first; only handle what none of them recognize.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = match.group(1)

        # The gdata API caps results per request (currently 50 videos), so
        # fetch page by page until a page comes back without entries.
        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No more videos: stop the PagedList iteration.
                return

            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(url_results, playlist_title=username)
1193
1194
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle "ytsearchN:query" searches through the gdata API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        video_ids = []
        limit = n
        page_index = 0

        while PAGE_SIZE * page_index < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * page_index + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the real total; never request past it.
            limit = min(n, api_response['totalItems'])
            page_index += 1

        # Pages are fetched in blocks of 50, so trim any overshoot.
        del video_ids[n:]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1236
1237
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as YoutubeSearchIE but orders results newest-first."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Identical to the parent API URL apart from the orderby=published suffix.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1243
1244
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the result list from a YouTube search-results page URL."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # Each result is wrapped in an <h3 class="yt-lockup-title"> element.
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', url_snippet),
                'title': title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1279
1280
class YoutubeShowIE(InfoExtractor):
    """Extract all season playlists of a YouTube show page."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        results = []
        for season in m_seasons:
            results.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
1294
1295
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Personal feeds (watch later, history) use a different ajax action.
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        entries = []
        paging = 0
        for page_idx in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_idx)
            feed_html = info.get('feed_html') or info.get('content_html')
            ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # Stop when the "load more" link no longer advertises a next page.
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1340
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed of new videos from the logged-in user's subscriptions."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1346
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed of videos YouTube recommends to the logged-in user."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1352
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """The logged-in user's "watch later" list (a personal feed)."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1359
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """The logged-in user's watch history (a personal feed)."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Fix: this was a plain u'' literal, so the \. sequences were invalid
    # string escapes (a DeprecationWarning on py3) rather than a raw-string
    # regex like every sibling extractor's _VALID_URL. The matched pattern
    # is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1366
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the actual playlist id in a list= parameter.
        favourites_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1377
1378
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose v= parameter was eaten by the shell and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        # Matching one of these URLs means the video id was lost, almost
        # always because the URL was not quoted and the shell split it at '&'.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .',
            expected=True)