]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
release 2014.08.21
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import errno
4import io
5import itertools
6import json
7import os.path
8import re
9import traceback
10
11from .common import InfoExtractor, SearchInfoExtractor
12from .subtitles import SubtitlesInfoExtractor
13from ..jsinterp import JSInterpreter
14from ..swfinterp import SWFInterpreter
15from ..utils import (
16 compat_chr,
17 compat_parse_qs,
18 compat_urllib_parse,
19 compat_urllib_request,
20 compat_urlparse,
21 compat_str,
22
23 clean_html,
24 get_cachedir,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 PagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 write_json_file,
34 uppercase_escape,
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the YouTube UI to English so later regex scraping is stable.

        Returns True if the language page was downloaded successfully.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in to YouTube with credentials from the downloader/.netrc.

        Returns True on success and False on any failure (no credentials
        while login is optional, page download failure, or rejected
        credentials). Raises ExtractorError when _LOGIN_REQUIRED is set but
        no login info was provided.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Fix: this used to be a bare `return` (None); return False so the
            # method consistently yields a boolean on every failure path.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the response still contains the login form, the credentials
        # were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age-confirmation form. Always returns True (the
        download itself raises on a fatal error)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Set language, log in (if credentials given) and confirm age;
        # each step is skipped/aborted when the previous one fails.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
128
129
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Group 1: everything before the ID (optional, so a naked 11-char ID
    # also matches); group 2: the video ID itself.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the redirect target from age-verification style URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format metadata, merged into each extracted format dict.
    # 'preference' ranks format families (3D/HLS/DASH sorted below plain).
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},

        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
234
    IE_NAME = u'youtube'
    # Download-test definitions: each entry pins the expected metadata for a
    # specific video; 'md5:'-prefixed description values are checksums of the
    # full text.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
319
320
321 @classmethod
322 def suitable(cls, url):
323 """Receives a URL and returns True if suitable for this IE."""
324 if YoutubePlaylistIE.suitable(url): return False
325 return re.match(cls._VALID_URL, url) is not None
326
327 def __init__(self, *args, **kwargs):
328 super(YoutubeIE, self).__init__(*args, **kwargs)
329 self._player_cache = {}
330
331 def report_video_info_webpage_download(self, video_id):
332 """Report attempt to download video info webpage."""
333 self.to_screen(u'%s: Downloading video info webpage' % video_id)
334
335 def report_information_extraction(self, video_id):
336 """Report attempt to extract video information."""
337 self.to_screen(u'%s: Extracting video information' % video_id)
338
339 def report_unavailable_format(self, video_id, format):
340 """Report extracted video URL."""
341 self.to_screen(u'%s: Format %s not available' % (video_id, format))
342
343 def report_rtmp_download(self):
344 """Indicate the download will use the RTMP protocol."""
345 self.to_screen(u'RTMP download detected')
346
347 def _signature_cache_id(self, example_sig):
348 """ Return a string representation of a signature """
349 return u'.'.join(compat_str(len(part)) for part in example_sig.split('.'))
350
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Download the player (JS or SWF) and build a signature-decryption function.

        The learned character permutation is cached on disk, keyed by player
        type/id and the signature's length pattern, so subsequent videos using
        the same player skip the download and parsing entirely.
        """
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes a filename component; it must not contain separators
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    # The cached spec is a list of source indices: the
                    # decrypted signature is s[i] for each i, in order.
                    cache_spec = json.load(cachef)
                    return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                # Feed the function a probe string of distinct characters to
                # learn which input positions end up in the output, then
                # persist those indices as the cache spec.
                test_string = u''.join(map(compat_chr, range(len(example_sig))))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    # Only an already-existing directory is acceptable
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                # Caching is best-effort; never fail extraction because of it
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
411
    def _print_sig_code(self, func, example_sig):
        """Print equivalent Python code for the extracted signature function.

        Runs func on a probe string to recover the permutation it applies,
        then emits it as a sum of slice/index expressions, compressing runs of
        consecutive (step +/-1) indices into slices.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting parts that match
                # Python's slice defaults
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set as soon as step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it, or emit the finished
                    # slice and fall through to single-index handling
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new sliceable run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the trailing element or the still-open run
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                u'    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
450
451 def _parse_sig_js(self, jscode):
452 funcname = self._search_regex(
453 r'signature=([$a-zA-Z]+)', jscode,
454 u'Initial JS player signature function name')
455
456 jsi = JSInterpreter(jscode)
457 initial_function = jsi.extract_function(funcname)
458 return lambda s: initial_function([s])
459
460 def _parse_sig_swf(self, file_contents):
461 swfi = SWFInterpreter(file_contents)
462 TARGET_CLASSNAME = u'SignatureDecipher'
463 searched_class = swfi.extract_class(TARGET_CLASSNAME)
464 initial_function = swfi.extract_function(searched_class, u'decipher')
465 return lambda s: initial_function([s])
466
467 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
468 """Turn the encrypted s field into a working signature"""
469
470 if player_url is None:
471 raise ExtractorError(u'Cannot decrypt signature without player_url')
472
473 if player_url.startswith(u'//'):
474 player_url = u'https:' + player_url
475 try:
476 player_id = (player_url, self._signature_cache_id(s))
477 if player_id not in self._player_cache:
478 func = self._extract_signature_function(
479 video_id, player_url, s
480 )
481 self._player_cache[player_id] = func
482 func = self._player_cache[player_id]
483 if self._downloader.params.get('youtube_print_sig_code'):
484 self._print_sig_code(func, s)
485 return func(s)
486 except Exception as e:
487 tb = traceback.format_exc()
488 raise ExtractorError(
489 u'Signature extraction failed: ' + tb, cause=e)
490
491 def _get_available_subtitles(self, video_id, webpage):
492 try:
493 sub_list = self._download_webpage(
494 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
495 video_id, note=False)
496 except ExtractorError as err:
497 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
498 return {}
499 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
500
501 sub_lang_list = {}
502 for l in lang_list:
503 lang = l[1]
504 params = compat_urllib_parse.urlencode({
505 'lang': lang,
506 'v': video_id,
507 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
508 'name': unescapeHTML(l[0]).encode('utf-8'),
509 })
510 url = u'https://www.youtube.com/api/timedtext?' + params
511 sub_lang_list[lang] = url
512 if not sub_lang_list:
513 self._downloader.report_warning(u'video doesn\'t have subtitles')
514 return {}
515 return sub_lang_list
516
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language code -> caption URL, or an
        empty dict (with a warning) when automatic captions are unavailable.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Automatic captions exist only when the original track is ASR
            # (automatic speech recognition)
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # One translated-caption URL per available target language
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
563
564 @classmethod
565 def extract_id(cls, url):
566 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
567 if mobj is None:
568 raise ExtractorError(u'Invalid URL: %s' % url)
569 video_id = mobj.group(2)
570 return video_id
571
572 def _extract_from_m3u8(self, manifest_url, video_id):
573 url_map = {}
574 def _get_urls(_manifest):
575 lines = _manifest.split('\n')
576 urls = filter(lambda l: l and not l.startswith('#'),
577 lines)
578 return urls
579 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
580 formats_urls = _get_urls(manifest)
581 for format_url in formats_urls:
582 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
583 url_map[itag] = format_url
584 return url_map
585
586 def _extract_annotations(self, video_id):
587 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
588 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
589
590 def _real_extract(self, url):
591 proto = (
592 u'http' if self._downloader.params.get('prefer_insecure', False)
593 else u'https')
594
595 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
596 mobj = re.search(self._NEXT_URL_RE, url)
597 if mobj:
598 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
599 video_id = self.extract_id(url)
600
601 # Get video webpage
602 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
603 video_webpage = self._download_webpage(url, video_id)
604
605 # Attempt to extract SWF player URL
606 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
607 if mobj is not None:
608 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
609 else:
610 player_url = None
611
612 # Get video info
613 self.report_video_info_webpage_download(video_id)
614 if re.search(r'player-age-gate-content">', video_webpage) is not None:
615 self.report_age_confirmation()
616 age_gate = True
617 # We simulate the access to the video from www.youtube.com/v/{video_id}
618 # this can be viewed without login into Youtube
619 data = compat_urllib_parse.urlencode({
620 'video_id': video_id,
621 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
622 'sts': self._search_regex(
623 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
624 })
625 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
626 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 note=False,
628 errnote='unable to download video info webpage')
629 video_info = compat_parse_qs(video_info_webpage)
630 else:
631 age_gate = False
632 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
633 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
634 % (video_id, el_type))
635 video_info_webpage = self._download_webpage(video_info_url, video_id,
636 note=False,
637 errnote='unable to download video info webpage')
638 video_info = compat_parse_qs(video_info_webpage)
639 if 'token' in video_info:
640 break
641 if 'token' not in video_info:
642 if 'reason' in video_info:
643 raise ExtractorError(
644 u'YouTube said: %s' % video_info['reason'][0],
645 expected=True, video_id=video_id)
646 else:
647 raise ExtractorError(
648 u'"token" parameter not in video info for unknown reason',
649 video_id=video_id)
650
651 if 'view_count' in video_info:
652 view_count = int(video_info['view_count'][0])
653 else:
654 view_count = None
655
656 # Check for "rental" videos
657 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
658 raise ExtractorError(u'"rental" videos not supported')
659
660 # Start extracting information
661 self.report_information_extraction(video_id)
662
663 # uploader
664 if 'author' not in video_info:
665 raise ExtractorError(u'Unable to extract uploader name')
666 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
667
668 # uploader_id
669 video_uploader_id = None
670 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
671 if mobj is not None:
672 video_uploader_id = mobj.group(1)
673 else:
674 self._downloader.report_warning(u'unable to extract uploader nickname')
675
676 # title
677 if 'title' in video_info:
678 video_title = video_info['title'][0]
679 else:
680 self._downloader.report_warning(u'Unable to extract video title')
681 video_title = u'_'
682
683 # thumbnail image
684 # We try first to get a high quality image:
685 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
686 video_webpage, re.DOTALL)
687 if m_thumb is not None:
688 video_thumbnail = m_thumb.group(1)
689 elif 'thumbnail_url' not in video_info:
690 self._downloader.report_warning(u'unable to extract video thumbnail')
691 video_thumbnail = None
692 else: # don't panic if we can't find it
693 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
694
695 # upload date
696 upload_date = None
697 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
698 if mobj is None:
699 mobj = re.search(
700 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
701 video_webpage)
702 if mobj is not None:
703 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
704 upload_date = unified_strdate(upload_date)
705
706 m_cat_container = get_element_by_id("eow-category", video_webpage)
707 if m_cat_container:
708 category = self._html_search_regex(
709 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
710 default=None)
711 video_categories = None if category is None else [category]
712 else:
713 video_categories = None
714
715 # description
716 video_description = get_element_by_id("eow-description", video_webpage)
717 if video_description:
718 video_description = re.sub(r'''(?x)
719 <a\s+
720 (?:[a-zA-Z-]+="[^"]+"\s+)*?
721 title="([^"]+)"\s+
722 (?:[a-zA-Z-]+="[^"]+"\s+)*?
723 class="yt-uix-redirect-link"\s*>
724 [^<]+
725 </a>
726 ''', r'\1', video_description)
727 video_description = clean_html(video_description)
728 else:
729 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
730 if fd_mobj:
731 video_description = unescapeHTML(fd_mobj.group(1))
732 else:
733 video_description = u''
734
735 def _extract_count(klass):
736 count = self._search_regex(
737 r'class="%s">([\d,]+)</span>' % re.escape(klass),
738 video_webpage, klass, default=None)
739 if count is not None:
740 return int(count.replace(',', ''))
741 return None
742 like_count = _extract_count(u'likes-count')
743 dislike_count = _extract_count(u'dislikes-count')
744
745 # subtitles
746 video_subtitles = self.extract_subtitles(video_id, video_webpage)
747
748 if self._downloader.params.get('listsubtitles', False):
749 self._list_available_subtitles(video_id, video_webpage)
750 return
751
752 if 'length_seconds' not in video_info:
753 self._downloader.report_warning(u'unable to extract video duration')
754 video_duration = None
755 else:
756 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
757
758 # annotations
759 video_annotations = None
760 if self._downloader.params.get('writeannotations', False):
761 video_annotations = self._extract_annotations(video_id)
762
763 # Decide which formats to download
764 try:
765 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
766 if not mobj:
767 raise ValueError('Could not find vevo ID')
768 json_code = uppercase_escape(mobj.group(1))
769 ytplayer_config = json.loads(json_code)
770 args = ytplayer_config['args']
771 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
772 # this signatures are encrypted
773 if 'url_encoded_fmt_stream_map' not in args:
774 raise ValueError(u'No stream_map present') # caught below
775 re_signature = re.compile(r'[&,]s=')
776 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
777 if m_s is not None:
778 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
779 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
780 m_s = re_signature.search(args.get('adaptive_fmts', u''))
781 if m_s is not None:
782 if 'adaptive_fmts' in video_info:
783 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
784 else:
785 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
786 except ValueError:
787 pass
788
789 def _map_to_format_list(urlmap):
790 formats = []
791 for itag, video_real_url in urlmap.items():
792 dct = {
793 'format_id': itag,
794 'url': video_real_url,
795 'player_url': player_url,
796 }
797 if itag in self._formats:
798 dct.update(self._formats[itag])
799 formats.append(dct)
800 return formats
801
802 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
803 self.report_rtmp_download()
804 formats = [{
805 'format_id': '_rtmp',
806 'protocol': 'rtmp',
807 'url': video_info['conn'][0],
808 'player_url': player_url,
809 }]
810 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
811 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
812 if 'rtmpe%3Dyes' in encoded_url_map:
813 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
814 url_map = {}
815 for url_data_str in encoded_url_map.split(','):
816 url_data = compat_parse_qs(url_data_str)
817 if 'itag' not in url_data or 'url' not in url_data:
818 continue
819 format_id = url_data['itag'][0]
820 url = url_data['url'][0]
821
822 if 'sig' in url_data:
823 url += '&signature=' + url_data['sig'][0]
824 elif 's' in url_data:
825 encrypted_sig = url_data['s'][0]
826
827 if not age_gate:
828 jsplayer_url_json = self._search_regex(
829 r'"assets":.+?"js":\s*("[^"]+")',
830 video_webpage, u'JS player URL')
831 player_url = json.loads(jsplayer_url_json)
832 if player_url is None:
833 player_url_json = self._search_regex(
834 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
835 video_webpage, u'age gate player URL')
836 player_url = json.loads(player_url_json)
837
838 if self._downloader.params.get('verbose'):
839 if player_url is None:
840 player_version = 'unknown'
841 player_desc = 'unknown'
842 else:
843 if player_url.endswith('swf'):
844 player_version = self._search_regex(
845 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
846 u'flash player', fatal=False)
847 player_desc = 'flash player %s' % player_version
848 else:
849 player_version = self._search_regex(
850 r'html5player-([^/]+?)(?:/html5player)?\.js',
851 player_url,
852 'html5 player', fatal=False)
853 player_desc = u'html5 player %s' % player_version
854
855 parts_sizes = self._signature_cache_id(encrypted_sig)
856 self.to_screen(u'{%s} signature length %s, %s' %
857 (format_id, parts_sizes, player_desc))
858
859 signature = self._decrypt_signature(
860 encrypted_sig, video_id, player_url, age_gate)
861 url += '&signature=' + signature
862 if 'ratebypass' not in url:
863 url += '&ratebypass=yes'
864 url_map[format_id] = url
865 formats = _map_to_format_list(url_map)
866 elif video_info.get('hlsvp'):
867 manifest_url = video_info['hlsvp'][0]
868 url_map = self._extract_from_m3u8(manifest_url, video_id)
869 formats = _map_to_format_list(url_map)
870 else:
871 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
872
873 # Look for the DASH manifest
874 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
875 try:
876 # The DASH manifest used needs to be the one from the original video_webpage.
877 # The one found in get_video_info seems to be using different signatures.
878 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
879 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
880 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
881 if age_gate:
882 dash_manifest_url = video_info.get('dashmpd')[0]
883 else:
884 dash_manifest_url = ytplayer_config['args']['dashmpd']
885 def decrypt_sig(mobj):
886 s = mobj.group(1)
887 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
888 return '/signature/%s' % dec_s
889 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
890 dash_doc = self._download_xml(
891 dash_manifest_url, video_id,
892 note=u'Downloading DASH manifest',
893 errnote=u'Could not download DASH manifest')
894 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
895 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
896 if url_el is None:
897 continue
898 format_id = r.attrib['id']
899 video_url = url_el.text
900 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
901 f = {
902 'format_id': format_id,
903 'url': video_url,
904 'width': int_or_none(r.attrib.get('width')),
905 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
906 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
907 'filesize': filesize,
908 }
909 try:
910 existing_format = next(
911 fo for fo in formats
912 if fo['format_id'] == format_id)
913 except StopIteration:
914 f.update(self._formats.get(format_id, {}))
915 formats.append(f)
916 else:
917 existing_format.update(f)
918
919 except (ExtractorError, KeyError) as e:
920 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
921
922 self._sort_formats(formats)
923
924 return {
925 'id': video_id,
926 'uploader': video_uploader,
927 'uploader_id': video_uploader_id,
928 'upload_date': upload_date,
929 'title': video_title,
930 'thumbnail': video_thumbnail,
931 'description': video_description,
932 'categories': video_categories,
933 'subtitles': video_subtitles,
934 'duration': video_duration,
935 'age_limit': 18 if age_gate else 0,
936 'annotations': video_annotations,
937 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
938 'view_count': view_count,
939 'like_count': like_count,
940 'dislike_count': dislike_count,
941 'formats': formats,
942 }
943
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Private playlists need authentication; _login is a no-op without
        # credentials (see YoutubeBaseInfoExtractor._login).
        self._login()

    def _ids_to_results(self, ids):
        """Turn a list of video ids into 'url' result dicts handled by the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title element's class name has varied over time; try each known variant.
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract a playlist, delegating mixes ('RD...') to _extract_mix and
        paging through the playlist HTML to collect video ids otherwise."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # The "load more" widget carries the URL of the next page, if any.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1054
1055
class YoutubeTopListIE(YoutubePlaylistIE):
    """Handle the "yttoplist:{channel}:{list title}" pseudo-URL scheme."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        # Find the playlist link on the channel page by its URL-encoded title.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(query), channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        ids = []
        for attempt in itertools.count(0):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1086
1087
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, deduplicated, in order of appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = match.group(1)
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1142
1143
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Past the last page: nothing more to yield.
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        # PagedList calls download_page lazily as results are consumed.
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1204
1205
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData search API (50 results per page) until n
        results are collected or the API reports no more items.
        Raises ExtractorError when a page comes back without 'items'.
        """
        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)  # GData start-index is 1-based
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never request more than the API reports to exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1247
1248
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as YoutubeSearchIE, but the API query adds
    # orderby=published so newest uploads come first.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1254
1255
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the result list from a YouTube search-results page URL."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # Each result is wrapped in an <h3 class="yt-lockup-title"> element.
        for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1290
1291
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in m_seasons
        ]
1305
1306
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with one remaining '%s' placeholder for the paging value.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # _LOGIN_REQUIRED is True, so this raises without credentials.
        self._login()

    def _real_extract(self, url):
        """Page through the feed_ajax endpoint, collecting video ids until
        the response no longer carries a "load more" paging value."""
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            # The HTML key differs between feed variants; accept either.
            feed_html = info.get('feed_html') or info.get('content_html')
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1351
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Subscriptions feed; paging/extraction inherited from YoutubeFeedsInfoExtractor.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1357
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Recommended-videos feed; paging/extraction inherited from YoutubeFeedsInfoExtractor.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1363
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Watch-later list is per-user, hence the personal feed endpoint.
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
1370
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Watch history is per-user, hence the personal feed endpoint.
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string like every sibling extractor: the pattern contains regex
    # escapes ('\.') and should not rely on invalid-escape passthrough.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1377
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds an ordinary playlist; delegate to it.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1388
1389
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs missing their v= parameter and fail with a hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A watch URL with no v= parameter usually means an unquoted '&'
        # made the shell truncate the command line; explain that.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .',
            expected=True)