]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
[youtube] Fix extraction of like and dislike count (fixes #3633)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import errno
4import io
5import itertools
6import json
7import os.path
8import re
9import traceback
10
11from .common import InfoExtractor, SearchInfoExtractor
12from .subtitles import SubtitlesInfoExtractor
13from ..jsinterp import JSInterpreter
14from ..swfinterp import SWFInterpreter
15from ..utils import (
16 compat_chr,
17 compat_parse_qs,
18 compat_urllib_parse,
19 compat_urllib_request,
20 compat_urlparse,
21 compat_str,
22
23 clean_html,
24 get_cachedir,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 PagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 write_json_file,
34 uppercase_escape,
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the site in English; return False if the request failed."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # The GALX token from the login form must be echoed back in the POST.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,

            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Bail out instead of dereferencing a failed match below.
                self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Bail out instead of dereferencing a failed match below.
                self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'smsToken': u'',
                u'smsUserPin': tfa_code,
                u'smsVerifyPin': u'Verify',

                u'PersistentCookie': u'yes',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'pstMsg': u'1',
                u'secTok': secTok,
                u'timeStmp': timeStmp,
                u'service': u'youtube',
                u'hl': u'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning(u'unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set language, log in, and confirm age before any extraction."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
200
201
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose regex: group 1 is the (optional) URL prefix, group 2 the
    # 11-character video id (extract_id reads group 2).
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Query parameter carrying the real target of redirect-style URLs
    # (used by _real_extract to unwrap e.g. age-verification redirects).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format metadata table. 'preference' ranks whole format
    # families relative to the default progressive formats: HLS -10,
    # 3D -20, DASH video -40, DASH audio -50.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
306
    IE_NAME = u'youtube'
    # Integration test cases consumed by the project's test harness.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
391
392
393 @classmethod
394 def suitable(cls, url):
395 """Receives a URL and returns True if suitable for this IE."""
396 if YoutubePlaylistIE.suitable(url): return False
397 return re.match(cls._VALID_URL, url) is not None
398
399 def __init__(self, *args, **kwargs):
400 super(YoutubeIE, self).__init__(*args, **kwargs)
401 self._player_cache = {}
402
403 def report_video_info_webpage_download(self, video_id):
404 """Report attempt to download video info webpage."""
405 self.to_screen(u'%s: Downloading video info webpage' % video_id)
406
407 def report_information_extraction(self, video_id):
408 """Report attempt to extract video information."""
409 self.to_screen(u'%s: Extracting video information' % video_id)
410
411 def report_unavailable_format(self, video_id, format):
412 """Report extracted video URL."""
413 self.to_screen(u'%s: Format %s not available' % (video_id, format))
414
415 def report_rtmp_download(self):
416 """Indicate the download will use the RTMP protocol."""
417 self.to_screen(u'RTMP download detected')
418
419 def _signature_cache_id(self, example_sig):
420 """ Return a string representation of a signature """
421 return u'.'.join(compat_str(len(part)) for part in example_sig.split('.'))
422
    def _extract_signature_function(self, video_id, player_url, example_sig):
        # Build (and cache on disk) a function deciphering encrypted
        # signatures with the same length pattern as example_sig, extracted
        # from the JS or SWF player at player_url.
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')  # 'js' or 'swf'
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id is used as a filename; ensure it contains no path separators.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                # Cached spec is a list of source indices; the decipher
                # function is just that permutation applied to the signature.
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available
            except ValueError:
                # Corrupt cache file: report its size (or the stat error).
                try:
                    file_size = os.path.getsize(cache_fn)
                except (OSError, IOError) as oe:
                    file_size = str(oe)
                self._downloader.report_warning(
                    u'Cache %s failed (%s)' % (cache_fn, file_size))

        # Cache miss: download the player and extract the function from it.
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            # Derive the permutation by running the function on a probe
            # string of distinct characters, then persist it as JSON.
            try:
                test_string = u''.join(map(compat_chr, range(len(example_sig))))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                # Cache write failures are non-fatal; the function still works.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
490
    def _print_sig_code(self, func, example_sig):
        # Debug helper (--youtube-print-sig-code): run func on a probe string
        # and print equivalent Python index/slice code for the permutation.
        def gen_sig_code(idxs):
            # Compress the index list into s[i] terms and s[a:b:step] runs.
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it, or flush it as a slice.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two consecutive indices start an ascending/descending run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the trailing element or run (i is the last index seen).
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with a string of distinct characters to recover the permutation.
        test_string = u''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                u'    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
529
530 def _parse_sig_js(self, jscode):
531 funcname = self._search_regex(
532 r'signature=([$a-zA-Z]+)', jscode,
533 u'Initial JS player signature function name')
534
535 jsi = JSInterpreter(jscode)
536 initial_function = jsi.extract_function(funcname)
537 return lambda s: initial_function([s])
538
539 def _parse_sig_swf(self, file_contents):
540 swfi = SWFInterpreter(file_contents)
541 TARGET_CLASSNAME = u'SignatureDecipher'
542 searched_class = swfi.extract_class(TARGET_CLASSNAME)
543 initial_function = swfi.extract_function(searched_class, u'decipher')
544 return lambda s: initial_function([s])
545
546 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
547 """Turn the encrypted s field into a working signature"""
548
549 if player_url is None:
550 raise ExtractorError(u'Cannot decrypt signature without player_url')
551
552 if player_url.startswith(u'//'):
553 player_url = u'https:' + player_url
554 try:
555 player_id = (player_url, self._signature_cache_id(s))
556 if player_id not in self._player_cache:
557 func = self._extract_signature_function(
558 video_id, player_url, s
559 )
560 self._player_cache[player_id] = func
561 func = self._player_cache[player_id]
562 if self._downloader.params.get('youtube_print_sig_code'):
563 self._print_sig_code(func, s)
564 return func(s)
565 except Exception as e:
566 tb = traceback.format_exc()
567 raise ExtractorError(
568 u'Signature extraction failed: ' + tb, cause=e)
569
570 def _get_available_subtitles(self, video_id, webpage):
571 try:
572 sub_list = self._download_webpage(
573 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
574 video_id, note=False)
575 except ExtractorError as err:
576 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
577 return {}
578 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
579
580 sub_lang_list = {}
581 for l in lang_list:
582 lang = l[1]
583 if lang in sub_lang_list:
584 continue
585 params = compat_urllib_parse.urlencode({
586 'lang': lang,
587 'v': video_id,
588 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
589 'name': unescapeHTML(l[0]).encode('utf-8'),
590 })
591 url = u'https://www.youtube.com/api/timedtext?' + params
592 sub_lang_list[lang] = url
593 if not sub_lang_list:
594 self._downloader.report_warning(u'video doesn\'t have subtitles')
595 return {}
596 return sub_lang_list
597
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption (tts) URL and timestamp live in the embedded player config.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Only ASR (automatic speech recognition) tracks count as
            # automatic captions.
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            # Build one translated-caption URL per available target language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
644
645 @classmethod
646 def extract_id(cls, url):
647 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
648 if mobj is None:
649 raise ExtractorError(u'Invalid URL: %s' % url)
650 video_id = mobj.group(2)
651 return video_id
652
653 def _extract_from_m3u8(self, manifest_url, video_id):
654 url_map = {}
655 def _get_urls(_manifest):
656 lines = _manifest.split('\n')
657 urls = filter(lambda l: l and not l.startswith('#'),
658 lines)
659 return urls
660 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
661 formats_urls = _get_urls(manifest)
662 for format_url in formats_urls:
663 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
664 url_map[itag] = format_url
665 return url_map
666
667 def _extract_annotations(self, video_id):
668 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
669 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
670
671 def _real_extract(self, url):
672 proto = (
673 u'http' if self._downloader.params.get('prefer_insecure', False)
674 else u'https')
675
676 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
677 mobj = re.search(self._NEXT_URL_RE, url)
678 if mobj:
679 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
680 video_id = self.extract_id(url)
681
682 # Get video webpage
683 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
684 video_webpage = self._download_webpage(url, video_id)
685
686 # Attempt to extract SWF player URL
687 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
688 if mobj is not None:
689 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
690 else:
691 player_url = None
692
693 # Get video info
694 self.report_video_info_webpage_download(video_id)
695 if re.search(r'player-age-gate-content">', video_webpage) is not None:
696 self.report_age_confirmation()
697 age_gate = True
698 # We simulate the access to the video from www.youtube.com/v/{video_id}
699 # this can be viewed without login into Youtube
700 data = compat_urllib_parse.urlencode({
701 'video_id': video_id,
702 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
703 'sts': self._search_regex(
704 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
705 })
706 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
707 video_info_webpage = self._download_webpage(video_info_url, video_id,
708 note=False,
709 errnote='unable to download video info webpage')
710 video_info = compat_parse_qs(video_info_webpage)
711 else:
712 age_gate = False
713 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
714 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
715 % (video_id, el_type))
716 video_info_webpage = self._download_webpage(video_info_url, video_id,
717 note=False,
718 errnote='unable to download video info webpage')
719 video_info = compat_parse_qs(video_info_webpage)
720 if 'token' in video_info:
721 break
722 if 'token' not in video_info:
723 if 'reason' in video_info:
724 raise ExtractorError(
725 u'YouTube said: %s' % video_info['reason'][0],
726 expected=True, video_id=video_id)
727 else:
728 raise ExtractorError(
729 u'"token" parameter not in video info for unknown reason',
730 video_id=video_id)
731
732 if 'view_count' in video_info:
733 view_count = int(video_info['view_count'][0])
734 else:
735 view_count = None
736
737 # Check for "rental" videos
738 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
739 raise ExtractorError(u'"rental" videos not supported')
740
741 # Start extracting information
742 self.report_information_extraction(video_id)
743
744 # uploader
745 if 'author' not in video_info:
746 raise ExtractorError(u'Unable to extract uploader name')
747 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
748
749 # uploader_id
750 video_uploader_id = None
751 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
752 if mobj is not None:
753 video_uploader_id = mobj.group(1)
754 else:
755 self._downloader.report_warning(u'unable to extract uploader nickname')
756
757 # title
758 if 'title' in video_info:
759 video_title = video_info['title'][0]
760 else:
761 self._downloader.report_warning(u'Unable to extract video title')
762 video_title = u'_'
763
764 # thumbnail image
765 # We try first to get a high quality image:
766 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
767 video_webpage, re.DOTALL)
768 if m_thumb is not None:
769 video_thumbnail = m_thumb.group(1)
770 elif 'thumbnail_url' not in video_info:
771 self._downloader.report_warning(u'unable to extract video thumbnail')
772 video_thumbnail = None
773 else: # don't panic if we can't find it
774 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
775
776 # upload date
777 upload_date = None
778 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
779 if mobj is None:
780 mobj = re.search(
781 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
782 video_webpage)
783 if mobj is not None:
784 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
785 upload_date = unified_strdate(upload_date)
786
787 m_cat_container = get_element_by_id("eow-category", video_webpage)
788 if m_cat_container:
789 category = self._html_search_regex(
790 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
791 default=None)
792 video_categories = None if category is None else [category]
793 else:
794 video_categories = None
795
796 # description
797 video_description = get_element_by_id("eow-description", video_webpage)
798 if video_description:
799 video_description = re.sub(r'''(?x)
800 <a\s+
801 (?:[a-zA-Z-]+="[^"]+"\s+)*?
802 title="([^"]+)"\s+
803 (?:[a-zA-Z-]+="[^"]+"\s+)*?
804 class="yt-uix-redirect-link"\s*>
805 [^<]+
806 </a>
807 ''', r'\1', video_description)
808 video_description = clean_html(video_description)
809 else:
810 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
811 if fd_mobj:
812 video_description = unescapeHTML(fd_mobj.group(1))
813 else:
814 video_description = u''
815
        def _extract_count(count_name):
            # Pull an integer counter (like/dislike totals) out of the watch
            # page markup: <span id="watch-<count_name>…> 1,234 </span>.
            # Returns None when the span is absent (counts can be hidden
            # per-video), so callers must tolerate missing values.
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                # The page renders counts with thousands separators.
                return int(count.replace(',', ''))
            return None
823 like_count = _extract_count(u'like')
824 dislike_count = _extract_count(u'dislike')
825
826 # subtitles
827 video_subtitles = self.extract_subtitles(video_id, video_webpage)
828
829 if self._downloader.params.get('listsubtitles', False):
830 self._list_available_subtitles(video_id, video_webpage)
831 return
832
833 if 'length_seconds' not in video_info:
834 self._downloader.report_warning(u'unable to extract video duration')
835 video_duration = None
836 else:
837 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
838
839 # annotations
840 video_annotations = None
841 if self._downloader.params.get('writeannotations', False):
842 video_annotations = self._extract_annotations(video_id)
843
844 # Decide which formats to download
845 try:
846 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
847 if not mobj:
848 raise ValueError('Could not find vevo ID')
849 json_code = uppercase_escape(mobj.group(1))
850 ytplayer_config = json.loads(json_code)
851 args = ytplayer_config['args']
852 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
853 # this signatures are encrypted
854 if 'url_encoded_fmt_stream_map' not in args:
855 raise ValueError(u'No stream_map present') # caught below
856 re_signature = re.compile(r'[&,]s=')
857 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
858 if m_s is not None:
859 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
860 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
861 m_s = re_signature.search(args.get('adaptive_fmts', u''))
862 if m_s is not None:
863 if 'adaptive_fmts' in video_info:
864 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
865 else:
866 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
867 except ValueError:
868 pass
869
        def _map_to_format_list(urlmap):
            # Turn an {itag: download_url} mapping into the formats list,
            # enriching each entry with the static per-itag metadata from
            # self._formats when the itag is known.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,  # closure var; may be None (see verbose branch)
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats
882
883 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
884 self.report_rtmp_download()
885 formats = [{
886 'format_id': '_rtmp',
887 'protocol': 'rtmp',
888 'url': video_info['conn'][0],
889 'player_url': player_url,
890 }]
891 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
892 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
893 if 'rtmpe%3Dyes' in encoded_url_map:
894 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
895 url_map = {}
896 for url_data_str in encoded_url_map.split(','):
897 url_data = compat_parse_qs(url_data_str)
898 if 'itag' not in url_data or 'url' not in url_data:
899 continue
900 format_id = url_data['itag'][0]
901 url = url_data['url'][0]
902
903 if 'sig' in url_data:
904 url += '&signature=' + url_data['sig'][0]
905 elif 's' in url_data:
906 encrypted_sig = url_data['s'][0]
907
908 if not age_gate:
909 jsplayer_url_json = self._search_regex(
910 r'"assets":.+?"js":\s*("[^"]+")',
911 video_webpage, u'JS player URL')
912 player_url = json.loads(jsplayer_url_json)
913 if player_url is None:
914 player_url_json = self._search_regex(
915 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
916 video_webpage, u'age gate player URL')
917 player_url = json.loads(player_url_json)
918
919 if self._downloader.params.get('verbose'):
920 if player_url is None:
921 player_version = 'unknown'
922 player_desc = 'unknown'
923 else:
924 if player_url.endswith('swf'):
925 player_version = self._search_regex(
926 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
927 u'flash player', fatal=False)
928 player_desc = 'flash player %s' % player_version
929 else:
930 player_version = self._search_regex(
931 r'html5player-([^/]+?)(?:/html5player)?\.js',
932 player_url,
933 'html5 player', fatal=False)
934 player_desc = u'html5 player %s' % player_version
935
936 parts_sizes = self._signature_cache_id(encrypted_sig)
937 self.to_screen(u'{%s} signature length %s, %s' %
938 (format_id, parts_sizes, player_desc))
939
940 signature = self._decrypt_signature(
941 encrypted_sig, video_id, player_url, age_gate)
942 url += '&signature=' + signature
943 if 'ratebypass' not in url:
944 url += '&ratebypass=yes'
945 url_map[format_id] = url
946 formats = _map_to_format_list(url_map)
947 elif video_info.get('hlsvp'):
948 manifest_url = video_info['hlsvp'][0]
949 url_map = self._extract_from_m3u8(manifest_url, video_id)
950 formats = _map_to_format_list(url_map)
951 else:
952 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
953
954 # Look for the DASH manifest
955 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
956 try:
957 # The DASH manifest used needs to be the one from the original video_webpage.
958 # The one found in get_video_info seems to be using different signatures.
959 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
960 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
961 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
962 if age_gate:
963 dash_manifest_url = video_info.get('dashmpd')[0]
964 else:
965 dash_manifest_url = ytplayer_config['args']['dashmpd']
966 def decrypt_sig(mobj):
967 s = mobj.group(1)
968 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
969 return '/signature/%s' % dec_s
970 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
971 dash_doc = self._download_xml(
972 dash_manifest_url, video_id,
973 note=u'Downloading DASH manifest',
974 errnote=u'Could not download DASH manifest')
975 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
976 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
977 if url_el is None:
978 continue
979 format_id = r.attrib['id']
980 video_url = url_el.text
981 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
982 f = {
983 'format_id': format_id,
984 'url': video_url,
985 'width': int_or_none(r.attrib.get('width')),
986 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
987 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
988 'filesize': filesize,
989 }
990 try:
991 existing_format = next(
992 fo for fo in formats
993 if fo['format_id'] == format_id)
994 except StopIteration:
995 f.update(self._formats.get(format_id, {}))
996 formats.append(f)
997 else:
998 existing_format.update(f)
999
1000 except (ExtractorError, KeyError) as e:
1001 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
1002
1003 self._sort_formats(formats)
1004
1005 return {
1006 'id': video_id,
1007 'uploader': video_uploader,
1008 'uploader_id': video_uploader_id,
1009 'upload_date': upload_date,
1010 'title': video_title,
1011 'thumbnail': video_thumbnail,
1012 'description': video_description,
1013 'categories': video_categories,
1014 'subtitles': video_subtitles,
1015 'duration': video_duration,
1016 'age_limit': 18 if age_gate else 0,
1017 'annotations': video_annotations,
1018 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1019 'view_count': view_count,
1020 'like_count': like_count,
1021 'dislike_count': dislike_count,
1022 'formats': formats,
1023 }
1024
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    # Group 1 matches playlist ids embedded in watch/playlist/course URLs,
    # group 2 matches bare playlist ids (PL…/LL…/EC…/UU…/FL…/RD…).
    _VALID_URL = r"""(?x)(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                     |  p/
                     )
                     (
                         (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                         # Top tracks, they can also include dots
                         |(?:MC)[\w\.]*
                     )
                     .*
                  |
                     ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                  )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        # Wrap plain video ids into url_result dicts handled by YoutubeIE.
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        # The title span class varies between page layouts; try each in turn.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL (watch?v=…&list=…): honour
        # --no-playlist by downloading just the video.
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages, following the ajax
        # "load more" widget until it disappears.
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1135
1136
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Resolve a yttoplist:<channel>:<title> pseudo-URL to a playlist."""
        mobj = re.match(self._VALID_URL, url)
        channel_name = mobj.group('chann')
        list_title = mobj.group('title')

        # Find the playlist link on the channel page by looking for the
        # URL-encoded list title inside an href attribute.
        encoded_title = compat_urllib_parse.urlencode({'title': list_title})
        link_re = 'href="([^"]+?%s.*?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel_name, list_title)
        playlist_link = self._html_search_regex(link_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', playlist_link)

        video_id_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for attempt in itertools.count(0):
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, list_title, note)
            ids = orderedSet(re.findall(video_id_re, webpage))
            if ids:
                break

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=list_title)
1167
1168
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, deduplicated, in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect every video id of a channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels ship all their videos in the initial page;
        # their ajax pagination endpoints come back empty.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        video_ids = []
        if autogenerated:
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Page through the json-based channel_ajax query until the
            # "load more" widget disappears from the response.
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                   for video_id in video_ids]
        return self.playlist_result(entries, channel_id)
1223
1224
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # The regex above is very permissive, so defer to any other youtube
        # extractor that claims this URL rather than shadowing it.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so we
        # fetch lazily, page by page, until a response carries no entries.
        def fetch_page(page_idx):
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No more uploads: stop the generator (and the paging).
                return

            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        url_results = PagedList(fetch_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1285
1286
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        collected_ids = []
        # The API may report fewer total results than requested; shrink the
        # effective limit as soon as totalItems is known.
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot n; keep at most n ids.
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
1328
1329
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same GData search as YoutubeSearchIE, but the API URL adds
    # orderby=published and the search key is "ytsearchdate".
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1335
1336
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Turn a results page into a playlist of the listed videos."""
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # One <h3 class="yt-lockup-title"> block per search result.
        for item_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_code, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1371
1372
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist url_result per season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches]
1386
1387
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        entries = []
        paging = 0
        for page_idx in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_idx)
            feed_html = info.get('feed_html') or info.get('content_html')
            ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # Follow the "load more" widget; stop once it disappears.
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1432
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # System feed (default _PERSONAL_FEED = False from the base class).
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1438
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed (default _PERSONAL_FEED = False from the base class).
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1444
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Personal feed: the base class switches to action_load_personal_feed.
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
1451
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Personal feed: the base class switches to action_load_personal_feed.
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string for the regex, consistent with the sibling feed extractors.
    # The previous u'…\.…' literal only worked because Python preserves
    # unrecognized escape sequences; the pattern value is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1458
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the backing playlist;
        # delegate the actual extraction to the playlist extractor.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1469
1470
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch/attribution_link URLs that carry no video id at all —
    # typically the result of an unquoted '&' eaten by the user's shell.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing to extract: always fail with an actionable hint.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .',
            expected=True)