jfr.im git - yt-dlp.git/blame_incremental - youtube

Commit	Line	Data
	1	# coding: utf-8
	2
	3	from __future__ import unicode_literals
	4
	5
	6	import itertools
	7	import json
	8	import os.path
	9	import re
	10	import time
	11	import traceback
	12
	13	from .common import InfoExtractor, SearchInfoExtractor
	14	from ..jsinterp import JSInterpreter
	15	from ..swfinterp import SWFInterpreter
	16	from ..compat import (
	17	compat_chr,
	18	compat_parse_qs,
	19	compat_urllib_parse,
	20	compat_urllib_parse_unquote,
	21	compat_urllib_parse_unquote_plus,
	22	compat_urllib_parse_urlparse,
	23	compat_urlparse,
	24	compat_str,
	25	)
	26	from ..utils import (
	27	clean_html,
	28	encode_dict,
	29	error_to_compat_str,
	30	ExtractorError,
	31	float_or_none,
	32	get_element_by_attribute,
	33	get_element_by_id,
	34	int_or_none,
	35	mimetype2ext,
	36	orderedSet,
	37	parse_duration,
	38	remove_quotes,
	39	remove_start,
	40	sanitized_Request,
	41	smuggle_url,
	42	str_to_int,
	43	unescapeHTML,
	44	unified_strdate,
	45	unsmuggle_url,
	46	uppercase_escape,
	47	ISO3166Utils,
	48	)
	49
	50
	51	class YoutubeBaseInfoExtractor(InfoExtractor):
	52	"""Provide base functions for Youtube extractors"""
	53	_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
	54	_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
	55	_NETRC_MACHINE = 'youtube'
	56	# If True it will raise an error if no login info is provided
	57	_LOGIN_REQUIRED = False
	58
	59	def _set_language(self):
	60	self._set_cookie(
	61	'.youtube.com', 'PREF', 'f1=50000000&hl=en',
	62	# YouTube sets the expire time to about two months
	63	expire_time=time.time() + 2 * 30 * 24 * 3600)
	64
	65	def _ids_to_results(self, ids):
	66	return [
	67	self.url_result(vid_id, 'Youtube', video_id=vid_id)
	68	for vid_id in ids]
	69
	70	def _login(self):
	71	"""
	72	Attempt to log in to YouTube.
	73	True is returned if successful or skipped.
	74	False is returned if login failed.
	75
	76	If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
	77	"""
	78	(username, password) = self._get_login_info()
	79	# No authentication to be performed
	80	if username is None:
	81	if self._LOGIN_REQUIRED:
	82	raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
	83	return True
	84
	85	login_page = self._download_webpage(
	86	self._LOGIN_URL, None,
	87	note='Downloading login page',
	88	errnote='unable to fetch login page', fatal=False)
	89	if login_page is False:
	90	return
	91
	92	galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
	93	login_page, 'Login GALX parameter')
	94
	95	# Log in
	96	login_form_strs = {
	97	'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
	98	'Email': username,
	99	'GALX': galx,
	100	'Passwd': password,
	101
	102	'PersistentCookie': 'yes',
	103	'_utf8': '霱',
	104	'bgresponse': 'js_disabled',
	105	'checkConnection': '',
	106	'checkedDomains': 'youtube',
	107	'dnConn': '',
	108	'pstMsg': '0',
	109	'rmShown': '1',
	110	'secTok': '',
	111	'signIn': 'Sign in',
	112	'timeStmp': '',
	113	'service': 'youtube',
	114	'uilel': '3',
	115	'hl': 'en_US',
	116	}
	117
	118	login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
	119
	120	req = sanitized_Request(self._LOGIN_URL, login_data)
	121	login_results = self._download_webpage(
	122	req, None,
	123	note='Logging in', errnote='unable to log in', fatal=False)
	124	if login_results is False:
	125	return False
	126
	127	if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
	128	raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
	129
	130	# Two-Factor
	131	# TODO add SMS and phone call support - these require making a request and then prompting the user
	132
	133	if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
	134	tfa_code = self._get_tfa_info('2-step verification code')
	135
	136	if not tfa_code:
	137	self._downloader.report_warning(
	138	'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
	139	'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
	140	return False
	141
	142	tfa_code = remove_start(tfa_code, 'G-')
	143
	144	tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
	145
	146	tfa_form_strs.update({
	147	'Pin': tfa_code,
	148	'TrustDevice': 'on',
	149	})
	150
	151	tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
	152
	153	tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
	154	tfa_results = self._download_webpage(
	155	tfa_req, None,
	156	note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
	157
	158	if tfa_results is False:
	159	return False
	160
	161	if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
	162	self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
	163	return False
	164	if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
	165	self._downloader.report_warning('unable to log in - did the page structure change?')
	166	return False
	167	if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
	168	self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
	169	return False
	170
	171	if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
	172	self._downloader.report_warning('unable to log in: bad username or password')
	173	return False
	174	return True
	175
	176	def _real_initialize(self):
	177	if self._downloader is None:
	178	return
	179	self._set_language()
	180	if not self._login():
	181	return
	182
	183
	184	class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
	185	# Extract entries from page with "Load more" button
	186	def _entries(self, page, playlist_id):
	187	more_widget_html = content_html = page
	188	for page_num in itertools.count(1):
	189	for entry in self._process_page(content_html):
	190	yield entry
	191
	192	mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
	193	if not mobj:
	194	break
	195
	196	more = self._download_json(
	197	'https://youtube.com/%s' % mobj.group('more'), playlist_id,
	198	'Downloading page #%s' % page_num,
	199	transform_source=uppercase_escape)
	200	content_html = more['content_html']
	201	if not content_html.strip():
	202	# Some webpages show a "Load more" button but they don't
	203	# have more videos
	204	break
	205	more_widget_html = more['load_more_widget_html']
	206
	207
	208	class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	209	def _process_page(self, content):
	210	for video_id, video_title in self.extract_videos_from_page(content):
	211	yield self.url_result(video_id, 'Youtube', video_id, video_title)
	212
	213	def extract_videos_from_page(self, page):
	214	ids_in_page = []
	215	titles_in_page = []
	216	for mobj in re.finditer(self._VIDEO_RE, page):
	217	# The link with index 0 is not the first video of the playlist (not sure if still actual)
	218	if 'index' in mobj.groupdict() and mobj.group('id') == '0':
	219	continue
	220	video_id = mobj.group('id')
	221	video_title = unescapeHTML(mobj.group('title'))
	222	if video_title:
	223	video_title = video_title.strip()
	224	try:
	225	idx = ids_in_page.index(video_id)
	226	if video_title and not titles_in_page[idx]:
	227	titles_in_page[idx] = video_title
	228	except ValueError:
	229	ids_in_page.append(video_id)
	230	titles_in_page.append(video_title)
	231	return zip(ids_in_page, titles_in_page)
	232
	233
	234	class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	235	def _process_page(self, content):
	236	for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)):
	237	yield self.url_result(
	238	'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
	239
	240	def _real_extract(self, url):
	241	playlist_id = self._match_id(url)
	242	webpage = self._download_webpage(url, playlist_id)
	243	title = self._og_search_title(webpage, fatal=False)
	244	return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
	245
	246
	247	class YoutubeIE(YoutubeBaseInfoExtractor):
	248	IE_DESC = 'YouTube.com'
	249	_VALID_URL = r"""(?x)^
	250	(
	251	(?:https?://\|//) # http(s):// or protocol-independent URL
	252	(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/\|
	253	(?:www\.)?deturl\.com/www\.youtube\.com/\|
	254	(?:www\.)?pwnyoutube\.com/\|
	255	(?:www\.)?yourepeat\.com/\|
	256	tube\.majestyc\.net/\|
	257	youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
	258	(?:.*?\#/)? # handle anchor (#/) redirect urls
	259	(?: # the various things that can precede the ID:
	260	(?:(?:v\|embed\|e)/(?!videoseries)) # v/ or embed/ or e/
	261	\|(?: # or the v= param in all its forms
	262	(?:(?:watch\|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup\|.php) or nothing (like /?v=xxxx)
	263	(?:\?\|\#!?) # the params delimiter ? or # or #!
	264	(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
	265	v=
	266	)
	267	))
	268	\|(?:
	269	youtu\.be\| # just youtu.be/xxxx
	270	vid\.plus # or vid.plus/xxxx
	271	)/
	272	\|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
	273	)
	274	)? # all until now is optional -> you can pass the naked ID
	275	([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
	276	(?!.*?&list=) # combined list/video URLs are handled by the playlist IE
	277	(?(1).+)? # if we found the ID, everything can follow
	278	$"""
	279	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	280	_formats = {
	281	'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	282	'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	283	'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
	284	'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
	285	'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
	286	'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	287	'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	288	'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	289	'36': {'ext': '3gp', 'width': 320, 'height': 240, 'acodec': 'aac', 'abr': 32, 'vcodec': 'mp4v'},
	290	'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	291	'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	292	'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	293	'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	294	'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	295	'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	296	'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	297	'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	298
	299
	300	# 3D videos
	301	'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	302	'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	303	'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	304	'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	305	'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
	306	'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	307	'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	308
	309	# Apple HTTP Live Streaming
	310	'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	311	'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	312	'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	313	'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	314	'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	315	'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	316	'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
	317
	318	# DASH mp4 video
	319	'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	320	'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	321	'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	322	'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	323	'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	324	'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
	325	'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	326	'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	327	'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
	328	'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
	329	'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	330
	331	# Dash mp4 audio
	332	'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
	333	'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
	334	'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
	335
	336	# Dash webm
	337	'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	338	'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	339	'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	340	'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	341	'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	342	'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	343	'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
	344	'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	345	'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	346	'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	347	'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	348	'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	349	'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	350	'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	351	'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	352	# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
	353	'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	354	'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	355	'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	356	'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	357	'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	358	'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	359
	360	# Dash webm audio
	361	'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
	362	'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
	363
	364	# Dash webm audio with opus inside
	365	'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
	366	'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
	367	'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
	368
	369	# RTMP (unnamed)
	370	'_rtmp': {'protocol': 'rtmp'},
	371	}
	372	_SUBTITLE_FORMATS = ('ttml', 'vtt')
	373
	374	IE_NAME = 'youtube'
	375	_TESTS = [
	376	{
	377	'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
	378	'info_dict': {
	379	'id': 'BaW_jenozKc',
	380	'ext': 'mp4',
	381	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	382	'uploader': 'Philipp Hagemeister',
	383	'uploader_id': 'phihag',
	384	'upload_date': '20121002',
	385	'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
	386	'categories': ['Science & Technology'],
	387	'tags': ['youtube-dl'],
	388	'like_count': int,
	389	'dislike_count': int,
	390	'start_time': 1,
	391	'end_time': 9,
	392	}
	393	},
	394	{
	395	'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
	396	'note': 'Test generic use_cipher_signature video (#897)',
	397	'info_dict': {
	398	'id': 'UxxajLWwzqY',
	399	'ext': 'mp4',
	400	'upload_date': '20120506',
	401	'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
	402	'alt_title': 'I Love It (feat. Charli XCX)',
	403	'description': 'md5:782e8651347686cba06e58f71ab51773',
	404	'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
	405	'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
	406	'iconic ep', 'iconic', 'love', 'it'],
	407	'uploader': 'Icona Pop',
	408	'uploader_id': 'IconaPop',
	409	'creator': 'Icona Pop',
	410	}
	411	},
	412	{
	413	'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
	414	'note': 'Test VEVO video with age protection (#956)',
	415	'info_dict': {
	416	'id': '07FYdnEawAQ',
	417	'ext': 'mp4',
	418	'upload_date': '20130703',
	419	'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
	420	'alt_title': 'Tunnel Vision',
	421	'description': 'md5:64249768eec3bc4276236606ea996373',
	422	'uploader': 'justintimberlakeVEVO',
	423	'uploader_id': 'justintimberlakeVEVO',
	424	'creator': 'Justin Timberlake',
	425	'age_limit': 18,
	426	}
	427	},
	428	{
	429	'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
	430	'note': 'Embed-only video (#1746)',
	431	'info_dict': {
	432	'id': 'yZIXLfi8CZQ',
	433	'ext': 'mp4',
	434	'upload_date': '20120608',
	435	'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
	436	'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
	437	'uploader': 'SET India',
	438	'uploader_id': 'setindia',
	439	'age_limit': 18,
	440	}
	441	},
	442	{
	443	'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
	444	'note': 'Use the first video ID in the URL',
	445	'info_dict': {
	446	'id': 'BaW_jenozKc',
	447	'ext': 'mp4',
	448	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	449	'uploader': 'Philipp Hagemeister',
	450	'uploader_id': 'phihag',
	451	'upload_date': '20121002',
	452	'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
	453	'categories': ['Science & Technology'],
	454	'tags': ['youtube-dl'],
	455	'like_count': int,
	456	'dislike_count': int,
	457	},
	458	'params': {
	459	'skip_download': True,
	460	},
	461	},
	462	{
	463	'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
	464	'note': '256k DASH audio (format 141) via DASH manifest',
	465	'info_dict': {
	466	'id': 'a9LDPn-MO4I',
	467	'ext': 'm4a',
	468	'upload_date': '20121002',
	469	'uploader_id': '8KVIDEO',
	470	'description': '',
	471	'uploader': '8KVIDEO',
	472	'title': 'UHDTV TEST 8K VIDEO.mp4'
	473	},
	474	'params': {
	475	'youtube_include_dash_manifest': True,
	476	'format': '141',
	477	},
	478	},
	479	# DASH manifest with encrypted signature
	480	{
	481	'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
	482	'info_dict': {
	483	'id': 'IB3lcPjvWLA',
	484	'ext': 'm4a',
	485	'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
	486	'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
	487	'uploader': 'AfrojackVEVO',
	488	'uploader_id': 'AfrojackVEVO',
	489	'upload_date': '20131011',
	490	},
	491	'params': {
	492	'youtube_include_dash_manifest': True,
	493	'format': '141',
	494	},
	495	},
	496	# JS player signature function name containing $
	497	{
	498	'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
	499	'info_dict': {
	500	'id': 'nfWlot6h_JM',

1

# coding: utf-8

2

3

from __future__ import unicode_literals

import itertools

import json

import os.path

import re

import time

import traceback

from .common import InfoExtractor, SearchInfoExtractor

14

from ..jsinterp import JSInterpreter

15

from ..swfinterp import SWFInterpreter

16

from ..compat import (

compat_chr,

compat_parse_qs,

compat_urllib_parse,

compat_urllib_parse_unquote,

21

compat_urllib_parse_unquote_plus,

22

compat_urllib_parse_urlparse,

compat_urlparse,

compat_str,

)

from ..utils import (

clean_html,

encode_dict,

error_to_compat_str,

ExtractorError,

float_or_none,

get_element_by_attribute,

get_element_by_id,

int_or_none,

mimetype2ext,

orderedSet,

parse_duration,

remove_quotes,

remove_start,

sanitized_Request,

smuggle_url,

str_to_int,

unescapeHTML,

unified_strdate,

unsmuggle_url,

uppercase_escape,

ISO3166Utils,

)

class YoutubeBaseInfoExtractor(InfoExtractor):

52

"""Provide base functions for Youtube extractors"""

53

_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'

54

_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

55

_NETRC_MACHINE = 'youtube'

56

# If True it will raise an error if no login info is provided

57

_LOGIN_REQUIRED = False

58

59

def _set_language(self):

60

self._set_cookie(

61

'.youtube.com', 'PREF', 'f1=50000000&hl=en',

62

# YouTube sets the expire time to about two months

63

expire_time=time.time() + 2 * 30 * 24 * 3600)

64

65

def _ids_to_results(self, ids):

66

return [

67

self.url_result(vid_id, 'Youtube', video_id=vid_id)

for vid_id in ids]

def _login(self):

"""

Attempt to log in to YouTube.

73

True is returned if successful or skipped.

74

False is returned if login failed.

75

76

If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.

77

"""

78

(username, password) = self._get_login_info()

79

# No authentication to be performed

80

if username is None:

81

if self._LOGIN_REQUIRED:

82

raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)

83

return True

84

85

login_page = self._download_webpage(

86

self._LOGIN_URL, None,

87

note='Downloading login page',

88

errnote='unable to fetch login page', fatal=False)

89

if login_page is False:

90

return

91

92

galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',

93

login_page, 'Login GALX parameter')

# Log in

login_form_strs = {

'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',

'Email': username,

'GALX': galx,

'Passwd': password,

'PersistentCookie': 'yes',

103

'_utf8': '霱',

104

'bgresponse': 'js_disabled',

105

'checkConnection': '',

106

'checkedDomains': 'youtube',

'dnConn': '',

'pstMsg': '0',

'rmShown': '1',

'secTok': '',

'signIn': 'Sign in',

'timeStmp': '',

'service': 'youtube',

'uilel': '3',

'hl': 'en_US',

}

login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')

119

120

req = sanitized_Request(self._LOGIN_URL, login_data)

121

login_results = self._download_webpage(

122

req, None,

123

note='Logging in', errnote='unable to log in', fatal=False)

124

if login_results is False:

125

return False

126

127

if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:

128

raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

129

130

# Two-Factor

131

# TODO add SMS and phone call support - these require making a request and then prompting the user

132

133

if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:

134

tfa_code = self._get_tfa_info('2-step verification code')

135

136

if not tfa_code:

137

self._downloader.report_warning(

138

'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'

139

'(Note that only TOTP (Google Authenticator App) codes work at this time.)')

140

return False

141

142

tfa_code = remove_start(tfa_code, 'G-')

143

144

tfa_form_strs = self._form_hidden_inputs('challenge', login_results)

145

146

tfa_form_strs.update({

'Pin': tfa_code,

'TrustDevice': 'on',

})

tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')

152

153

tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)

154

tfa_results = self._download_webpage(

155

tfa_req, None,

156

note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

157

158

if tfa_results is False:

159

return False

160

161

if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:

162

self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')

163

return False

164

if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:

165

self._downloader.report_warning('unable to log in - did the page structure change?')

166

return False

167

if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:

168

self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')

169

return False

170

171

if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:

172

self._downloader.report_warning('unable to log in: bad username or password')

return False

return True

def _real_initialize(self):

177

if self._downloader is None:

178

return

179

self._set_language()

180

if not self._login():

return

class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):

185

# Extract entries from page with "Load more" button

186

def _entries(self, page, playlist_id):

187

more_widget_html = content_html = page

188

for page_num in itertools.count(1):

189

for entry in self._process_page(content_html):

190

yield entry

191

192

mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)

if not mobj:

break

more = self._download_json(

197

'https://youtube.com/%s' % mobj.group('more'), playlist_id,

198

'Downloading page #%s' % page_num,

199

transform_source=uppercase_escape)

200

content_html = more['content_html']

201

if not content_html.strip():

202

# Some webpages show a "Load more" button but they don't

203

# have more videos

204

break

205

more_widget_html = more['load_more_widget_html']

206

207

208

class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):

209

def _process_page(self, content):

210

for video_id, video_title in self.extract_videos_from_page(content):

211

yield self.url_result(video_id, 'Youtube', video_id, video_title)

212

213

def extract_videos_from_page(self, page):

214

ids_in_page = []

215

titles_in_page = []

216

for mobj in re.finditer(self._VIDEO_RE, page):

217

# The link with index 0 is not the first video of the playlist (not sure if still actual)

218

if 'index' in mobj.groupdict() and mobj.group('id') == '0':

219

continue

220

video_id = mobj.group('id')

221

video_title = unescapeHTML(mobj.group('title'))

222

if video_title:

223

video_title = video_title.strip()

224

try:

225

idx = ids_in_page.index(video_id)

226

if video_title and not titles_in_page[idx]:

227

titles_in_page[idx] = video_title

228

except ValueError:

229

ids_in_page.append(video_id)

230

titles_in_page.append(video_title)

231

return zip(ids_in_page, titles_in_page)

232

233

234

class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):

235

def _process_page(self, content):

236

for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)):

237

yield self.url_result(

238

'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')

239

240

def _real_extract(self, url):

241

playlist_id = self._match_id(url)

242

webpage = self._download_webpage(url, playlist_id)

243

title = self._og_search_title(webpage, fatal=False)

244

return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)

245

246

247

class YoutubeIE(YoutubeBaseInfoExtractor):

248

IE_DESC = 'YouTube.com'

249

_VALID_URL = r"""(?x)^

250

(

251

(?:https?://|//) # http(s):// or protocol-independent URL

252

(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|

253

(?:www\.)?deturl\.com/www\.youtube\.com/|

254

(?:www\.)?pwnyoutube\.com/|

255

(?:www\.)?yourepeat\.com/|

256

tube\.majestyc\.net/|

257

youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains

258

(?:.*?\#/)? # handle anchor (#/) redirect urls

259

(?: # the various things that can precede the ID:

260

(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/

261

|(?: # or the v= param in all its forms

262

(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)

263

(?:\?|\#!?) # the params delimiter ? or # or #!

264

(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)

v=

)

))

|(?:

youtu\.be| # just youtu.be/xxxx

270

vid\.plus # or vid.plus/xxxx

271

)/

272

|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=

273

)

274

)? # all until now is optional -> you can pass the naked ID

275

([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID

276

(?!.*?&list=) # combined list/video URLs are handled by the playlist IE

277

(?(1).+)? # if we found the ID, everything can follow

278

$"""

279

_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'

280

_formats = {

281

'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

282

'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

283

'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},

284

'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},

285

'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},

286

'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

287

'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

288

'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

289

'36': {'ext': '3gp', 'width': 320, 'height': 240, 'acodec': 'aac', 'abr': 32, 'vcodec': 'mp4v'},

290

'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

291

'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

292

'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

293

'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

294

'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

295

'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

296

'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

297

'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

# 3D videos

'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

302

'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

303

'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

304

'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

305

'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},

306

'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

307

'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

308

309

# Apple HTTP Live Streaming

310

'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

311

'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

312

'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

313

'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

314

'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

315

'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

316

'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

317

318

# DASH mp4 video

319

'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

320

'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

321

'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

322

'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

323

'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

324

'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)

325

'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

326

'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

327

'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},

328

'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},

329

'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

330

331

# Dash mp4 audio

332

'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},

333

'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},

334

'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},

335

336

# Dash webm

337

'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

338

'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

339

'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

340

'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

341

'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

342

'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

343

'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},

344

'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

345

'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

346

'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

347

'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

348

'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

349

'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

350

'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

351

'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

352

# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)

353

'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

354

'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

355

'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

356

'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

357

'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

358

'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

359

360

# Dash webm audio

361

'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},

362

'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

363

364

# Dash webm audio with opus inside

365

'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},

366

'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},

367

'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

368

369

# RTMP (unnamed)

370

'_rtmp': {'protocol': 'rtmp'},

371

}

372

_SUBTITLE_FORMATS = ('ttml', 'vtt')

IE_NAME = 'youtube'

_TESTS = [

{

'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

382

'uploader': 'Philipp Hagemeister',

383

'uploader_id': 'phihag',

384

'upload_date': '20121002',

385

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

386

'categories': ['Science & Technology'],

387

'tags': ['youtube-dl'],

388

'like_count': int,

389

'dislike_count': int,

'start_time': 1,

'end_time': 9,

}

},

{

'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',

396

'note': 'Test generic use_cipher_signature video (#897)',

'info_dict': {

'id': 'UxxajLWwzqY',

'ext': 'mp4',

'upload_date': '20120506',

401

'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',

402

'alt_title': 'I Love It (feat. Charli XCX)',

403

'description': 'md5:782e8651347686cba06e58f71ab51773',

404

'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',

405

'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',

406

'iconic ep', 'iconic', 'love', 'it'],

407

'uploader': 'Icona Pop',

408

'uploader_id': 'IconaPop',

409

'creator': 'Icona Pop',

}

},

{

'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',

414

'note': 'Test VEVO video with age protection (#956)',

'info_dict': {

'id': '07FYdnEawAQ',

'ext': 'mp4',

'upload_date': '20130703',

419

'title': 'Justin Timberlake - Tunnel Vision (Explicit)',

420

'alt_title': 'Tunnel Vision',

421

'description': 'md5:64249768eec3bc4276236606ea996373',

422

'uploader': 'justintimberlakeVEVO',

423

'uploader_id': 'justintimberlakeVEVO',

424

'creator': 'Justin Timberlake',

'age_limit': 18,

}

},

{

'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',

430

'note': 'Embed-only video (#1746)',

'info_dict': {

'id': 'yZIXLfi8CZQ',

'ext': 'mp4',

'upload_date': '20120608',

435

'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',

436

'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',

437

'uploader': 'SET India',

438

'uploader_id': 'setindia',

'age_limit': 18,

}

},

{

'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',

444

'note': 'Use the first video ID in the URL',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

449

'uploader': 'Philipp Hagemeister',

450

'uploader_id': 'phihag',

451

'upload_date': '20121002',

452

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

453

'categories': ['Science & Technology'],

454

'tags': ['youtube-dl'],

455

'like_count': int,

456

'dislike_count': int,

457

},

458

'params': {

459

'skip_download': True,

},

},

{

'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',

464

'note': '256k DASH audio (format 141) via DASH manifest',

'info_dict': {

'id': 'a9LDPn-MO4I',

'ext': 'm4a',

'upload_date': '20121002',

469

'uploader_id': '8KVIDEO',

470

'description': '',

471

'uploader': '8KVIDEO',

472

'title': 'UHDTV TEST 8K VIDEO.mp4'

473

},

474

'params': {

475

'youtube_include_dash_manifest': True,

'format': '141',

},

},

# DASH manifest with encrypted signature

480

{

481

'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',

'info_dict': {

'id': 'IB3lcPjvWLA',

'ext': 'm4a',

'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',

486

'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',

487

'uploader': 'AfrojackVEVO',

488

'uploader_id': 'AfrojackVEVO',

489

'upload_date': '20131011',

490

},

491

'params': {

492

'youtube_include_dash_manifest': True,

'format': '141',

},

},

# JS player signature function name containing $

497

{

498

'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',

'info_dict': {

'id': 'nfWlot6h_JM',

'ext': 'm4a',

'title': 'Taylor Swift - Shake It Off',

503

'alt_title': 'Shake It Off',

504

'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',

505

'uploader': 'TaylorSwiftVEVO',

506

'uploader_id': 'TaylorSwiftVEVO',

507

'upload_date': '20140818',

508

'creator': 'Taylor Swift',

509

},

510

'params': {

511

'youtube_include_dash_manifest': True,

'format': '141',

},

},

# Controversy video

{

'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',

'info_dict': {

'id': 'T4XJQO3qol8',

'ext': 'mp4',

'upload_date': '20100909',

522

'uploader': 'The Amazing Atheist',

523

'uploader_id': 'TheAmazingAtheist',

524

'title': 'Burning Everyone\'s Koran',

525

'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',

526

}

527

},

528

# Normal age-gate video (No vevo, embed allowed)

529

{

530

'url': 'http://youtube.com/watch?v=HtVdAasjOgU',

'info_dict': {

'id': 'HtVdAasjOgU',

'ext': 'mp4',

'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',

535

'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',

536

'uploader': 'The Witcher',

537

'uploader_id': 'WitcherGame',

538

'upload_date': '20140605',

'age_limit': 18,

},

},

# Age-gate video with encrypted signature

543

{

544

'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',

'info_dict': {

'id': '6kLq3WMV1nU',

'ext': 'mp4',

'title': 'Dedication To My Ex (Miss That) (Lyric Video)',

549

'description': 'md5:33765bb339e1b47e7e72b5490139bb41',

550

'uploader': 'LloydVEVO',

551

'uploader_id': 'LloydVEVO',

552

'upload_date': '20110629',

'age_limit': 18,

},

},

# video_info is None (https://github.com/rg3/youtube-dl/issues/4421)

557

{

558

'url': '__2ABJjxzNo',

'info_dict': {

'id': '__2ABJjxzNo',

'ext': 'mp4',

'upload_date': '20100430',

563

'uploader_id': 'deadmau5',

564

'creator': 'deadmau5',

565

'description': 'md5:12c56784b8032162bb936a5f76d55360',

566

'uploader': 'deadmau5',

567

'title': 'Deadmau5 - Some Chords (HD)',

568

'alt_title': 'Some Chords',

569

},

570

'expected_warnings': [

571

'DASH manifest missing',

572

]

573

},

574

# Olympics (https://github.com/rg3/youtube-dl/issues/4431)

575

{

576

'url': 'lqQg6PlCWgI',

'info_dict': {

'id': 'lqQg6PlCWgI',

'ext': 'mp4',

'upload_date': '20150827',

581

'uploader_id': 'olympic',

582

'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',

583

'uploader': 'Olympics',

584

'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',

585

},

586

'params': {

587

'skip_download': 'requires avconv',

}

},

# Non-square pixels

{

'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',

'info_dict': {

'id': '_b-2C3KPAM0',

'ext': 'mp4',

'stretched_ratio': 16 / 9.,

597

'upload_date': '20110310',

598

'uploader_id': 'AllenMeow',

599

'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',

600

'uploader': '孫艾倫',

601

'title': '[A-made] 變態妍字幕版太妍我就是這樣的人',

602

},

603

},

604

# url_encoded_fmt_stream_map is empty string

605

{

606

'url': 'qEJwOuvDf7I',

'info_dict': {

'id': 'qEJwOuvDf7I',

'ext': 'webm',

'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',

611

'description': '',

612

'upload_date': '20150404',

613

'uploader_id': 'spbelect',

614

'uploader': 'Наблюдатели Петербурга',

615

},

616

'params': {

617

'skip_download': 'requires avconv',

618

},

619

'skip': 'This live event has ended.',

620

},

621

# Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)

622

{

623

'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',

'info_dict': {

'id': 'FIl7x6_3R5Y',

'ext': 'mp4',

'title': 'md5:7b81415841e02ecd4313668cde88737a',

628

'description': 'md5:116377fd2963b81ec4ce64b542173306',

629

'upload_date': '20150625',

630

'uploader_id': 'dorappi2000',

631

'uploader': 'dorappi2000',

632

'formats': 'mincount:33',

633

},

634

},

635

# DASH manifest with segment_list

636

{

637

'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',

638

'md5': '8ce563a1d667b599d21064e982ab9e31',

'info_dict': {

'id': 'CsmdDsKjzN8',

'ext': 'mp4',

'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510

643

'uploader': 'Airtek',

644

'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',

645

'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',

646

'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',

647

},

648

'params': {

649

'youtube_include_dash_manifest': True,

650

'format': '135', # bestvideo

}

},

{

# Multifeed videos (multiple cameras), URL is for Main Camera

655

'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',

656

'info_dict': {

657

'id': 'jqWvoWXjCVs',

658

'title': 'teamPGP: Rocket League Noob Stream',

659

'description': 'md5:dc7872fb300e143831327f1bae3af010',

},

'playlist': [{

'info_dict': {

'id': 'jqWvoWXjCVs',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',

666

'description': 'md5:dc7872fb300e143831327f1bae3af010',

667

'upload_date': '20150721',

668

'uploader': 'Beer Games Beer',

669

'uploader_id': 'beergamesbeer',

},

}, {

'info_dict': {

'id': '6h8e8xoXJzg',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',

676

'description': 'md5:dc7872fb300e143831327f1bae3af010',

677

'upload_date': '20150721',

678

'uploader': 'Beer Games Beer',

679

'uploader_id': 'beergamesbeer',

},

}, {

'info_dict': {

'id': 'PUOgX5z9xZw',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (grizzle)',

686

'description': 'md5:dc7872fb300e143831327f1bae3af010',

687

'upload_date': '20150721',

688

'uploader': 'Beer Games Beer',

689

'uploader_id': 'beergamesbeer',

},

}, {

'info_dict': {

'id': 'teuwxikvS5k',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (zim)',

696

'description': 'md5:dc7872fb300e143831327f1bae3af010',

697

'upload_date': '20150721',

698

'uploader': 'Beer Games Beer',

699

'uploader_id': 'beergamesbeer',

},

}],

'params': {

'skip_download': True,

},

},

{

'url': 'http://vid.plus/FlRa-iH7PGw',

708

'only_matching': True,

709

},

710

{

711

# Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)

712

# Also tests cut-off URL expansion in video description (see

713

# https://github.com/rg3/youtube-dl/issues/1892,

714

# https://github.com/rg3/youtube-dl/issues/8164)

715

'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',

'info_dict': {

'id': 'lsguqyKfVQg',

'ext': 'mp4',

'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',

720

'alt_title': 'Dark Walk',

721

'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',

722

'upload_date': '20151119',

723

'uploader_id': 'IronSoulElf',

724

'uploader': 'IronSoulElf',

725

'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',

726

},

727

'params': {

728

'skip_download': True,

},

},

{

# Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)

733

'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',

734

'only_matching': True,

735

},

736

{

737

# Video with yt:stretch=17:0

738

'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',

'info_dict': {

'id': 'Q39EVAstoRM',

'ext': 'mp4',

'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',

743

'description': 'md5:ee18a25c350637c8faff806845bddee9',

744

'upload_date': '20151107',

745

'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',

746

'uploader': 'CH GAMER DROID',

747

},

748

'params': {

749

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',

754

'only_matching': True,

}

]

def __init__(self, *args, **kwargs):

759

super(YoutubeIE, self).__init__(*args, **kwargs)

760

self._player_cache = {}

761

762

def report_video_info_webpage_download(self, video_id):

763

"""Report attempt to download video info webpage."""

764

self.to_screen('%s: Downloading video info webpage' % video_id)

765

766

def report_information_extraction(self, video_id):

767

"""Report attempt to extract video information."""

768

self.to_screen('%s: Extracting video information' % video_id)

769

770

def report_unavailable_format(self, video_id, format):

771

"""Report extracted video URL."""

772

self.to_screen('%s: Format %s not available' % (video_id, format))

773

774

def report_rtmp_download(self):

775

"""Indicate the download will use the RTMP protocol."""

776

self.to_screen('RTMP download detected')

777

778

def _signature_cache_id(self, example_sig):

779

""" Return a string representation of a signature """

780

return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))

781

782

def _extract_signature_function(self, video_id, player_url, example_sig):

783

id_m = re.match(

784

r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',

785

player_url)

786

if not id_m:

787

raise ExtractorError('Cannot identify player %r' % player_url)

788

player_type = id_m.group('ext')

789

player_id = id_m.group('id')

790

791

# Read from filesystem cache

792

func_id = '%s_%s_%s' % (

793

player_type, player_id, self._signature_cache_id(example_sig))

794

assert os.path.basename(func_id) == func_id

795

796

cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)

797

if cache_spec is not None:

798

return lambda s: ''.join(s[i] for i in cache_spec)

799

800

download_note = (

801

'Downloading player %s' % player_url

802

if self._downloader.params.get('verbose') else

803

'Downloading %s player %s' % (player_type, player_id)

804

)

805

if player_type == 'js':

806

code = self._download_webpage(

807

player_url, video_id,

808

note=download_note,

809

errnote='Download of %s failed' % player_url)

810

res = self._parse_sig_js(code)

811

elif player_type == 'swf':

812

urlh = self._request_webpage(

813

player_url, video_id,

814

note=download_note,

815

errnote='Download of %s failed' % player_url)

816

code = urlh.read()

817

res = self._parse_sig_swf(code)

818

else:

819

assert False, 'Invalid player type %r' % player_type

820

821

test_string = ''.join(map(compat_chr, range(len(example_sig))))

822

cache_res = res(test_string)

823

cache_spec = [ord(c) for c in cache_res]

824

825

self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)

826

return res

827

828

def _print_sig_code(self, func, example_sig):

829

def gen_sig_code(idxs):

830

def _genslice(start, end, step):

831

starts = '' if start == 0 else str(start)

832

ends = (':%d' % (end + step)) if end + step >= 0 else ':'

833

steps = '' if step == 1 else (':%d' % step)

834

return 's[%s%s%s]' % (starts, ends, steps)

835

836

step = None

837

# Quelch pyflakes warnings - start will be set when step is set

838

start = '(Never used)'

839

for i, prev in zip(idxs[1:], idxs[:-1]):

if step is not None:

if i - prev == step:

continue

yield _genslice(start, prev, step)

844

step = None

845

continue

846

if i - prev in [-1, 1]:

step = i - prev

start = prev

continue

else:

yield 's[%d]' % prev

if step is None:

yield 's[%d]' % i

else:

yield _genslice(start, i, step)

856

857

test_string = ''.join(map(compat_chr, range(len(example_sig))))

858

cache_res = func(test_string)

859

cache_spec = [ord(c) for c in cache_res]

860

expr_code = ' + '.join(gen_sig_code(cache_spec))

861

signature_id_tuple = '(%s)' % (

862

', '.join(compat_str(len(p)) for p in example_sig.split('.')))

863

code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'

864

' return %s\n') % (signature_id_tuple, expr_code)

865

self.to_screen('Extracted signature function:\n' + code)

866

867

def _parse_sig_js(self, jscode):

868

funcname = self._search_regex(

869

r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,

870

'Initial JS player signature function name')

871

872

jsi = JSInterpreter(jscode)

873

initial_function = jsi.extract_function(funcname)

874

return lambda s: initial_function([s])

875

876

def _parse_sig_swf(self, file_contents):

877

swfi = SWFInterpreter(file_contents)

878

TARGET_CLASSNAME = 'SignatureDecipher'

879

searched_class = swfi.extract_class(TARGET_CLASSNAME)

880

initial_function = swfi.extract_function(searched_class, 'decipher')

881

return lambda s: initial_function([s])

882

883

def _decrypt_signature(self, s, video_id, player_url, age_gate=False):

884

"""Turn the encrypted s field into a working signature"""

885

886

if player_url is None:

887

raise ExtractorError('Cannot decrypt signature without player_url')

888

889

if player_url.startswith('//'):

890

player_url = 'https:' + player_url

891

try:

892

player_id = (player_url, self._signature_cache_id(s))

893

if player_id not in self._player_cache:

894

func = self._extract_signature_function(

895

video_id, player_url, s

896

)

897

self._player_cache[player_id] = func

898

func = self._player_cache[player_id]

899

if self._downloader.params.get('youtube_print_sig_code'):

900

self._print_sig_code(func, s)

901

return func(s)

902

except Exception as e:

903

tb = traceback.format_exc()

904

raise ExtractorError(

905

'Signature extraction failed: ' + tb, cause=e)

906

907

def _get_subtitles(self, video_id, webpage):

908

try:

909

subs_doc = self._download_xml(

910

'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,

911

video_id, note=False)

912

except ExtractorError as err:

913

self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))

return {}

sub_lang_list = {}

for track in subs_doc.findall('track'):

918

lang = track.attrib['lang_code']

919

if lang in sub_lang_list:

920

continue

921

sub_formats = []

922

for ext in self._SUBTITLE_FORMATS:

923

params = compat_urllib_parse.urlencode({

'lang': lang,

'v': video_id,

'fmt': ext,

'name': track.attrib['name'].encode('utf-8'),

928

})

929

sub_formats.append({

930

'url': 'https://www.youtube.com/api/timedtext?' + params,

931

'ext': ext,

932

})

933

sub_lang_list[lang] = sub_formats

934

if not sub_lang_list:

935

self._downloader.report_warning('video doesn\'t have subtitles')

return {}

return sub_lang_list

def _get_ytplayer_config(self, video_id, webpage):

940

patterns = (

941

# User data may contain arbitrary character sequences that may affect

942

# JSON extraction with regex, e.g. when '};' is contained the second

943

# regex won't capture the whole JSON. Yet working around by trying more

944

# concrete regex first keeping in mind proper quoted string handling

945

# to be implemented in future that will replace this workaround (see

946

# https://github.com/rg3/youtube-dl/issues/7468,

947

# https://github.com/rg3/youtube-dl/pull/7599)

948

r';ytplayer\.config\s*=\s*({.+?});ytplayer',

949

r';ytplayer\.config\s*=\s*({.+?});',

950

)

951

config = self._search_regex(

952

patterns, webpage, 'ytplayer.config', default=None)

953

if config:

954

return self._parse_json(

955

uppercase_escape(config), video_id, fatal=False)

956

957

def _get_automatic_captions(self, video_id, webpage):

958

"""We need the webpage for getting the captions url, pass it as an

959

argument to speed up the process."""

960

self.to_screen('%s: Looking for automatic captions' % video_id)

961

player_config = self._get_ytplayer_config(video_id, webpage)

962

err_msg = 'Couldn\'t find automatic captions for %s' % video_id

963

if not player_config:

964

self._downloader.report_warning(err_msg)

965

return {}

966

try:

967

args = player_config['args']

968

caption_url = args['ttsurl']

969

if not caption_url:

970

self._downloader.report_warning(err_msg)

971

return {}

972

timestamp = args['timestamp']

973

# We get the available subtitles

974

list_params = compat_urllib_parse.urlencode({

'type': 'list',

'tlangs': 1,

'asrs': 1,

})

list_url = caption_url + '&' + list_params

980

caption_list = self._download_xml(list_url, video_id)

981

original_lang_node = caption_list.find('track')

982

if original_lang_node is None:

983

self._downloader.report_warning('Video doesn\'t have automatic captions')

984

return {}

985

original_lang = original_lang_node.attrib['lang_code']

986

caption_kind = original_lang_node.attrib.get('kind', '')

987

988

sub_lang_list = {}

989

for lang_node in caption_list.findall('target'):

990

sub_lang = lang_node.attrib['lang_code']

991

sub_formats = []

992

for ext in self._SUBTITLE_FORMATS:

993

params = compat_urllib_parse.urlencode({

994

'lang': original_lang,

'tlang': sub_lang,

'fmt': ext,

'ts': timestamp,

'kind': caption_kind,

999

})

1000

sub_formats.append({

1001

'url': caption_url + '&' + params,

1002

'ext': ext,

1003

})

1004

sub_lang_list[sub_lang] = sub_formats

1005

return sub_lang_list

1006

# An extractor error can be raise by the download process if there are

1007

# no automatic captions but there are subtitles

1008

except (KeyError, ExtractorError):

1009

self._downloader.report_warning(err_msg)

return {}

@classmethod

def extract_id(cls, url):

1014

mobj = re.match(cls._VALID_URL, url, re.VERBOSE)

1015

if mobj is None:

1016

raise ExtractorError('Invalid URL: %s' % url)

1017

video_id = mobj.group(2)

1018

return video_id

1019

1020

def _extract_from_m3u8(self, manifest_url, video_id):

1021

url_map = {}

1022

1023

def _get_urls(_manifest):

1024

lines = _manifest.split('\n')

1025

urls = filter(lambda l: l and not l.startswith('#'),

1026

lines)

1027

return urls

1028

manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')

1029

formats_urls = _get_urls(manifest)

1030

for format_url in formats_urls:

1031

itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')

1032

url_map[itag] = format_url

1033

return url_map

1034

1035

def _extract_annotations(self, video_id):

1036

url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id

1037

return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')

1038

1039

def _real_extract(self, url):

1040

url, smuggled_data = unsmuggle_url(url, {})

1041

1042

proto = (

1043

'http' if self._downloader.params.get('prefer_insecure', False)

else 'https')

start_time = None

end_time = None

parsed_url = compat_urllib_parse_urlparse(url)

1049

for component in [parsed_url.fragment, parsed_url.query]:

1050

query = compat_parse_qs(component)

1051

if start_time is None and 't' in query:

1052

start_time = parse_duration(query['t'][0])

1053

if start_time is None and 'start' in query:

1054

start_time = parse_duration(query['start'][0])

1055

if end_time is None and 'end' in query:

1056

end_time = parse_duration(query['end'][0])

1057

1058

# Extract original video URL from URL with redirection, like age verification, using next_url parameter

1059

mobj = re.search(self._NEXT_URL_RE, url)

1060

if mobj:

1061

url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')

1062

video_id = self.extract_id(url)

1063

1064

# Get video webpage

1065

url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id

1066

video_webpage = self._download_webpage(url, video_id)

1067

1068

# Attempt to extract SWF player URL

1069

mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)

1070

if mobj is not None:

1071

player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

else:

player_url = None

dash_mpds = []

def add_dash_mpd(video_info):

1078

dash_mpd = video_info.get('dashmpd')

1079

if dash_mpd and dash_mpd[0] not in dash_mpds:

1080

dash_mpds.append(dash_mpd[0])

# Get video info

embed_webpage = None

is_live = None

if re.search(r'player-age-gate-content">', video_webpage) is not None:

1086

age_gate = True

1087

# We simulate the access to the video from www.youtube.com/v/{video_id}

1088

# this can be viewed without login into Youtube

1089

url = proto + '://www.youtube.com/embed/%s' % video_id

1090

embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')

1091

data = compat_urllib_parse.urlencode({

1092

'video_id': video_id,

1093

'eurl': 'https://youtube.googleapis.com/v/' + video_id,

1094

'sts': self._search_regex(

1095

r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),

1096

})

1097

video_info_url = proto + '://www.youtube.com/get_video_info?' + data

1098

video_info_webpage = self._download_webpage(

1099

video_info_url, video_id,

1100

note='Refetching age-gated info webpage',

1101

errnote='unable to download video info webpage')

1102

video_info = compat_parse_qs(video_info_webpage)

1103

add_dash_mpd(video_info)

else:

age_gate = False

video_info = None

# Try looking directly into the video webpage

1108

ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)

1109

if ytplayer_config:

1110

args = ytplayer_config['args']

1111

if args.get('url_encoded_fmt_stream_map'):

1112

# Convert to the same format returned by compat_parse_qs

1113

video_info = dict((k, [v]) for k, v in args.items())

1114

add_dash_mpd(video_info)

1115

if args.get('livestream') == '1' or args.get('live_playback') == 1:

1116

is_live = True

1117

if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):

1118

# We also try looking in get_video_info since it may contain different dashmpd

1119

# URL that points to a DASH manifest with possibly different itag set (some itags

1120

# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH

1121

# manifest pointed by get_video_info's dashmpd).

1122

# The general idea is to take a union of itags of both DASH manifests (for example

1123

# video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)

1124

self.report_video_info_webpage_download(video_id)

1125

for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:

1126

video_info_url = (

1127

'%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'

1128

% (proto, video_id, el_type))

1129

video_info_webpage = self._download_webpage(

1130

video_info_url,

1131

video_id, note=False,

1132

errnote='unable to download video info webpage')

1133

get_video_info = compat_parse_qs(video_info_webpage)

1134

if get_video_info.get('use_cipher_signature') != ['True']:

1135

add_dash_mpd(get_video_info)

1136

if not video_info:

1137

video_info = get_video_info

1138

if 'token' in get_video_info:

1139

# Different get_video_info requests may report different results, e.g.

1140

# some may report video unavailability, but some may serve it without

1141

# any complaint (see https://github.com/rg3/youtube-dl/issues/7362,

1142

# the original webpage as well as el=info and el=embedded get_video_info

1143

# requests report video unavailability due to geo restriction while

1144

# el=detailpage succeeds and returns valid data). This is probably

1145

# due to YouTube measures against IP ranges of hosting providers.

1146

# Working around by preferring the first succeeded video_info containing

1147

# the token if no such video_info yet was found.

1148

if 'token' not in video_info:

1149

video_info = get_video_info

1150

break

1151

if 'token' not in video_info:

1152

if 'reason' in video_info:

1153

if 'The uploader has not made this video available in your country.' in video_info['reason']:

1154

regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)

1155

if regions_allowed:

1156

raise ExtractorError('YouTube said: This video is available in %s only' % (

1157

', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),

1158

expected=True)

1159

raise ExtractorError(

1160

'YouTube said: %s' % video_info['reason'][0],

1161

expected=True, video_id=video_id)

1162

else:

1163

raise ExtractorError(

1164

'"token" parameter not in video info for unknown reason',

video_id=video_id)

# title

if 'title' in video_info:

1169

video_title = video_info['title'][0]

1170

else:

1171

self._downloader.report_warning('Unable to extract video title')

video_title = '_'

# description

video_description = get_element_by_id("eow-description", video_webpage)

1176

if video_description:

1177

video_description = re.sub(r'''(?x)

1178

<a\s+

1179

(?:[a-zA-Z-]+="[^"]+"\s+)*?

1180

(?:title|href)="([^"]+)"\s+

1181

(?:[a-zA-Z-]+="[^"]+"\s+)*?

1182

class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>

1183

[^<]+\.{3}\s*

1184

</a>

1185

''', r'\1', video_description)

1186

video_description = clean_html(video_description)

1187

else:

1188

fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)

1189

if fd_mobj:

1190

video_description = unescapeHTML(fd_mobj.group(1))

1191

else:

1192

video_description = ''

1193

1194

if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):

1195

if not self._downloader.params.get('noplaylist'):

1196

entries = []

1197

feed_ids = []

1198

multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])

1199

for feed in multifeed_metadata_list.split(','):

1200

feed_data = compat_parse_qs(feed)

1201

entries.append({

1202

'_type': 'url_transparent',

1203

'ie_key': 'Youtube',

1204

'url': smuggle_url(

1205

'%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),

1206

{'force_singlefeed': True}),

1207

'title': '%s (%s)' % (video_title, feed_data['title'][0]),

1208

})

1209

feed_ids.append(feed_data['id'][0])

1210

self.to_screen(

1211

'Downloading multifeed video (%s) - add --no-playlist to just download video %s'

1212

% (', '.join(feed_ids), video_id))

1213

return self.playlist_result(entries, video_id, video_title, video_description)

1214

self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

1215

1216

if 'view_count' in video_info:

1217

view_count = int(video_info['view_count'][0])

else:

view_count = None

# Check for "rental" videos

1222

if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:

1223

raise ExtractorError('"rental" videos not supported')

1224

1225

# Start extracting information

1226

self.report_information_extraction(video_id)

1227

1228

# uploader

1229

if 'author' not in video_info:

1230

raise ExtractorError('Unable to extract uploader name')

1231

video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

1232

1233

# uploader_id

1234

video_uploader_id = None

1235

mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)

1236

if mobj is not None:

1237

video_uploader_id = mobj.group(1)

1238

else:

1239

self._downloader.report_warning('unable to extract uploader nickname')

1240

1241

# thumbnail image

1242

# We try first to get a high quality image:

1243

m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',

1244

video_webpage, re.DOTALL)

1245

if m_thumb is not None:

1246

video_thumbnail = m_thumb.group(1)

1247

elif 'thumbnail_url' not in video_info:

1248

self._downloader.report_warning('unable to extract video thumbnail')

1249

video_thumbnail = None

1250

else: # don't panic if we can't find it

1251

video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

1252

1253

# upload date

1254

upload_date = self._html_search_meta(

1255

'datePublished', video_webpage, 'upload date', default=None)

1256

if not upload_date:

1257

upload_date = self._search_regex(

1258

[r'(?s)id="eow-date.*?>(.*?)</span>',

1259

r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],

1260

video_webpage, 'upload date', default=None)

1261

if upload_date:

1262

upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())

1263

upload_date = unified_strdate(upload_date)

1264

1265

m_music = re.search(

1266

r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:$.+?$)?</li',

1267

video_webpage)

1268

if m_music:

1269

video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))

1270

video_creator = clean_html(m_music.group('creator'))

1271

else:

1272

video_alt_title = video_creator = None

1273

1274

m_cat_container = self._search_regex(

1275

r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',

1276

video_webpage, 'categories', default=None)

1277

if m_cat_container:

1278

category = self._html_search_regex(

1279

r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',

1280

default=None)

1281

video_categories = None if category is None else [category]

1282

else:

1283

video_categories = None

1284

1285

video_tags = [

1286

unescapeHTML(m.group('content'))

1287

for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

1288

1289

def _extract_count(count_name):

1290

return str_to_int(self._search_regex(

1291

r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'

1292

% re.escape(count_name),

1293

video_webpage, count_name, default=None))

1294

1295

like_count = _extract_count('like')

1296

dislike_count = _extract_count('dislike')

1297

1298

# subtitles

1299

video_subtitles = self.extract_subtitles(video_id, video_webpage)

1300

automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

1301

1302

if 'length_seconds' not in video_info:

1303

self._downloader.report_warning('unable to extract video duration')

1304

video_duration = None

1305

else:

1306

video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

1307

1308

# annotations

1309

video_annotations = None

1310

if self._downloader.params.get('writeannotations', False):

1311

video_annotations = self._extract_annotations(video_id)

1312

1313

def _map_to_format_list(urlmap):

1314

formats = []

1315

for itag, video_real_url in urlmap.items():

1316

dct = {

1317

'format_id': itag,

1318

'url': video_real_url,

1319

'player_url': player_url,

1320

}

1321

if itag in self._formats:

1322

dct.update(self._formats[itag])

formats.append(dct)

return formats

if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):

1327

self.report_rtmp_download()

1328

formats = [{

1329

'format_id': '_rtmp',

1330

'protocol': 'rtmp',

1331

'url': video_info['conn'][0],

1332

'player_url': player_url,

1333

}]

1334

elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:

1335

encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]

1336

if 'rtmpe%3Dyes' in encoded_url_map:

1337

raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)

1338

formats = []

1339

for url_data_str in encoded_url_map.split(','):

1340

url_data = compat_parse_qs(url_data_str)

1341

if 'itag' not in url_data or 'url' not in url_data:

1342

continue

1343

format_id = url_data['itag'][0]

1344

url = url_data['url'][0]

1345

1346

if 'sig' in url_data:

1347

url += '&signature=' + url_data['sig'][0]

1348

elif 's' in url_data:

1349

encrypted_sig = url_data['s'][0]

1350

ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

1351

1352

jsplayer_url_json = self._search_regex(

1353

ASSETS_RE,

1354

embed_webpage if age_gate else video_webpage,

1355

'JS player URL (1)', default=None)

1356

if not jsplayer_url_json and not age_gate:

1357

# We need the embed website after all

1358

if embed_webpage is None:

1359

embed_url = proto + '://www.youtube.com/embed/%s' % video_id

1360

embed_webpage = self._download_webpage(

1361

embed_url, video_id, 'Downloading embed webpage')

1362

jsplayer_url_json = self._search_regex(

1363

ASSETS_RE, embed_webpage, 'JS player URL')

1364

1365

player_url = json.loads(jsplayer_url_json)

1366

if player_url is None:

1367

player_url_json = self._search_regex(

1368

r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',

1369

video_webpage, 'age gate player URL')

1370

player_url = json.loads(player_url_json)

1371

1372

if self._downloader.params.get('verbose'):

1373

if player_url is None:

1374

player_version = 'unknown'

1375

player_desc = 'unknown'

1376

else:

1377

if player_url.endswith('swf'):

1378

player_version = self._search_regex(

1379

r'-(.+?)(?:/watch_as3)?\.swf$', player_url,

1380

'flash player', fatal=False)

1381

player_desc = 'flash player %s' % player_version

1382

else:

1383

player_version = self._search_regex(

1384

[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],

1385

player_url,

1386

'html5 player', fatal=False)

1387

player_desc = 'html5 player %s' % player_version

1388

1389

parts_sizes = self._signature_cache_id(encrypted_sig)

1390

self.to_screen('{%s} signature length %s, %s' %

1391

(format_id, parts_sizes, player_desc))

1392

1393

signature = self._decrypt_signature(

1394

encrypted_sig, video_id, player_url, age_gate)

1395

url += '&signature=' + signature

1396

if 'ratebypass' not in url:

1397

url += '&ratebypass=yes'

1398

1399

dct = {

1400

'format_id': format_id,

1401

'url': url,

1402

'player_url': player_url,

1403

}

1404

if format_id in self._formats:

1405

dct.update(self._formats[format_id])

1406

1407

# Some itags are not included in DASH manifest thus corresponding formats will

1408

# lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).

1409

# Trying to extract metadata from url_encoded_fmt_stream_map entry.

1410

mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])

1411

width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

1412

1413

more_fields = {

1414

'filesize': int_or_none(url_data.get('clen', [None])[0]),

1415

'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),

1416

'width': width,

1417

'height': height,

1418

'fps': int_or_none(url_data.get('fps', [None])[0]),

1419

'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],

1420

}

1421

for key, value in more_fields.items():

1422

if value:

1423

dct[key] = value

1424

type_ = url_data.get('type', [None])[0]

1425

if type_:

1426

type_split = type_.split(';')

1427

kind_ext = type_split[0].split('/')

1428

if len(kind_ext) == 2:

1429

kind, _ = kind_ext

1430

dct['ext'] = mimetype2ext(type_split[0])

1431

if kind in ('audio', 'video'):

1432

codecs = None

1433

for mobj in re.finditer(

1434

r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):

1435

if mobj.group('key') == 'codecs':

1436

codecs = mobj.group('val')

1437

break

1438

if codecs:

1439

codecs = codecs.split(',')

1440

if len(codecs) == 2:

1441

acodec, vcodec = codecs[1], codecs[0]

1442

else:

1443

acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])

dct.update({

'acodec': acodec,

'vcodec': vcodec,

})

formats.append(dct)

elif video_info.get('hlsvp'):

1450

manifest_url = video_info['hlsvp'][0]

1451

url_map = self._extract_from_m3u8(manifest_url, video_id)

1452

formats = _map_to_format_list(url_map)

1453

# Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming

1454

for a_format in formats:

1455

a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'

1456

else:

1457

unavailable_message = self._html_search_regex(

1458

r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',

1459

video_webpage, 'unavailable message', default=None)

1460

if unavailable_message:

1461

raise ExtractorError(unavailable_message, expected=True)

1462

raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

1463

1464

# Look for the DASH manifest

1465

if self._downloader.params.get('youtube_include_dash_manifest', True):

1466

dash_mpd_fatal = True

1467

for mpd_url in dash_mpds:

1468

dash_formats = {}

1469

try:

1470

def decrypt_sig(mobj):

1471

s = mobj.group(1)

1472

dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)

1473

return '/signature/%s' % dec_s

1474

1475

mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

1476

1477

for df in self._extract_mpd_formats(

1478

mpd_url, video_id, fatal=dash_mpd_fatal,

1479

formats_dict=self._formats):

1480

# Do not overwrite DASH format found in some previous DASH manifest

1481

if df['format_id'] not in dash_formats:

1482

dash_formats[df['format_id']] = df

1483

# Additional DASH manifests may end up in HTTP Error 403 therefore

1484

# allow them to fail without bug report message if we already have

1485

# some DASH manifest succeeded. This is temporary workaround to reduce

1486

# burst of bug reports until we figure out the reason and whether it

1487

# can be fixed at all.

1488

dash_mpd_fatal = False

1489

except (ExtractorError, KeyError) as e:

1490

self.report_warning(

1491

'Skipping DASH manifest: %r' % e, video_id)

1492

if dash_formats:

1493

# Remove the formats we found through non-DASH, they

1494

# contain less info and it can be wrong, because we use

1495

# fixed values (for example the resolution). See

1496

# https://github.com/rg3/youtube-dl/issues/5774 for an

1497

# example.

1498

formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]

1499

formats.extend(dash_formats.values())

1500

1501

# Check for malformed aspect ratio

1502

stretched_m = re.search(

1503

r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',

1504

video_webpage)

1505

if stretched_m:

1506

w = float(stretched_m.group('w'))

1507

h = float(stretched_m.group('h'))

1508

# yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).

1509

# We will only process correct ratios.

if w > 0 and h > 0:

ratio = w / h

for f in formats:

if f.get('vcodec') != 'none':

1514

f['stretched_ratio'] = ratio

1515

1516

self._sort_formats(formats)

return {

'id': video_id,

'uploader': video_uploader,

1521

'uploader_id': video_uploader_id,

1522

'upload_date': upload_date,

1523

'creator': video_creator,

1524

'title': video_title,

1525

'alt_title': video_alt_title,

1526

'thumbnail': video_thumbnail,

1527

'description': video_description,

1528

'categories': video_categories,

1529

'tags': video_tags,

1530

'subtitles': video_subtitles,

1531

'automatic_captions': automatic_captions,

1532

'duration': video_duration,

1533

'age_limit': 18 if age_gate else 0,

1534

'annotations': video_annotations,

1535

'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,

1536

'view_count': view_count,

1537

'like_count': like_count,

1538

'dislike_count': dislike_count,

1539

'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),

1540

'formats': formats,

1541

'is_live': is_live,

1542

'start_time': start_time,

1543

'end_time': end_time,

}

class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):

1548

IE_DESC = 'YouTube.com playlists'

1549

_VALID_URL = r"""(?x)(?:

(?:https?://)?

(?:\w+\.)?

youtube\.com/

(?:

\? (?:.*?[&;])*? (?:p|a|list)=

| p/

)

(

(?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}

1560

# Top tracks, they can also include dots

|(?:MC)[\w\.]*

)

.*

|

((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})

1566

)"""

1567

_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'

1568

_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'

1569

IE_NAME = 'youtube:playlist'

1570

_TESTS = [{

1571

'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',

1572

'info_dict': {

1573

'title': 'ytdl test PL',

1574

'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',

},

'playlist_count': 3,

}, {

'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',

1579

'info_dict': {

1580

'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',

1581

'title': 'YDL_Empty_List',

},

'playlist_count': 0,

}, {

'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',

1586

'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

1587

'info_dict': {

1588

'title': '29C3: Not my department',

1589

'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

1590

},

1591

'playlist_count': 95,

1592

}, {

1593

'note': 'issue #673',

1594

'url': 'PLBB231211A4F62143',

1595

'info_dict': {

1596

'title': '[OLD]Team Fortress 2 (Class-based LP)',

1597

'id': 'PLBB231211A4F62143',

1598

},

1599

'playlist_mincount': 26,

1600

}, {

1601

'note': 'Large playlist',

1602

'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',

1603

'info_dict': {

1604

'title': 'Uploads from Cauchemar',

1605

'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',

1606

},

1607

'playlist_mincount': 799,

1608

}, {

1609

'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

1610

'info_dict': {

1611

'title': 'YDL_safe_search',

1612

'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

},

'playlist_count': 2,

}, {

'note': 'embedded',

'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

'playlist_count': 4,

'info_dict': {

'title': 'JODA15',

'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

1622

}

1623

}, {

1624

'note': 'Embedded SWF player',

1625

'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',

'playlist_count': 4,

'info_dict': {

'title': 'JODA7',

'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',

1630

}

1631

}, {

1632

'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',

1633

'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',

1634

'info_dict': {

1635

'title': 'Uploads from Interstellar Movie',

1636

'id': 'UUXw-G3eDE9trcvY2sBMM_aA',

1637

},

1638

'playlist_mincout': 21,

1639

}]

1640

1641

def _real_initialize(self):

1642

self._login()

1643

1644

def _extract_mix(self, playlist_id):

1645

# The mixes are generated from a single video

1646

# the id of the playlist is just 'RD' + video_id

1647

url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)

1648

webpage = self._download_webpage(

1649

url, playlist_id, 'Downloading Youtube mix')

1650

search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)

1651

title_span = (

1652

search_title('playlist-title') or

1653

search_title('title long-title') or

1654

search_title('title'))

1655

title = clean_html(title_span)

1656

ids = orderedSet(re.findall(

1657

r'''(?xs)data-video-username=".*?".*?

1658

href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),

1659

webpage))

1660

url_results = self._ids_to_results(ids)

1661

1662

return self.playlist_result(url_results, playlist_id, title)

1663

1664

def _extract_playlist(self, playlist_id):

1665

url = self._TEMPLATE_URL % playlist_id

1666

page = self._download_webpage(url, playlist_id)

1667

1668

for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):

1669

match = match.strip()

1670

# Check if the playlist exists or is private

1671

if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):

1672

raise ExtractorError(

1673

'The playlist doesn\'t exist or is private, use --username or '

1674

'--netrc to access it.',

1675

expected=True)

1676

elif re.match(r'[^<]*Invalid parameters[^<]*', match):

1677

raise ExtractorError(

1678

'Invalid parameters. Maybe URL is incorrect.',

1679

expected=True)

1680

elif re.match(r'[^<]*Choose your language[^<]*', match):

1681

continue

1682

else:

1683

self.report_warning('Youtube gives an alert message: ' + match)

1684

1685

playlist_title = self._html_search_regex(

1686

r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',

1687

page, 'title')

1688

1689

return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)

1690

1691

def _real_extract(self, url):

1692

# Extract playlist id

1693

mobj = re.match(self._VALID_URL, url)

1694

if mobj is None:

1695

raise ExtractorError('Invalid URL: %s' % url)

1696

playlist_id = mobj.group(1) or mobj.group(2)

1697

1698

# Check if it's a video-specific URL

1699

query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)

1700

if 'v' in query_dict:

1701

video_id = query_dict['v'][0]

1702

if self._downloader.params.get('noplaylist'):

1703

self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

1704

return self.url_result(video_id, 'Youtube', video_id=video_id)

1705

else:

1706

self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

1707

1708

if playlist_id.startswith('RD') or playlist_id.startswith('UL'):

1709

# Mixes require a custom extraction process

1710

return self._extract_mix(playlist_id)

1711

1712

return self._extract_playlist(playlist_id)

1713

1714

1715

class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):

1716

IE_DESC = 'YouTube.com channels'

1717

_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'

1718

_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'

1719

_VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'

1720

IE_NAME = 'youtube:channel'

1721

_TESTS = [{

1722

'note': 'paginated channel',

1723

'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',

1724

'playlist_mincount': 91,

1725

'info_dict': {

1726

'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',

1727

'title': 'Uploads from lex will',

1728

}

1729

}, {

1730

'note': 'Age restricted channel',

1731

# from https://www.youtube.com/user/DeusExOfficial

1732

'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',

1733

'playlist_mincount': 64,

1734

'info_dict': {

1735

'id': 'UUs0ifCMCm1icqRbqhUINa0w',

1736

'title': 'Uploads from Deus Ex',

},

}]

@classmethod

def suitable(cls, url):

1742

return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url)

1743

1744

def _real_extract(self, url):

1745

channel_id = self._match_id(url)

1746

1747

url = self._TEMPLATE_URL % channel_id

1748

1749

# Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)

1750

# Workaround by extracting as a playlist if managed to obtain channel playlist URL

1751

# otherwise fallback on channel by page extraction

1752

channel_page = self._download_webpage(

1753

url + '?view=57', channel_id,

1754

'Downloading channel page', fatal=False)

1755

if channel_page is False:

1756

channel_playlist_id = False

1757

else:

1758

channel_playlist_id = self._html_search_meta(

1759

'channelId', channel_page, 'channel id', default=None)

1760

if not channel_playlist_id:

1761

channel_playlist_id = self._search_regex(

1762

r'data-(?:channel-external-|yt)id="([^"]+)"',

1763

channel_page, 'channel id', default=None)

1764

if channel_playlist_id and channel_playlist_id.startswith('UC'):

1765

playlist_id = 'UU' + channel_playlist_id[2:]

1766

return self.url_result(

1767

compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

1768

1769

channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')

1770

autogenerated = re.search(r'''(?x)

1771

class="[^"]*?(?:

1772

channel-header-autogenerated-label|

1773

yt-channel-title-autogenerated

1774

)[^"]*"''', channel_page) is not None

1775

1776

if autogenerated:

1777

# The videos are contained in a single page

1778

# the ajax pages can't be used, they are empty

1779

entries = [

1780

self.url_result(

1781

video_id, 'Youtube', video_id=video_id,

1782

video_title=video_title)

1783

for video_id, video_title in self.extract_videos_from_page(channel_page)]

1784

return self.playlist_result(entries, channel_id)

1785

1786

return self.playlist_result(self._entries(channel_page, channel_id), channel_id)

1787

1788

1789

class YoutubeUserIE(YoutubeChannelIE):

1790

IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'

1791

_VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'

1792

_TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

1793

IE_NAME = 'youtube:user'

1794

1795

_TESTS = [{

1796

'url': 'https://www.youtube.com/user/TheLinuxFoundation',

1797

'playlist_mincount': 320,

1798

'info_dict': {

1799

'title': 'TheLinuxFoundation',

1800

}

1801

}, {

1802

'url': 'ytuser:phihag',

1803

'only_matching': True,

}]

@classmethod

def suitable(cls, url):

1808

# Don't return True if the url can be extracted with other youtube

1809

# extractor, the regex would is too permissive and it would match.

1810

other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)

1811

if any(ie.suitable(url) for ie in other_ies):

1812

return False

1813

else:

1814

return super(YoutubeUserIE, cls).suitable(url)

1815

1816

1817

class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):

1818

IE_DESC = 'YouTube.com user/channel playlists'

1819

_VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

1820

IE_NAME = 'youtube:playlists'

1821

1822

_TESTS = [{

1823

'url': 'http://www.youtube.com/user/ThirstForScience/playlists',

1824

'playlist_mincount': 4,

1825

'info_dict': {

1826

'id': 'ThirstForScience',

1827

'title': 'Thirst for Science',

1828

},

1829

}, {

1830

# with "Load more" button

1831

'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',

1832

'playlist_mincount': 70,

1833

'info_dict': {

1834

'id': 'igorkle1',

1835

'title': 'Игорь Клейнер',

1836

},

1837

}, {

1838

'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',

1839

'playlist_mincount': 17,

1840

'info_dict': {

1841

'id': 'UCiU1dHvZObB2iP6xkJ__Icw',

1842

'title': 'Chem Player',

},

}]

class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):

1848

IE_DESC = 'YouTube.com searches'

1849

# there doesn't appear to be a real limit, for example if you search for

1850

# 'python' you get more than 8.000.000 results

1851

_MAX_RESULTS = float('inf')

1852

IE_NAME = 'youtube:search'

1853

_SEARCH_KEY = 'ytsearch'

1854

_EXTRA_QUERY_ARGS = {}

1855

_TESTS = []

1856

1857

def _get_n_results(self, query, n):

1858

"""Get a specified number of results for a query"""

videos = []

limit = n

for pagenum in itertools.count(1):

1864

url_query = {

1865

'search_query': query.encode('utf-8'),

'page': pagenum,

'spf': 'navigate',

}

url_query.update(self._EXTRA_QUERY_ARGS)

1870

result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)

1871

data = self._download_json(

1872

result_url, video_id='query "%s"' % query,

1873

note='Downloading page %s' % pagenum,

1874

errnote='Unable to download API page')

1875

html_content = data[1]['body']['content']

1876

1877

if 'class="search-message' in html_content:

1878

raise ExtractorError(

1879

'[youtube] No video results', expected=True)

1880

1881

new_videos = self._ids_to_results(orderedSet(re.findall(

1882

r'href="/watch\?v=(.{11})', html_content)))

1883

videos += new_videos

1884

if not new_videos or len(videos) > limit:

break

if len(videos) > n:

videos = videos[:n]

return self.playlist_result(videos, query)

1890

1891

1892

class YoutubeSearchDateIE(YoutubeSearchIE):

1893

IE_NAME = YoutubeSearchIE.IE_NAME + ':date'

1894

_SEARCH_KEY = 'ytsearchdate'

1895

IE_DESC = 'YouTube.com searches, newest videos first'

1896

_EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}

1897

1898

1899

class YoutubeSearchURLIE(InfoExtractor):

1900

IE_DESC = 'YouTube.com search URLs'

1901

IE_NAME = 'youtube:search_url'

1902

_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

1903

_TESTS = [{

1904

'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',

1905

'playlist_mincount': 5,

1906

'info_dict': {

1907

'title': 'youtube-dl test video',

}

}]

def _real_extract(self, url):

1912

mobj = re.match(self._VALID_URL, url)

1913

query = compat_urllib_parse_unquote_plus(mobj.group('query'))

1914

1915

webpage = self._download_webpage(url, query)

1916

result_code = self._search_regex(

1917

r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

1918

1919

part_codes = re.findall(

1920

r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code)

1921

entries = []

1922

for part_code in part_codes:

1923

part_title = self._html_search_regex(

1924

[r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)

1925

part_url_snippet = self._html_search_regex(

1926

r'(?s)href="([^"]+)"', part_code, 'item URL')

1927

part_url = compat_urlparse.urljoin(

1928

'https://www.youtube.com/', part_url_snippet)

entries.append({

'_type': 'url',

'url': part_url,

'title': part_title,

})

return {

'_type': 'playlist',

'entries': entries,

'title': query,

}

class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):

1943

IE_DESC = 'YouTube.com (multi-season) shows'

1944

_VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'

1945

IE_NAME = 'youtube:show'

1946

_TESTS = [{

1947

'url': 'https://www.youtube.com/show/airdisasters',

1948

'playlist_mincount': 5,

1949

'info_dict': {

1950

'id': 'airdisasters',

1951

'title': 'Air Disasters',

}

}]

def _real_extract(self, url):

1956

playlist_id = self._match_id(url)

1957

return super(YoutubeShowIE, self)._real_extract(

1958

'https://www.youtube.com/show/%s/playlists' % playlist_id)

1959

1960

1961

class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):

1962

"""

1963

Base class for feed extractors

1964

Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.

1965

"""

1966

_LOGIN_REQUIRED = True

@property

def IE_NAME(self):

return 'youtube:%s' % self._FEED_NAME

1971

1972

def _real_initialize(self):

1973

self._login()

1974

1975

def _real_extract(self, url):

1976

page = self._download_webpage(

1977

'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

1978

1979

# The extraction process is the same as for playlists, but the regex

1980

# for the video ids doesn't contain an index

1981

ids = []

1982

more_widget_html = content_html = page

1983

for page_num in itertools.count(1):

1984

matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

1985

1986

# 'recommended' feed has infinite 'load more' and each new portion spins

1987

# the same videos in (sometimes) slightly different order, so we'll check

1988

# for unicity and break when portion has no new videos

1989

new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))

if not new_ids:

break

ids.extend(new_ids)

mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)

if not mobj:

break

more = self._download_json(

2000

'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,

2001

'Downloading page #%s' % page_num,

2002

transform_source=uppercase_escape)

2003

content_html = more['content_html']

2004

more_widget_html = more['load_more_widget_html']

2005

2006

return self.playlist_result(

2007

self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)

2008

2009

2010

class YoutubeWatchLaterIE(YoutubePlaylistIE):

2011

IE_NAME = 'youtube:watchlater'

2012

IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'

2013

_VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

2014

2015

_TESTS = [] # override PlaylistIE tests

2016

2017

def _real_extract(self, url):

2018

return self._extract_playlist('WL')

2019

2020

2021

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):

2022

IE_NAME = 'youtube:favorites'

2023

IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'

2024

_VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

2025

_LOGIN_REQUIRED = True

2026

2027

def _real_extract(self, url):

2028

webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')

2029

playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')

2030

return self.url_result(playlist_id, 'YoutubePlaylist')

2031

2032

2033

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):

2034

IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'

2035

_VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'

2036

_FEED_NAME = 'recommended'

2037

_PLAYLIST_TITLE = 'Youtube Recommended videos'

2038

2039

2040

class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):

2041

IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'

2042

_VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

2043

_FEED_NAME = 'subscriptions'

2044

_PLAYLIST_TITLE = 'Youtube Subscriptions'

2045

2046

2047

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):

2048

IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'

2049

_VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'

2050

_FEED_NAME = 'history'

2051

_PLAYLIST_TITLE = 'Youtube History'

2052

2053

2054

class YoutubeTruncatedURLIE(InfoExtractor):

2055

IE_NAME = 'youtube:truncated_url'

2056

IE_DESC = False # Do not list

2057

_VALID_URL = r'''(?x)

2058

(?:https?://)?

2059

(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/

2060

(?:watch\?(?:

2061

feature=[a-z_]+|

2062

annotation_id=annotation_[^&]+|

x-yt-cl=[0-9]+|

hl=[^&]*|

t=[0-9]+

)?

|

attribution_link\?a=[^&]+

)

$

'''

_TESTS = [{

'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',

2075

'only_matching': True,

2076

}, {

2077

'url': 'http://www.youtube.com/watch?',

2078

'only_matching': True,

2079

}, {

2080

'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',

2081

'only_matching': True,

2082

}, {

2083

'url': 'https://www.youtube.com/watch?feature=foo',

2084

'only_matching': True,

2085

}, {

2086

'url': 'https://www.youtube.com/watch?hl=en-GB',

2087

'only_matching': True,

2088

}, {

2089

'url': 'https://www.youtube.com/watch?t=2372',

2090

'only_matching': True,

2091

}]

2092

2093

def _real_extract(self, url):

2094

raise ExtractorError(

2095

'Did you forget to quote the URL? Remember that & is a meta '

2096

'character in most shells, so you want to put the URL in quotes, '

2097

'like youtube-dl '

2098

'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '

2099

' or simply youtube-dl BaW_jenozKc .',

expected=True)

class YoutubeTruncatedIDIE(InfoExtractor):

2104

IE_NAME = 'youtube:truncated_id'

2105

IE_DESC = False # Do not list

2106

_VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

2107

2108

_TESTS = [{

2109

'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',

2110

'only_matching': True,

2111

}]

2112

2113

def _real_extract(self, url):

2114

video_id = self._match_id(url)

2115

raise ExtractorError(

2116

'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),

2117

expected=True)