jfr.im git - yt-dlp.git/blame_incremental - youtube

Commit	Line	Data
	1	# coding: utf-8
	2
	3	from __future__ import unicode_literals
	4
	5
	6	import itertools
	7	import json
	8	import os.path
	9	import re
	10	import time
	11	import traceback
	12
	13	from .common import InfoExtractor, SearchInfoExtractor
	14	from ..jsinterp import JSInterpreter
	15	from ..swfinterp import SWFInterpreter
	16	from ..compat import (
	17	compat_chr,
	18	compat_parse_qs,
	19	compat_urllib_parse,
	20	compat_urllib_parse_unquote,
	21	compat_urllib_parse_unquote_plus,
	22	compat_urllib_parse_urlparse,
	23	compat_urlparse,
	24	compat_str,
	25	)
	26	from ..utils import (
	27	clean_html,
	28	encode_dict,
	29	error_to_compat_str,
	30	ExtractorError,
	31	float_or_none,
	32	get_element_by_attribute,
	33	get_element_by_id,
	34	int_or_none,
	35	mimetype2ext,
	36	orderedSet,
	37	parse_duration,
	38	remove_quotes,
	39	remove_start,
	40	sanitized_Request,
	41	smuggle_url,
	42	str_to_int,
	43	unescapeHTML,
	44	unified_strdate,
	45	unsmuggle_url,
	46	uppercase_escape,
	47	ISO3166Utils,
	48	)
	49
	50
	51	class YoutubeBaseInfoExtractor(InfoExtractor):
	52	"""Provide base functions for Youtube extractors"""
	53	_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
	54	_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
	55	_NETRC_MACHINE = 'youtube'
	56	# If True it will raise an error if no login info is provided
	57	_LOGIN_REQUIRED = False
	58
	59	def _set_language(self):
	60	self._set_cookie(
	61	'.youtube.com', 'PREF', 'f1=50000000&hl=en',
	62	# YouTube sets the expire time to about two months
	63	expire_time=time.time() + 2 * 30 * 24 * 3600)
	64
	65	def _ids_to_results(self, ids):
	66	return [
	67	self.url_result(vid_id, 'Youtube', video_id=vid_id)
	68	for vid_id in ids]
	69
	70	def _login(self):
	71	"""
	72	Attempt to log in to YouTube.
	73	True is returned if successful or skipped.
	74	False is returned if login failed.
	75
	76	If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
	77	"""
	78	(username, password) = self._get_login_info()
	79	# No authentication to be performed
	80	if username is None:
	81	if self._LOGIN_REQUIRED:
	82	raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
	83	return True
	84
	85	login_page = self._download_webpage(
	86	self._LOGIN_URL, None,
	87	note='Downloading login page',
	88	errnote='unable to fetch login page', fatal=False)
	89	if login_page is False:
	90	return
	91
	92	galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
	93	login_page, 'Login GALX parameter')
	94
	95	# Log in
	96	login_form_strs = {
	97	'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
	98	'Email': username,
	99	'GALX': galx,
	100	'Passwd': password,
	101
	102	'PersistentCookie': 'yes',
	103	'_utf8': '霱',
	104	'bgresponse': 'js_disabled',
	105	'checkConnection': '',
	106	'checkedDomains': 'youtube',
	107	'dnConn': '',
	108	'pstMsg': '0',
	109	'rmShown': '1',
	110	'secTok': '',
	111	'signIn': 'Sign in',
	112	'timeStmp': '',
	113	'service': 'youtube',
	114	'uilel': '3',
	115	'hl': 'en_US',
	116	}
	117
	118	login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
	119
	120	req = sanitized_Request(self._LOGIN_URL, login_data)
	121	login_results = self._download_webpage(
	122	req, None,
	123	note='Logging in', errnote='unable to log in', fatal=False)
	124	if login_results is False:
	125	return False
	126
	127	if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
	128	raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
	129
	130	# Two-Factor
	131	# TODO add SMS and phone call support - these require making a request and then prompting the user
	132
	133	if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
	134	tfa_code = self._get_tfa_info('2-step verification code')
	135
	136	if not tfa_code:
	137	self._downloader.report_warning(
	138	'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
	139	'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
	140	return False
	141
	142	tfa_code = remove_start(tfa_code, 'G-')
	143
	144	tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
	145
	146	tfa_form_strs.update({
	147	'Pin': tfa_code,
	148	'TrustDevice': 'on',
	149	})
	150
	151	tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
	152
	153	tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
	154	tfa_results = self._download_webpage(
	155	tfa_req, None,
	156	note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
	157
	158	if tfa_results is False:
	159	return False
	160
	161	if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
	162	self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
	163	return False
	164	if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
	165	self._downloader.report_warning('unable to log in - did the page structure change?')
	166	return False
	167	if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
	168	self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
	169	return False
	170
	171	if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
	172	self._downloader.report_warning('unable to log in: bad username or password')
	173	return False
	174	return True
	175
	176	def _real_initialize(self):
	177	if self._downloader is None:
	178	return
	179	self._set_language()
	180	if not self._login():
	181	return
	182
	183
	184	class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
	185	# Extract entries from page with "Load more" button
	186	def _entries(self, page, playlist_id):
	187	more_widget_html = content_html = page
	188	for page_num in itertools.count(1):
	189	for entry in self._process_page(content_html):
	190	yield entry
	191
	192	mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
	193	if not mobj:
	194	break
	195
	196	more = self._download_json(
	197	'https://youtube.com/%s' % mobj.group('more'), playlist_id,
	198	'Downloading page #%s' % page_num,
	199	transform_source=uppercase_escape)
	200	content_html = more['content_html']
	201	if not content_html.strip():
	202	# Some webpages show a "Load more" button but they don't
	203	# have more videos
	204	break
	205	more_widget_html = more['load_more_widget_html']
	206
	207
	208	class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	209	def _process_page(self, content):
	210	for video_id, video_title in self.extract_videos_from_page(content):
	211	yield self.url_result(video_id, 'Youtube', video_id, video_title)
	212
	213	def extract_videos_from_page(self, page):
	214	ids_in_page = []
	215	titles_in_page = []
	216	for mobj in re.finditer(self._VIDEO_RE, page):
	217	# The link with index 0 is not the first video of the playlist (not sure if still actual)
	218	if 'index' in mobj.groupdict() and mobj.group('id') == '0':
	219	continue
	220	video_id = mobj.group('id')
	221	video_title = unescapeHTML(mobj.group('title'))
	222	if video_title:
	223	video_title = video_title.strip()
	224	try:
	225	idx = ids_in_page.index(video_id)
	226	if video_title and not titles_in_page[idx]:
	227	titles_in_page[idx] = video_title
	228	except ValueError:
	229	ids_in_page.append(video_id)
	230	titles_in_page.append(video_title)
	231	return zip(ids_in_page, titles_in_page)
	232
	233
	234	class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	235	def _process_page(self, content):
	236	for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)):
	237	yield self.url_result(
	238	'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
	239
	240	def _real_extract(self, url):
	241	playlist_id = self._match_id(url)
	242	webpage = self._download_webpage(url, playlist_id)
	243	title = self._og_search_title(webpage, fatal=False)
	244	return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
	245
	246
	247	class YoutubeIE(YoutubeBaseInfoExtractor):
	248	IE_DESC = 'YouTube.com'
	249	_VALID_URL = r"""(?x)^
	250	(
	251	(?:https?://\|//) # http(s):// or protocol-independent URL
	252	(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/\|
	253	(?:www\.)?deturl\.com/www\.youtube\.com/\|
	254	(?:www\.)?pwnyoutube\.com/\|
	255	(?:www\.)?yourepeat\.com/\|
	256	tube\.majestyc\.net/\|
	257	youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
	258	(?:.*?\#/)? # handle anchor (#/) redirect urls
	259	(?: # the various things that can precede the ID:
	260	(?:(?:v\|embed\|e)/(?!videoseries)) # v/ or embed/ or e/
	261	\|(?: # or the v= param in all its forms
	262	(?:(?:watch\|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup\|.php) or nothing (like /?v=xxxx)
	263	(?:\?\|\#!?) # the params delimiter ? or # or #!
	264	(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
	265	v=
	266	)
	267	))
	268	\|(?:
	269	youtu\.be\| # just youtu.be/xxxx
	270	vid\.plus # or vid.plus/xxxx
	271	)/
	272	\|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
	273	)
	274	)? # all until now is optional -> you can pass the naked ID
	275	([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
	276	(?!.*?&list=) # combined list/video URLs are handled by the playlist IE
	277	(?(1).+)? # if we found the ID, everything can follow
	278	$"""
	279	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	280	_formats = {
	281	'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	282	'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	283	'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
	284	'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
	285	'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
	286	'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	287	'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	288	'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	289	# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
	290	'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
	291	'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	292	'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	293	'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	294	'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	295	'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	296	'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	297	'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	298	'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	299
	300
	301	# 3D videos
	302	'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	303	'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	304	'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	305	'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	306	'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
	307	'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	308	'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	309
	310	# Apple HTTP Live Streaming
	311	'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	312	'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	313	'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	314	'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	315	'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	316	'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	317	'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
	318
	319	# DASH mp4 video
	320	'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	321	'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	322	'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	323	'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	324	'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	325	'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
	326	'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	327	'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	328	'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
	329	'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
	330	'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	331
	332	# Dash mp4 audio
	333	'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
	334	'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
	335	'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
	336
	337	# Dash webm
	338	'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	339	'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	340	'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	341	'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	342	'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	343	'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	344	'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
	345	'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	346	'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	347	'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	348	'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	349	'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	350	'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	351	'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	352	'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	353	# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
	354	'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	355	'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	356	'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	357	'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	358	'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	359	'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	360
	361	# Dash webm audio
	362	'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
	363	'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
	364
	365	# Dash webm audio with opus inside
	366	'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
	367	'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
	368	'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
	369
	370	# RTMP (unnamed)
	371	'_rtmp': {'protocol': 'rtmp'},
	372	}
	373	_SUBTITLE_FORMATS = ('ttml', 'vtt')
	374
	375	IE_NAME = 'youtube'
	376	_TESTS = [
	377	{
	378	'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
	379	'info_dict': {
	380	'id': 'BaW_jenozKc',
	381	'ext': 'mp4',
	382	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	383	'uploader': 'Philipp Hagemeister',
	384	'uploader_id': 'phihag',
	385	'upload_date': '20121002',
	386	'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
	387	'categories': ['Science & Technology'],
	388	'tags': ['youtube-dl'],
	389	'like_count': int,
	390	'dislike_count': int,
	391	'start_time': 1,
	392	'end_time': 9,
	393	}
	394	},
	395	{
	396	'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
	397	'note': 'Test generic use_cipher_signature video (#897)',
	398	'info_dict': {
	399	'id': 'UxxajLWwzqY',
	400	'ext': 'mp4',
	401	'upload_date': '20120506',
	402	'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
	403	'alt_title': 'I Love It (feat. Charli XCX)',
	404	'description': 'md5:782e8651347686cba06e58f71ab51773',
	405	'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
	406	'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
	407	'iconic ep', 'iconic', 'love', 'it'],
	408	'uploader': 'Icona Pop',
	409	'uploader_id': 'IconaPop',
	410	'creator': 'Icona Pop',
	411	}
	412	},
	413	{
	414	'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
	415	'note': 'Test VEVO video with age protection (#956)',
	416	'info_dict': {
	417	'id': '07FYdnEawAQ',
	418	'ext': 'mp4',
	419	'upload_date': '20130703',
	420	'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
	421	'alt_title': 'Tunnel Vision',
	422	'description': 'md5:64249768eec3bc4276236606ea996373',
	423	'uploader': 'justintimberlakeVEVO',
	424	'uploader_id': 'justintimberlakeVEVO',
	425	'creator': 'Justin Timberlake',
	426	'age_limit': 18,
	427	}
	428	},
	429	{
	430	'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
	431	'note': 'Embed-only video (#1746)',
	432	'info_dict': {
	433	'id': 'yZIXLfi8CZQ',
	434	'ext': 'mp4',
	435	'upload_date': '20120608',
	436	'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
	437	'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
	438	'uploader': 'SET India',
	439	'uploader_id': 'setindia',
	440	'age_limit': 18,
	441	}
	442	},
	443	{
	444	'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
	445	'note': 'Use the first video ID in the URL',
	446	'info_dict': {
	447	'id': 'BaW_jenozKc',
	448	'ext': 'mp4',
	449	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	450	'uploader': 'Philipp Hagemeister',
	451	'uploader_id': 'phihag',
	452	'upload_date': '20121002',
	453	'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
	454	'categories': ['Science & Technology'],
	455	'tags': ['youtube-dl'],
	456	'like_count': int,
	457	'dislike_count': int,
	458	},
	459	'params': {
	460	'skip_download': True,
	461	},
	462	},
	463	{
	464	'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
	465	'note': '256k DASH audio (format 141) via DASH manifest',
	466	'info_dict': {
	467	'id': 'a9LDPn-MO4I',
	468	'ext': 'm4a',
	469	'upload_date': '20121002',
	470	'uploader_id': '8KVIDEO',
	471	'description': '',
	472	'uploader': '8KVIDEO',
	473	'title': 'UHDTV TEST 8K VIDEO.mp4'
	474	},
	475	'params': {
	476	'youtube_include_dash_manifest': True,
	477	'format': '141',
	478	},
	479	},
	480	# DASH manifest with encrypted signature
	481	{
	482	'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
	483	'info_dict': {
	484	'id': 'IB3lcPjvWLA',
	485	'ext': 'm4a',
	486	'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
	487	'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
	488	'uploader': 'AfrojackVEVO',
	489	'uploader_id': 'AfrojackVEVO',
	490	'upload_date': '20131011',
	491	},
	492	'params': {
	493	'youtube_include_dash_manifest': True,
	494	'format': '141',
	495	},
	496	},
	497	# JS player signature function name containing $
	498	{
	499	'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
	500	'info_dict': {

1

# coding: utf-8

2

3

from __future__ import unicode_literals

import itertools

import json

import os.path

import re

import time

import traceback

from .common import InfoExtractor, SearchInfoExtractor

14

from ..jsinterp import JSInterpreter

15

from ..swfinterp import SWFInterpreter

16

from ..compat import (

compat_chr,

compat_parse_qs,

compat_urllib_parse,

compat_urllib_parse_unquote,

21

compat_urllib_parse_unquote_plus,

22

compat_urllib_parse_urlparse,

compat_urlparse,

compat_str,

)

from ..utils import (

clean_html,

encode_dict,

error_to_compat_str,

ExtractorError,

float_or_none,

get_element_by_attribute,

get_element_by_id,

int_or_none,

mimetype2ext,

orderedSet,

parse_duration,

remove_quotes,

remove_start,

sanitized_Request,

smuggle_url,

str_to_int,

unescapeHTML,

unified_strdate,

unsmuggle_url,

uppercase_escape,

ISO3166Utils,

)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the ``PREF`` cookie (value contains ``hl=en``) for ``.youtube.com``."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list holding one 'Youtube' ``url_result`` per id in *ids*."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.

        Steps performed:
          1. fetch the login page and read the hidden "GALX" input from it;
          2. POST the credentials form to _LOGIN_URL;
          3. if the response contains the "challenge" form, POST the
             two-factor code (plus that form's hidden inputs) to _TWOFACTOR_URL;
          4. treat a response that still contains the "gaia_loginform" form
             as a failed login.

        Raises ExtractorError when login is required but no credentials are
        available, or when the login response contains the
        ``errormsg_0_Passwd`` element.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) so that the documented
            # "False is returned if login failed" contract holds.
            return False

        # NOTE(review): no fatal=False is passed here, so a login page without
        # the GALX input presumably raises from _search_regex - confirm.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')

        req = sanitized_Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                # The trailing space keeps "<code>" and "(Note" from running
                # together once the two literals are concatenated.
                self._downloader.report_warning(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Drop a leading "G-" from the code, if present
            tfa_code = remove_start(tfa_code, 'G-')

            # Start from the challenge form's hidden inputs, then add the code
            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)

            tfa_form_strs.update({
                'Pin': tfa_code,
                'TrustDevice': 'on',
            })

            tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')

            tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # The challenge form being shown again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again is treated as rejected credentials
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; do nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is not fatal here: nothing follows either way.
        if not self._login():
            return

class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Base for extractors whose listing pages continue behind a "Load more" button.

    This class calls ``self._process_page(content_html)`` but does not define
    it; it is expected to be supplied elsewhere (e.g. by subclasses).
    """

    def _entries(self, page, playlist_id):
        """Yield the entries of *page* and of every follow-up "Load more" page.

        *page* is the HTML of the first page. *playlist_id* is passed to
        ``_download_json`` as the id when fetching follow-up pages.
        Iteration ends when no load-more link is found or when a follow-up
        page has blank content.
        """
        current_html = page
        widget_html = page
        page_num = 0
        while True:
            page_num += 1
            for item in self._process_page(current_html):
                yield item

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                return

            next_url = 'https://youtube.com/%s' % load_more.group('more')
            progress_note = 'Downloading page #%s' % page_num
            more = self._download_json(
                next_url, playlist_id, progress_note,
                transform_source=uppercase_escape)

            current_html = more['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']

206

207

208

class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list base that turns ``_VIDEO_RE`` matches in a page into video results.

    ``_VIDEO_RE`` is not defined here; it is read from ``self``.
    """

    def _process_page(self, content):
        """Yield a 'Youtube' ``url_result`` for each (id, title) pair found in *content*."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return an iterable of unique ``(video_id, title)`` pairs found in *page*.

        Ids keep the order of their first occurrence. When an id occurs more
        than once, the first non-empty title seen for it is kept.
        """
        ids_in_page = []
        titles_in_page = []
        # video_id -> position in the two lists above; avoids a linear
        # ``list.index`` scan for every regex match.
        position_of = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group, not the 'index'
            # group, with '0' - confirm whether that is intended.
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            idx = position_of.get(video_id)
            if idx is None:
                # First time this id is seen
                position_of[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                # Fill in a title that was missing on the earlier occurrence
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)

232

233

234

class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list base for pages that link to playlists rather than to videos."""

    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' ``url_result`` for each playlist id linked in *content*."""
        found_ids = re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)
        # orderedSet is applied so that each playlist id is yielded once
        for playlist_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download *url* and return a playlist result built from its entries."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        # A missing og:title is tolerated (fatal=False)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)

245

246

247

class YoutubeIE(YoutubeBaseInfoExtractor):

248

IE_DESC = 'YouTube.com'

249

_VALID_URL = r"""(?x)^

250

(

251

(?:https?://|//) # http(s):// or protocol-independent URL

252

(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|

253

(?:www\.)?deturl\.com/www\.youtube\.com/|

254

(?:www\.)?pwnyoutube\.com/|

255

(?:www\.)?yourepeat\.com/|

256

tube\.majestyc\.net/|

257

youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains

258

(?:.*?\#/)? # handle anchor (#/) redirect urls

259

(?: # the various things that can precede the ID:

260

(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/

261

|(?: # or the v= param in all its forms

262

(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)

263

(?:\?|\#!?) # the params delimiter ? or # or #!

264

(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)

v=

)

))

|(?:

youtu\.be| # just youtu.be/xxxx

270

vid\.plus # or vid.plus/xxxx

271

)/

272

|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=

273

)

274

)? # all until now is optional -> you can pass the naked ID

275

([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID

276

(?!.*?&list=) # combined list/video URLs are handled by the playlist IE

277

(?(1).+)? # if we found the ID, everything can follow

278

$"""

279

_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'

280

# Static per-format metadata keyed by format id string (the existing comments
# below refer to these ids as itags).  Only known properties are listed;
# entries with varying dimensions deliberately omit them.
_formats = {
    '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
    '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
    '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
    '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
    '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
    '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
    '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
    '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
    '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
    '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

    # 3D videos
    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

    # Apple HTTP Live Streaming
    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
    '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
    '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
    '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
    '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
    '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
    '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
    '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
    '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
    '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
    '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

    # Dash mp4 audio
    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},

    # Dash webm
    '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
    '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
    '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
    '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
    '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
    '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
    '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

    # Dash webm audio
    '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
    '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

    # Dash webm audio with opus inside
    '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
    '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},
}
# Subtitle formats for which a download URL is built per language.
_SUBTITLE_FORMATS = ('ttml', 'vtt')

IE_NAME = 'youtube'

# Test cases for this extractor.  Each entry gives a 'url' and, unless it is
# marked 'only_matching', the 'info_dict' fields expected from extraction;
# 'params', 'note', 'skip', 'md5', 'playlist' and 'expected_warnings' appear
# on some entries.
_TESTS = [
    {
        'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
        'info_dict': {
            'id': 'BaW_jenozKc',
            'ext': 'mp4',
            'title': 'youtube-dl test video "\'/\\ä↭𝕐',
            'uploader': 'Philipp Hagemeister',
            'uploader_id': 'phihag',
            'upload_date': '20121002',
            'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
            'categories': ['Science & Technology'],
            'tags': ['youtube-dl'],
            'like_count': int,
            'dislike_count': int,
            'start_time': 1,
            'end_time': 9,
        }
    },
    {
        'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
        'note': 'Test generic use_cipher_signature video (#897)',
        'info_dict': {
            'id': 'UxxajLWwzqY',
            'ext': 'mp4',
            'upload_date': '20120506',
            'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
            'alt_title': 'I Love It (feat. Charli XCX)',
            'description': 'md5:782e8651347686cba06e58f71ab51773',
            'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
                     'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
                     'iconic ep', 'iconic', 'love', 'it'],
            'uploader': 'Icona Pop',
            'uploader_id': 'IconaPop',
            'creator': 'Icona Pop',
        }
    },
    {
        'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
        'note': 'Test VEVO video with age protection (#956)',
        'info_dict': {
            'id': '07FYdnEawAQ',
            'ext': 'mp4',
            'upload_date': '20130703',
            'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
            'alt_title': 'Tunnel Vision',
            'description': 'md5:64249768eec3bc4276236606ea996373',
            'uploader': 'justintimberlakeVEVO',
            'uploader_id': 'justintimberlakeVEVO',
            'creator': 'Justin Timberlake',
            'age_limit': 18,
        }
    },
    {
        'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
        'note': 'Embed-only video (#1746)',
        'info_dict': {
            'id': 'yZIXLfi8CZQ',
            'ext': 'mp4',
            'upload_date': '20120608',
            'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
            'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
            'uploader': 'SET India',
            'uploader_id': 'setindia',
            'age_limit': 18,
        }
    },
    {
        'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
        'note': 'Use the first video ID in the URL',
        'info_dict': {
            'id': 'BaW_jenozKc',
            'ext': 'mp4',
            'title': 'youtube-dl test video "\'/\\ä↭𝕐',
            'uploader': 'Philipp Hagemeister',
            'uploader_id': 'phihag',
            'upload_date': '20121002',
            'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
            'categories': ['Science & Technology'],
            'tags': ['youtube-dl'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    },
    {
        'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
        'note': '256k DASH audio (format 141) via DASH manifest',
        'info_dict': {
            'id': 'a9LDPn-MO4I',
            'ext': 'm4a',
            'upload_date': '20121002',
            'uploader_id': '8KVIDEO',
            'description': '',
            'uploader': '8KVIDEO',
            'title': 'UHDTV TEST 8K VIDEO.mp4'
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '141',
        },
    },
    # DASH manifest with encrypted signature
    {
        'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
        'info_dict': {
            'id': 'IB3lcPjvWLA',
            'ext': 'm4a',
            'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
            'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
            'uploader': 'AfrojackVEVO',
            'uploader_id': 'AfrojackVEVO',
            'upload_date': '20131011',
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '141',
        },
    },
    # JS player signature function name containing $
    {
        'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
        'info_dict': {
            'id': 'nfWlot6h_JM',
            'ext': 'm4a',
            'title': 'Taylor Swift - Shake It Off',
            'alt_title': 'Shake It Off',
            'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
            'uploader': 'TaylorSwiftVEVO',
            'uploader_id': 'TaylorSwiftVEVO',
            'upload_date': '20140818',
            'creator': 'Taylor Swift',
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '141',
        },
    },
    # Controversy video
    {
        'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
        'info_dict': {
            'id': 'T4XJQO3qol8',
            'ext': 'mp4',
            'upload_date': '20100909',
            'uploader': 'The Amazing Atheist',
            'uploader_id': 'TheAmazingAtheist',
            'title': 'Burning Everyone\'s Koran',
            'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
        }
    },
    # Normal age-gate video (No vevo, embed allowed)
    {
        'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
        'info_dict': {
            'id': 'HtVdAasjOgU',
            'ext': 'mp4',
            'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
            'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
            'uploader': 'The Witcher',
            'uploader_id': 'WitcherGame',
            'upload_date': '20140605',
            'age_limit': 18,
        },
    },
    # Age-gate video with encrypted signature
    {
        'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
        'info_dict': {
            'id': '6kLq3WMV1nU',
            'ext': 'mp4',
            'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
            'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
            'uploader': 'LloydVEVO',
            'uploader_id': 'LloydVEVO',
            'upload_date': '20110629',
            'age_limit': 18,
        },
    },
    # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
    {
        'url': '__2ABJjxzNo',
        'info_dict': {
            'id': '__2ABJjxzNo',
            'ext': 'mp4',
            'upload_date': '20100430',
            'uploader_id': 'deadmau5',
            'creator': 'deadmau5',
            'description': 'md5:12c56784b8032162bb936a5f76d55360',
            'uploader': 'deadmau5',
            'title': 'Deadmau5 - Some Chords (HD)',
            'alt_title': 'Some Chords',
        },
        'expected_warnings': [
            'DASH manifest missing',
        ]
    },
    # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
    {
        'url': 'lqQg6PlCWgI',
        'info_dict': {
            'id': 'lqQg6PlCWgI',
            'ext': 'mp4',
            'upload_date': '20150827',
            'uploader_id': 'olympic',
            'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
            'uploader': 'Olympics',
            'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
        },
        'params': {
            'skip_download': 'requires avconv',
        }
    },
    # Non-square pixels
    {
        'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
        'info_dict': {
            'id': '_b-2C3KPAM0',
            'ext': 'mp4',
            'stretched_ratio': 16 / 9.,
            'upload_date': '20110310',
            'uploader_id': 'AllenMeow',
            'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
            'uploader': '孫艾倫',
            'title': '[A-made] 變態妍字幕版太妍我就是這樣的人',
        },
    },
    # url_encoded_fmt_stream_map is empty string
    {
        'url': 'qEJwOuvDf7I',
        'info_dict': {
            'id': 'qEJwOuvDf7I',
            'ext': 'webm',
            'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
            'description': '',
            'upload_date': '20150404',
            'uploader_id': 'spbelect',
            'uploader': 'Наблюдатели Петербурга',
        },
        'params': {
            'skip_download': 'requires avconv',
        },
        'skip': 'This live event has ended.',
    },
    # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
    {
        'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
        'info_dict': {
            'id': 'FIl7x6_3R5Y',
            'ext': 'mp4',
            'title': 'md5:7b81415841e02ecd4313668cde88737a',
            'description': 'md5:116377fd2963b81ec4ce64b542173306',
            'upload_date': '20150625',
            'uploader_id': 'dorappi2000',
            'uploader': 'dorappi2000',
            'formats': 'mincount:33',
        },
    },
    # DASH manifest with segment_list
    {
        'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
        'md5': '8ce563a1d667b599d21064e982ab9e31',
        'info_dict': {
            'id': 'CsmdDsKjzN8',
            'ext': 'mp4',
            'upload_date': '20150501',  # According to '<meta itemprop="datePublished"', but in other places it's 20150510
            'uploader': 'Airtek',
            'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
            'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
            'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '135',  # bestvideo
        }
    },
    {
        # Multifeed videos (multiple cameras), URL is for Main Camera
        'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
        'info_dict': {
            'id': 'jqWvoWXjCVs',
            'title': 'teamPGP: Rocket League Noob Stream',
            'description': 'md5:dc7872fb300e143831327f1bae3af010',
        },
        'playlist': [{
            'info_dict': {
                'id': 'jqWvoWXjCVs',
                'ext': 'mp4',
                'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
                'description': 'md5:dc7872fb300e143831327f1bae3af010',
                'upload_date': '20150721',
                'uploader': 'Beer Games Beer',
                'uploader_id': 'beergamesbeer',
            },
        }, {
            'info_dict': {
                'id': '6h8e8xoXJzg',
                'ext': 'mp4',
                'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
                'description': 'md5:dc7872fb300e143831327f1bae3af010',
                'upload_date': '20150721',
                'uploader': 'Beer Games Beer',
                'uploader_id': 'beergamesbeer',
            },
        }, {
            'info_dict': {
                'id': 'PUOgX5z9xZw',
                'ext': 'mp4',
                'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
                'description': 'md5:dc7872fb300e143831327f1bae3af010',
                'upload_date': '20150721',
                'uploader': 'Beer Games Beer',
                'uploader_id': 'beergamesbeer',
            },
        }, {
            'info_dict': {
                'id': 'teuwxikvS5k',
                'ext': 'mp4',
                'title': 'teamPGP: Rocket League Noob Stream (zim)',
                'description': 'md5:dc7872fb300e143831327f1bae3af010',
                'upload_date': '20150721',
                'uploader': 'Beer Games Beer',
                'uploader_id': 'beergamesbeer',
            },
        }],
        'params': {
            'skip_download': True,
        },
    },
    {
        'url': 'http://vid.plus/FlRa-iH7PGw',
        'only_matching': True,
    },
    {
        # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
        # Also tests cut-off URL expansion in video description (see
        # https://github.com/rg3/youtube-dl/issues/1892,
        # https://github.com/rg3/youtube-dl/issues/8164)
        'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
        'info_dict': {
            'id': 'lsguqyKfVQg',
            'ext': 'mp4',
            'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
            'alt_title': 'Dark Walk',
            'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
            'upload_date': '20151119',
            'uploader_id': 'IronSoulElf',
            'uploader': 'IronSoulElf',
            'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
        },
        'params': {
            'skip_download': True,
        },
    },
    {
        # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
        'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
        'only_matching': True,
    },
    {
        # Video with yt:stretch=17:0
        'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
        'info_dict': {
            'id': 'Q39EVAstoRM',
            'ext': 'mp4',
            'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
            'description': 'md5:ee18a25c350637c8faff806845bddee9',
            'upload_date': '20151107',
            'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
            'uploader': 'CH GAMER DROID',
        },
        'params': {
            'skip_download': True,
        },
    },
    {
        'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',
        'only_matching': True,
    }
]

def __init__(self, *args, **kwargs):
    """Forward all arguments to the base extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # In-memory cache of signature functions, filled lazily on first use
    self._player_cache = dict()

762

763

def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)

766

767

def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)

770

771

def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available for the video."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)

774

775

def report_rtmp_download(self):
    """Print a notice that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)

778

779

def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of a signature."""
    part_lengths = (len(chunk) for chunk in example_sig.split('.'))
    return '.'.join(compat_str(length) for length in part_lengths)

782

783

def _extract_signature_function(self, video_id, player_url, example_sig):

784

id_m = re.match(

785

r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',

786

player_url)

787

if not id_m:

788

raise ExtractorError('Cannot identify player %r' % player_url)

789

player_type = id_m.group('ext')

790

player_id = id_m.group('id')

791

792

# Read from filesystem cache

793

func_id = '%s_%s_%s' % (

794

player_type, player_id, self._signature_cache_id(example_sig))

795

assert os.path.basename(func_id) == func_id

796

797

cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)

798

if cache_spec is not None:

799

return lambda s: ''.join(s[i] for i in cache_spec)

800

801

download_note = (

802

'Downloading player %s' % player_url

803

if self._downloader.params.get('verbose') else

804

'Downloading %s player %s' % (player_type, player_id)

805

)

806

if player_type == 'js':

807

code = self._download_webpage(

808

player_url, video_id,

809

note=download_note,

810

errnote='Download of %s failed' % player_url)

811

res = self._parse_sig_js(code)

812

elif player_type == 'swf':

813

urlh = self._request_webpage(

814

player_url, video_id,

815

note=download_note,

816

errnote='Download of %s failed' % player_url)

817

code = urlh.read()

818

res = self._parse_sig_swf(code)

819

else:

820

assert False, 'Invalid player type %r' % player_type

821

822

test_string = ''.join(map(compat_chr, range(len(example_sig))))

823

cache_res = res(test_string)

824

cache_spec = [ord(c) for c in cache_res]

825

826

self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)

827

return res

828

829

def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to *func* for signatures shaped like *example_sig*.

    *func* is probed with a string of distinct characters; the resulting
    index permutation is rendered as a sum of slices and single-index
    lookups on ``s``.
    """

    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            first = '' if start == 0 else str(start)
            stop = end + step
            last = (':%d' % stop) if stop >= 0 else ':'
            stride = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (first, last, stride)

        run_step = None
        # Placeholder to quell pyflakes warnings; the real value is
        # assigned whenever run_step is set
        run_start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            delta = i - prev
            if run_step is not None:
                if delta == run_step:
                    # Still inside the current run
                    continue
                # The run ended at prev; emit it as one slice
                yield _genslice(run_start, prev, run_step)
                run_step = None
                continue
            if delta in [-1, 1]:
                # prev begins a new ascending or descending run
                run_step = delta
                run_start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Flush whatever is pending after the last pair
        if run_step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(run_start, i, run_step)

    probe = ''.join(compat_chr(n) for n in range(len(example_sig)))
    permuted = func(probe)
    cache_spec = [ord(ch) for ch in permuted]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    part_lengths = ', '.join(compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)

867

868

def _parse_sig_js(self, jscode):
    """Locate the signature function in JS player code and wrap it as a one-argument callable."""
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(funcname)

    def decipher(s):
        # The interpreted function takes its arguments as a list
        return js_function([s])

    return decipher

876

877

def _parse_sig_swf(self, file_contents):
    """Extract SignatureDecipher.decipher from SWF player data and wrap it as a one-argument callable."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The interpreted function takes its arguments as a list
        return swf_function([s])

    return decipher

883

884

def _decrypt_signature(self, s, video_id, player_url, age_gate=False):

885

"""Turn the encrypted s field into a working signature"""

886

887

if player_url is None:

888

raise ExtractorError('Cannot decrypt signature without player_url')

889

890

if player_url.startswith('//'):

891

player_url = 'https:' + player_url

892

try:

893

player_id = (player_url, self._signature_cache_id(s))

894

if player_id not in self._player_cache:

895

func = self._extract_signature_function(

896

video_id, player_url, s

897

)

898

self._player_cache[player_id] = func

899

func = self._player_cache[player_id]

900

if self._downloader.params.get('youtube_print_sig_code'):

901

self._print_sig_code(func, s)

902

return func(s)

903

except Exception as e:

904

tb = traceback.format_exc()

905

raise ExtractorError(

906

'Signature extraction failed: ' + tb, cause=e)

907

908

def _get_subtitles(self, video_id, webpage):

909

try:

910

subs_doc = self._download_xml(

911

'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,

912

video_id, note=False)

913

except ExtractorError as err:

914

self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))

return {}

sub_lang_list = {}

for track in subs_doc.findall('track'):

919

lang = track.attrib['lang_code']

920

if lang in sub_lang_list:

921

continue

922

sub_formats = []

923

for ext in self._SUBTITLE_FORMATS:

924

params = compat_urllib_parse.urlencode({

'lang': lang,

'v': video_id,

'fmt': ext,

'name': track.attrib['name'].encode('utf-8'),

929

})

930

sub_formats.append({

931

'url': 'https://www.youtube.com/api/timedtext?' + params,

932

'ext': ext,

933

})

934

sub_lang_list[lang] = sub_formats

935

if not sub_lang_list:

936

self._downloader.report_warning('video doesn\'t have subtitles')

return {}

return sub_lang_list

def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ``ytplayer.config`` object found in *webpage*.

    Returns None when no config assignment is found; parsing is
    requested as non-fatal.
    """
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. For now this is worked around by
    # trying the more specific regex first; proper quoted-string handling
    # is meant to replace this workaround in the future (see
    # https://github.com/rg3/youtube-dl/issues/7468,
    # https://github.com/rg3/youtube-dl/pull/7599)
    patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)

957

958

def _get_automatic_captions(self, video_id, webpage):

959

"""We need the webpage for getting the captions url, pass it as an

960

argument to speed up the process."""

961

self.to_screen('%s: Looking for automatic captions' % video_id)

962

player_config = self._get_ytplayer_config(video_id, webpage)

963

err_msg = 'Couldn\'t find automatic captions for %s' % video_id

964

if not player_config:

965

self._downloader.report_warning(err_msg)

966

return {}

967

try:

968

args = player_config['args']

969

caption_url = args['ttsurl']

970

if not caption_url:

971

self._downloader.report_warning(err_msg)

972

return {}

973

timestamp = args['timestamp']

974

# We get the available subtitles

975

list_params = compat_urllib_parse.urlencode({

'type': 'list',

'tlangs': 1,

'asrs': 1,

})

list_url = caption_url + '&' + list_params

981

caption_list = self._download_xml(list_url, video_id)

982

original_lang_node = caption_list.find('track')

983

if original_lang_node is None:

984

self._downloader.report_warning('Video doesn\'t have automatic captions')

985

return {}

986

original_lang = original_lang_node.attrib['lang_code']

987

caption_kind = original_lang_node.attrib.get('kind', '')

988

989

sub_lang_list = {}

990

for lang_node in caption_list.findall('target'):

991

sub_lang = lang_node.attrib['lang_code']

992

sub_formats = []

993

for ext in self._SUBTITLE_FORMATS:

994

params = compat_urllib_parse.urlencode({

995

'lang': original_lang,

'tlang': sub_lang,

'fmt': ext,

'ts': timestamp,

'kind': caption_kind,

1000

})

1001

sub_formats.append({

1002

'url': caption_url + '&' + params,

1003

'ext': ext,

1004

})

1005

sub_lang_list[sub_lang] = sub_formats

1006

return sub_lang_list

1007

# An extractor error can be raise by the download process if there are

1008

# no automatic captions but there are subtitles

1009

except (KeyError, ExtractorError):

1010

self._downloader.report_warning(err_msg)

return {}

@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) from *url*.

    Raises ExtractorError if *url* does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)

1020

1021

def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its stream URL.

    Every non-empty manifest line that does not start with ``#`` is
    treated as a stream URL; its itag is read from the ``itag/<n>/``
    path component.  Returns a dict ``{itag: url}``.
    """
    manifest = self._download_webpage(
        manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and m3u8 tags/comments
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map

1035

1036

def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for ``video_id``."""
    return self._download_webpage(
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id,
        video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')

1039

1040

def _real_extract(self, url):
    """Extract the info dict (or a multifeed playlist) for one video URL.

    Steps, in order:
      * read smuggled data and the ``t``/``start``/``end`` time markers
        from the URL fragment and query string;
      * download the watch page; for age-gated videos fall back to the
        embed page plus ``get_video_info``, otherwise read the player
        config from the page and probe several ``get_video_info``
        variants;
      * extract metadata (title, description, uploader, thumbnail,
        upload date, categories, tags, counts, subtitles, duration);
      * build formats from the rtmp ``conn`` entry, the URL-encoded
        stream maps (decrypting signatures when needed) or the HLS
        manifest, then merge in formats from the DASH manifests.

    Returns the info dict, or a playlist result for multifeed videos
    unless ``noplaylist`` is set or the URL was smuggled with
    ``force_singlefeed``.

    Raises ExtractorError when no usable video info or format
    information can be found.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    # The fragment is inspected before the query, so it wins on conflict
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Collect each distinct DASH manifest URL once, in discovery order
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
        if ytplayer_config:
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                if get_video_info.get('use_cipher_signature') != ['True']:
                    add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    # Different get_video_info requests may report different results, e.g.
                    # some may report video unavailability, but some may serve it without
                    # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
                    # the original webpage as well as el=info and el=embedded get_video_info
                    # requests report video unavailability due to geo restriction while
                    # el=detailpage succeeds and returns valid data). This is probably
                    # due to YouTube measures against IP ranges of hosting providers.
                    # Working around by preferring the first succeeded video_info containing
                    # the token if no such video_info yet was found.
                    if 'token' not in video_info:
                        video_info = get_video_info
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened link texts ("http://exam...") by the full target
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                (?:title|href)="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
            [^<]+\.{3}\s*
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
            for feed in multifeed_metadata_list.split(','):
                # Unquote should take place before split on comma (,) since textual
                # fields may contain comma as well (see
                # https://github.com/rg3/youtube-dl/issues/8536)
                feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise the date text that was just matched.  This must use
            # upload_date itself: `mobj` still refers to the uploader-id
            # search above (and is None when that search failed).
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_music = re.search(
        r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:$.+?$)?</li',
        video_webpage)
    if m_music:
        video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
        video_creator = clean_html(m_music.group('creator'))
    else:
        video_alt_title = video_creator = None

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn an {itag: url} map into format dicts enriched from self._formats
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        formats = []
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'

            dct = {
                'format_id': format_id,
                'url': url,
                'player_url': player_url,
            }
            if format_id in self._formats:
                dct.update(self._formats[format_id])

            # Some itags are not included in DASH manifest thus corresponding formats will
            # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
            # Trying to extract metadata from url_encoded_fmt_stream_map entry.
            mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
            width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

            more_fields = {
                'filesize': int_or_none(url_data.get('clen', [None])[0]),
                'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                'width': width,
                'height': height,
                'fps': int_or_none(url_data.get('fps', [None])[0]),
                'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
            }
            # Only truthy values override what self._formats supplied
            for key, value in more_fields.items():
                if value:
                    dct[key] = value
            type_ = url_data.get('type', [None])[0]
            if type_:
                type_split = type_.split(';')
                kind_ext = type_split[0].split('/')
                if len(kind_ext) == 2:
                    kind, _ = kind_ext
                    dct['ext'] = mimetype2ext(type_split[0])
                    if kind in ('audio', 'video'):
                        codecs = None
                        for mobj in re.finditer(
                                r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
                            if mobj.group('key') == 'codecs':
                                codecs = mobj.group('val')
                                break
                        if codecs:
                            codecs = codecs.split(',')
                            if len(codecs) == 2:
                                acodec, vcodec = codecs[1], codecs[0]
                            else:
                                acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
                            dct.update({
                                'acodec': acodec,
                                'vcodec': vcodec,
                            })
            formats.append(dct)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
        # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
        for a_format in formats:
            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
    else:
        unavailable_message = self._html_search_regex(
            r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
            video_webpage, 'unavailable message', default=None)
        if unavailable_message:
            raise ExtractorError(unavailable_message, expected=True)
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for mpd_url in dash_mpds:
            dash_formats = {}
            try:
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s

                mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

                for df in self._extract_mpd_formats(
                        mpd_url, video_id, fatal=dash_mpd_fatal,
                        formats_dict=self._formats):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                    # Additional DASH manifests may end up in HTTP Error 403 therefore
                    # allow them to fail without bug report message if we already have
                    # some DASH manifest succeeded. This is temporary workaround to reduce
                    # burst of bug reports until we figure out the reason and whether it
                    # can be fixed at all.
                    dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        w = float(stretched_m.group('w'))
        h = float(stretched_m.group('h'))
        # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
        # We will only process correct ratios.
        if w > 0 and h > 0:
            ratio = w / h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'creator': video_creator,
        'title': video_title,
        'alt_title': video_alt_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
    }

class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists and mixes.

    Accepts full playlist/watch/embed URLs carrying a ``p``, ``a`` or
    ``list`` parameter, ``/p/<id>`` URLs, and bare prefixed playlist ids.
    Ids starting with ``RD`` or ``UL`` are extracted as mixes.
    """
    IE_DESC = 'YouTube.com playlists'
    # Group 1 is the id taken from a URL, group 2 a bare playlist id.
    # The page-name alternation before "\?" is required: without it the
    # pattern cannot match ".../playlist?list=", ".../watch?v=..&list=" or
    # ".../embed/videoseries?list=" URLs such as the ones in _TESTS.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?[&;])*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Key was misspelled 'playlist_mincout', which left the count unchecked
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before extraction (private playlists need credentials)."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist and return it as a playlist result.

        The watch page of the seed video (the last 11 characters of the
        playlist id) is downloaded and scanned for the videos of the mix.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title containers from most to least specific
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist page and return a playlist result.

        Raises ExtractorError (expected) when an alert on the page says
        the playlist does not exist, is private, or that the parameters
        are invalid.  Other alerts are reported as warnings.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch a playlist URL to single-video, mix or playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)

1717

1718

1719

class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for the videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs handled by YoutubePlaylistsIE, else defer to the base check."""
        if YoutubePlaylistsIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return the channel's videos, preferring its uploads playlist.

        When a ``UC…`` channel id can be read from the channel page, the
        result is a URL result pointing at the matching ``UU…`` playlist.
        Otherwise the videos are taken from the channel's video listing.
        """
        channel_id = self._match_id(url)
        videos_url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        probe_page = self._download_webpage(
            videos_url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if probe_page is False:
            uploads_channel_id = False
        else:
            uploads_channel_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not uploads_channel_id:
                uploads_channel_id = self._search_regex(
                    r'data-(?:channel-external-|yt)id="([^"]+)"',
                    probe_page, 'channel id', default=None)
        if uploads_channel_id and uploads_channel_id.startswith('UC'):
            uploads_playlist = 'UU' + uploads_channel_id[2:]
            playlist_url = compat_urlparse.urljoin(
                videos_url, '/playlist?list=%s' % uploads_playlist)
            return self.url_result(playlist_url, 'YoutubePlaylist')

        first_page = self._download_webpage(videos_url, channel_id, 'Downloading page #1')
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', first_page)

        if autogenerated_marker is None:
            return self.playlist_result(
                self._entries(first_page, channel_id), channel_id)

        # The videos are contained in a single page
        # the ajax pages can't be used, they are empty
        single_page_entries = []
        for video_id, video_title in self.extract_videos_from_page(first_page):
            single_page_entries.append(self.url_result(
                video_id, 'Youtube', video_id=video_id,
                video_title=video_title))
        return self.playlist_result(single_page_entries, channel_id)

1791

1792

1793

class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for YouTube.com user videos (URL or "ytuser" keyword)."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other '*IE' class of this module claims url.

        The regex of this extractor is too permissive and would match URLs
        that other YouTube extractors can handle, so those take precedence.
        """
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

1819

1820

1821

class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the '/playlists' page of a YouTube user or channel."""
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [
        {
            'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
            'playlist_mincount': 4,
            'info_dict': {
                'id': 'ThirstForScience',
                'title': 'Thirst for Science',
            },
        },
        {
            # with "Load more" button
            'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
            'playlist_mincount': 70,
            'info_dict': {
                'id': 'igorkle1',
                'title': 'Игорь Клейнер',
            },
        },
        {
            'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
            'playlist_mincount': 17,
            'info_dict': {
                'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
                'title': 'Chem Player',
            },
        },
    ]

class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for YouTube.com searches addressed with the 'ytsearch' key."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query-string arguments merged into every results request;
    # subclasses override this to alter the search.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one by one until a page yields no video
        ids or at least ``n`` results have been collected; the list is then
        cut down to ``n`` entries and returned as a playlist titled ``query``.

        Raises ExtractorError (expected) when a downloaded page contains a
        'search-message' element.
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough results are in hand (>=, not >), so that
            # no extra page is downloaded when exactly n were collected.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)

1894

1895

1896

class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that passes a sort argument to the search."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {
        'search_sort': 'video_date_uploaded',
    }

1901

1902

1903

class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube.com search result URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _entry_from_heading(self, heading_html):
        """Build a 'url' result dict from the inner HTML of one result heading."""
        title = self._html_search_regex(
            [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'],
            heading_html, 'item title', fatal=False)
        href = self._html_search_regex(
            r'(?s)href="([^"]+)"', heading_html, 'item URL')
        return {
            '_type': 'url',
            'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
            'title': title,
        }

    def _real_extract(self, url):
        """Download the results page and return its items as a playlist.

        The playlist title is the unquoted 'search_query' value of the URL.
        """
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        headings = re.findall(
            r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code)

        return {
            '_type': 'playlist',
            'entries': [self._entry_from_heading(heading) for heading in headings],
            'title': query,
        }

class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for YouTube.com (multi-season) shows."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Run the base class extraction on the show's '/playlists' URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)

1963

1964

1965

class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name, built from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed and return them as a playlist.

        The feed page is downloaded, then its "load more" links are followed
        until a portion brings no new video id or no further link is found.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This must be a real list: on Python 3 a filter() object is always
            # truthy, so the emptiness check below would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)

2012

2013

2014

class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the watch later list, handled as the playlist 'WL'."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    # override PlaylistIE tests
    _TESTS = []

    def _real_extract(self, url):
        """Extract the playlist 'WL', whatever form the matched URL has."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)

2023

2024

2025

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the favourite videos of the logged-in account."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Find the favourites playlist id and defer to 'YoutubePlaylist'."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')

2035

2036

2037

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'

2042

2043

2044

class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

2049

2050

2051

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    # The pattern itself is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'

2056

2057

2058

class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs lacking a video id and reports an error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?t=2372',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError hinting at shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)

class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose 'v' value has 1 to 10 characters and reports an error."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)