jfr.im git - yt-dlp.git/blame_incremental - youtube

Commit	Line	Data
	1	# coding: utf-8
	2
	3	from __future__ import unicode_literals
	4
	5
	6	import itertools
	7	import json
	8	import os.path
	9	import random
	10	import re
	11	import time
	12	import traceback
	13
	14	from .common import InfoExtractor, SearchInfoExtractor
	15	from ..jsinterp import JSInterpreter
	16	from ..swfinterp import SWFInterpreter
	17	from ..compat import (
	18	compat_chr,
	19	compat_kwargs,
	20	compat_parse_qs,
	21	compat_urllib_parse_unquote,
	22	compat_urllib_parse_unquote_plus,
	23	compat_urllib_parse_urlencode,
	24	compat_urllib_parse_urlparse,
	25	compat_urlparse,
	26	compat_str,
	27	)
	28	from ..utils import (
	29	clean_html,
	30	error_to_compat_str,
	31	ExtractorError,
	32	float_or_none,
	33	get_element_by_attribute,
	34	get_element_by_id,
	35	int_or_none,
	36	mimetype2ext,
	37	orderedSet,
	38	parse_codecs,
	39	parse_duration,
	40	qualities,
	41	remove_quotes,
	42	remove_start,
	43	smuggle_url,
	44	str_or_none,
	45	str_to_int,
	46	try_get,
	47	unescapeHTML,
	48	unified_strdate,
	49	unsmuggle_url,
	50	uppercase_escape,
	51	url_or_none,
	52	urlencode_postdata,
	53	)
	54
	55
	56	class YoutubeBaseInfoExtractor(InfoExtractor):
	57	"""Provide base functions for Youtube extractors"""
	58	_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
	59	_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
	60
	61	_LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
	62	_CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
	63	_TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
	64
	65	_NETRC_MACHINE = 'youtube'
	66	# If True it will raise an error if no login info is provided
	67	_LOGIN_REQUIRED = False
	68
	69	_PLAYLIST_ID_RE = r'(?:PL\|LL\|EC\|UU\|FL\|RD\|UL\|TL\|OLAK5uy_)[0-9A-Za-z-_]{10,}'
	70
	71	def _set_language(self):
	72	self._set_cookie(
	73	'.youtube.com', 'PREF', 'f1=50000000&hl=en',
	74	# YouTube sets the expire time to about two months
	75	expire_time=time.time() + 2 * 30 * 24 * 3600)
	76
	77	def _ids_to_results(self, ids):
	78	return [
	79	self.url_result(vid_id, 'Youtube', video_id=vid_id)
	80	for vid_id in ids]
	81
	82	def _login(self):
	83	"""
	84	Attempt to log in to YouTube.
	85	True is returned if successful or skipped.
	86	False is returned if login failed.
	87
	88	If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
	89	"""
	90	username, password = self._get_login_info()
	91	# No authentication to be performed
	92	if username is None:
	93	if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
	94	raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
	95	return True
	96
	97	login_page = self._download_webpage(
	98	self._LOGIN_URL, None,
	99	note='Downloading login page',
	100	errnote='unable to fetch login page', fatal=False)
	101	if login_page is False:
	102	return
	103
	104	login_form = self._hidden_inputs(login_page)
	105
	106	def req(url, f_req, note, errnote):
	107	data = login_form.copy()
	108	data.update({
	109	'pstMsg': 1,
	110	'checkConnection': 'youtube',
	111	'checkedDomains': 'youtube',
	112	'hl': 'en',
	113	'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
	114	'f.req': json.dumps(f_req),
	115	'flowName': 'GlifWebSignIn',
	116	'flowEntry': 'ServiceLogin',
	117	})
	118	return self._download_json(
	119	url, None, note=note, errnote=errnote,
	120	transform_source=lambda s: re.sub(r'^[^[]*', '', s),
	121	fatal=False,
	122	data=urlencode_postdata(data), headers={
	123	'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
	124	'Google-Accounts-XSRF': 1,
	125	})
	126
	127	def warn(message):
	128	self._downloader.report_warning(message)
	129
	130	lookup_req = [
	131	username,
	132	None, [], None, 'US', None, None, 2, False, True,
	133	[
	134	None, None,
	135	[2, 1, None, 1,
	136	'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
	137	None, [], 4],
	138	1, [None, None, []], None, None, None, True
	139	],
	140	username,
	141	]
	142
	143	lookup_results = req(
	144	self._LOOKUP_URL, lookup_req,
	145	'Looking up account info', 'Unable to look up account info')
	146
	147	if lookup_results is False:
	148	return False
	149
	150	user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
	151	if not user_hash:
	152	warn('Unable to extract user hash')
	153	return False
	154
	155	challenge_req = [
	156	user_hash,
	157	None, 1, None, [1, None, None, None, [password, None, True]],
	158	[
	159	None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
	160	1, [None, None, []], None, None, None, True
	161	]]
	162
	163	challenge_results = req(
	164	self._CHALLENGE_URL, challenge_req,
	165	'Logging in', 'Unable to log in')
	166
	167	if challenge_results is False:
	168	return
	169
	170	login_res = try_get(challenge_results, lambda x: x[0][5], list)
	171	if login_res:
	172	login_msg = try_get(login_res, lambda x: x[5], compat_str)
	173	warn(
	174	'Unable to login: %s' % 'Invalid password'
	175	if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
	176	return False
	177
	178	res = try_get(challenge_results, lambda x: x[0][-1], list)
	179	if not res:
	180	warn('Unable to extract result entry')
	181	return False
	182
	183	login_challenge = try_get(res, lambda x: x[0][0], list)
	184	if login_challenge:
	185	challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
	186	if challenge_str == 'TWO_STEP_VERIFICATION':
	187	# SEND_SUCCESS - TFA code has been successfully sent to phone
	188	# QUOTA_EXCEEDED - reached the limit of TFA codes
	189	status = try_get(login_challenge, lambda x: x[5], compat_str)
	190	if status == 'QUOTA_EXCEEDED':
	191	warn('Exceeded the limit of TFA codes, try later')
	192	return False
	193
	194	tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
	195	if not tl:
	196	warn('Unable to extract TL')
	197	return False
	198
	199	tfa_code = self._get_tfa_info('2-step verification code')
	200
	201	if not tfa_code:
	202	warn(
	203	'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
	204	'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
	205	return False
	206
	207	tfa_code = remove_start(tfa_code, 'G-')
	208
	209	tfa_req = [
	210	user_hash, None, 2, None,
	211	[
	212	9, None, None, None, None, None, None, None,
	213	[None, tfa_code, True, 2]
	214	]]
	215
	216	tfa_results = req(
	217	self._TFA_URL.format(tl), tfa_req,
	218	'Submitting TFA code', 'Unable to submit TFA code')
	219
	220	if tfa_results is False:
	221	return False
	222
	223	tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
	224	if tfa_res:
	225	tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
	226	warn(
	227	'Unable to finish TFA: %s' % 'Invalid TFA code'
	228	if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
	229	return False
	230
	231	check_cookie_url = try_get(
	232	tfa_results, lambda x: x[0][-1][2], compat_str)
	233	else:
	234	CHALLENGES = {
	235	'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
	236	'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
	237	'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
	238	}
	239	challenge = CHALLENGES.get(
	240	challenge_str,
	241	'%s returned error %s.' % (self.IE_NAME, challenge_str))
	242	warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
	243	return False
	244	else:
	245	check_cookie_url = try_get(res, lambda x: x[2], compat_str)
	246
	247	if not check_cookie_url:
	248	warn('Unable to extract CheckCookie URL')
	249	return False
	250
	251	check_cookie_results = self._download_webpage(
	252	check_cookie_url, None, 'Checking cookie', fatal=False)
	253
	254	if check_cookie_results is False:
	255	return False
	256
	257	if 'https://myaccount.google.com/' not in check_cookie_results:
	258	warn('Unable to log in')
	259	return False
	260
	261	return True
	262
	263	def _download_webpage_handle(self, args, *kwargs):
	264	query = kwargs.get('query', {}).copy()
	265	query['disable_polymer'] = 'true'
	266	kwargs['query'] = query
	267	return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
	268	args, *compat_kwargs(kwargs))
	269
	270	def _real_initialize(self):
	271	if self._downloader is None:
	272	return
	273	self._set_language()
	274	if not self._login():
	275	return
	276
	277
	278	class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
	279	# Extract entries from page with "Load more" button
	280	def _entries(self, page, playlist_id):
	281	more_widget_html = content_html = page
	282	for page_num in itertools.count(1):
	283	for entry in self._process_page(content_html):
	284	yield entry
	285
	286	mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
	287	if not mobj:
	288	break
	289
	290	more = self._download_json(
	291	'https://youtube.com/%s' % mobj.group('more'), playlist_id,
	292	'Downloading page #%s' % page_num,
	293	transform_source=uppercase_escape)
	294	content_html = more['content_html']
	295	if not content_html.strip():
	296	# Some webpages show a "Load more" button but they don't
	297	# have more videos
	298	break
	299	more_widget_html = more['load_more_widget_html']
	300
	301
	302	class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	303	def _process_page(self, content):
	304	for video_id, video_title in self.extract_videos_from_page(content):
	305	yield self.url_result(video_id, 'Youtube', video_id, video_title)
	306
	307	def extract_videos_from_page(self, page):
	308	ids_in_page = []
	309	titles_in_page = []
	310	for mobj in re.finditer(self._VIDEO_RE, page):
	311	# The link with index 0 is not the first video of the playlist (not sure if still actual)
	312	if 'index' in mobj.groupdict() and mobj.group('id') == '0':
	313	continue
	314	video_id = mobj.group('id')
	315	video_title = unescapeHTML(mobj.group('title'))
	316	if video_title:
	317	video_title = video_title.strip()
	318	try:
	319	idx = ids_in_page.index(video_id)
	320	if video_title and not titles_in_page[idx]:
	321	titles_in_page[idx] = video_title
	322	except ValueError:
	323	ids_in_page.append(video_id)
	324	titles_in_page.append(video_title)
	325	return zip(ids_in_page, titles_in_page)
	326
	327
	328	class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	329	def _process_page(self, content):
	330	for playlist_id in orderedSet(re.findall(
	331	r'<h3[^>]+class="[^"]yt-lockup-title[^"]"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
	332	content)):
	333	yield self.url_result(
	334	'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
	335
	336	def _real_extract(self, url):
	337	playlist_id = self._match_id(url)
	338	webpage = self._download_webpage(url, playlist_id)
	339	title = self._og_search_title(webpage, fatal=False)
	340	return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
	341
	342
	343	class YoutubeIE(YoutubeBaseInfoExtractor):
	344	IE_DESC = 'YouTube.com'
	345	_VALID_URL = r"""(?x)^
	346	(
	347	(?:https?://\|//) # http(s):// or protocol-independent URL
	348	(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/\|
	349	(?:www\.)?deturl\.com/www\.youtube\.com/\|
	350	(?:www\.)?pwnyoutube\.com/\|
	351	(?:www\.)?hooktube\.com/\|
	352	(?:www\.)?yourepeat\.com/\|
	353	tube\.majestyc\.net/\|
	354	(?:www\.)?invidio\.us/\|
	355	(?:www\.)?invidious\.snopyta\.org/\|
	356	(?:www\.)?invidious\.kabi\.tk/\|
	357	(?:www\.)?vid\.wxzm\.sx/\|
	358	youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
	359	(?:.*?\#/)? # handle anchor (#/) redirect urls
	360	(?: # the various things that can precede the ID:
	361	(?:(?:v\|embed\|e)/(?!videoseries)) # v/ or embed/ or e/
	362	\|(?: # or the v= param in all its forms
	363	(?:(?:watch\|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup\|.php) or nothing (like /?v=xxxx)
	364	(?:\?\|\#!?) # the params delimiter ? or # or #!
	365	(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
	366	v=
	367	)
	368	))
	369	\|(?:
	370	youtu\.be\| # just youtu.be/xxxx
	371	vid\.plus\| # or vid.plus/xxxx
	372	zwearz\.com/watch\| # or zwearz.com/watch/xxxx
	373	)/
	374	\|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
	375	)
	376	)? # all until now is optional -> you can pass the naked ID
	377	([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
	378	(?!.*?\blist=
	379	(?:
	380	%(playlist_id)s\| # combined list/video URLs are handled by the playlist IE
	381	WL # WL are handled by the watch later IE
	382	)
	383	)
	384	(?(1).+)? # if we found the ID, everything can follow
	385	$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
	386	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	387	_formats = {
	388	'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	389	'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	390	'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
	391	'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
	392	'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
	393	'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	394	'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	395	'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	396	# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
	397	'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
	398	'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	399	'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	400	'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	401	'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	402	'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	403	'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	404	'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	405	'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	406
	407
	408	# 3D videos
	409	'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	410	'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	411	'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	412	'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	413	'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
	414	'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	415	'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	416
	417	# Apple HTTP Live Streaming
	418	'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	419	'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	420	'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	421	'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	422	'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	423	'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	424	'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	425	'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
	426
	427	# DASH mp4 video
	428	'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
	429	'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
	430	'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
	431	'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
	432	'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
	433	'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
	434	'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
	435	'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
	436	'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
	437	'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
	438	'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
	439	'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
	440
	441	# Dash mp4 audio
	442	'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
	443	'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
	444	'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
	445	'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
	446	'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
	447	'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
	448	'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
	449
	450	# Dash webm
	451	'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	452	'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	453	'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	454	'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	455	'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	456	'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	457	'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
	458	'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	459	'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	460	'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	461	'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	462	'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	463	'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	464	'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	465	'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	466	# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
	467	'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	468	'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	469	'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	470	'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	471	'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	472	'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	473
	474	# Dash webm audio
	475	'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
	476	'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
	477
	478	# Dash webm audio with opus inside
	479	'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
	480	'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
	481	'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
	482
	483	# RTMP (unnamed)
	484	'_rtmp': {'protocol': 'rtmp'},
	485	}
	486	_SUBTITLE_FORMATS = ('ttml', 'vtt')
	487
	488	_GEO_BYPASS = False
	489
	490	IE_NAME = 'youtube'
	491	_TESTS = [
	492	{
	493	'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
	494	'info_dict': {
	495	'id': 'BaW_jenozKc',
	496	'ext': 'mp4',
	497	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	498	'uploader': 'Philipp Hagemeister',
	499	'uploader_id': 'phihag',
	500	'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

1

# coding: utf-8

2

3

from __future__ import unicode_literals

import itertools

import json

import os.path

import random

import re

import time

import traceback

from .common import InfoExtractor, SearchInfoExtractor

15

from ..jsinterp import JSInterpreter

16

from ..swfinterp import SWFInterpreter

17

from ..compat import (

compat_chr,

compat_kwargs,

compat_parse_qs,

compat_urllib_parse_unquote,

22

compat_urllib_parse_unquote_plus,

23

compat_urllib_parse_urlencode,

24

compat_urllib_parse_urlparse,

compat_urlparse,

compat_str,

)

from ..utils import (

clean_html,

error_to_compat_str,

ExtractorError,

float_or_none,

get_element_by_attribute,

get_element_by_id,

int_or_none,

mimetype2ext,

orderedSet,

parse_codecs,

parse_duration,

qualities,

remove_quotes,

remove_start,

smuggle_url,

str_or_none,

str_to_int,

try_get,

unescapeHTML,

unified_strdate,

unsmuggle_url,

uppercase_escape,

url_or_none,

urlencode_postdata,

)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # '{0}' is filled with the TL token taken from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Known playlist ID prefixes followed by at least 10 ID characters
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are served with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one 'Youtube' url_result per video ID in `ids`, in order."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to match the documented contract
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the hidden login form fields plus `f_req` (JSON-encoded) to `url`."""
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything before the first '[' so the payload parses as JSON
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # This sub-structure is sent verbatim in both the lookup and the
        # challenge request, so it is built once and shared (it is only
        # serialized, never mutated).
        flow_context = [
            None, None,
            [2, 1, None, 1,
             'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
             None, [], 4],
            1, [None, None, []], None, None, None, True
        ]

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            flow_context,
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            flow_context,
        ]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional is parenthesized: '%' binds tighter than
            # 'if/else', so without the parentheses the prefix was lost
            # for every message other than INCORRECT_ANSWER_ENTERED.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Codes may be entered with a leading 'G-'; only the rest is submitted
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesized for the same precedence reason as above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true added to the query."""
        # Copy so that a caller-supplied query dict is not mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, if a downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        # The login result is intentionally ignored: a failed login does not
        # abort initialization.
        self._login()

class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of `page`, then of each page reached via its "Load more" link.

        Subclasses supply `_process_page`, which turns an HTML fragment into
        entries. Iteration ends when no load-more link is found or when the
        downloaded continuation carries only whitespace content.
        """
        load_more_re = r'data-uix-load-more-href="/?(?P<more>[^"]+)"'
        current_html = widget_html = page
        page_num = 0
        while True:
            page_num += 1
            for entry in self._process_page(current_html):
                yield entry

            match = re.search(load_more_re, widget_html)
            if match is None:
                return

            continuation_url = 'https://youtube.com/%s' % match.group('more')
            more = self._download_json(
                continuation_url, playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            current_html = more['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']

300

301

302

class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose pages link to individual videos.

    Reads a ``_VIDEO_RE`` pattern attribute that is not defined in this
    class. The code below uses its named groups ``id`` and ``title`` and
    checks for an optional ``index`` group.
    """

    def _process_page(self, content):
        """Yield a ``Youtube`` url_result for every video found in *content*."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return ``(video_id, title)`` pairs for the videos matched in *page*.

        Each id appears once, in order of first appearance. When an id is
        matched several times, the first non-empty title seen for it is kept.
        The result is whatever ``zip`` returns for the two collected lists.
        """
        ids_in_page = []
        titles_in_page = []
        # Position of every id already collected, so that a repeated id is
        # found with a dict lookup instead of a scan of ids_in_page.
        position_of = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist
            # (not sure if this still applies).
            # NOTE(review): this compares the 'id' group, not 'index', with
            # '0' -- confirm which group was intended before changing it.
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            idx = position_of.get(video_id)
            if idx is None:
                position_of[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                # Fill in a title that was missing on the earlier match.
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)

326

327

328

class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose pages link to playlists."""

    def _process_page(self, content):
        """Yield a ``YoutubePlaylist`` url_result per playlist id in *content*."""
        found_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download *url* and return a playlist result built from its entries.

        The playlist title is taken from ``_og_search_title`` with
        ``fatal=False``.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        playlist_title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, playlist_title)

341

342

343

class YoutubeIE(YoutubeBaseInfoExtractor):

344

IE_DESC = 'YouTube.com'

345

_VALID_URL = r"""(?x)^

346

(

347

(?:https?://|//) # http(s):// or protocol-independent URL

348

(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|

349

(?:www\.)?deturl\.com/www\.youtube\.com/|

350

(?:www\.)?pwnyoutube\.com/|

351

(?:www\.)?hooktube\.com/|

352

(?:www\.)?yourepeat\.com/|

353

tube\.majestyc\.net/|

354

(?:www\.)?invidio\.us/|

355

(?:www\.)?invidious\.snopyta\.org/|

356

(?:www\.)?invidious\.kabi\.tk/|

357

(?:www\.)?vid\.wxzm\.sx/|

358

youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains

359

(?:.*?\#/)? # handle anchor (#/) redirect urls

360

(?: # the various things that can precede the ID:

361

(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/

362

|(?: # or the v= param in all its forms

363

(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)

364

(?:\?|\#!?) # the params delimiter ? or # or #!

365

(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)

v=

)

))

|(?:

youtu\.be| # just youtu.be/xxxx

371

vid\.plus| # or vid.plus/xxxx

372

zwearz\.com/watch| # or zwearz.com/watch/xxxx

373

)/

374

|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=

375

)

376

)? # all until now is optional -> you can pass the naked ID

377

([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID

378

(?!.*?\blist=

379

(?:

380

%(playlist_id)s| # combined list/video URLs are handled by the playlist IE

381

WL # WL are handled by the watch later IE

382

)

383

)

384

(?(1).+)? # if we found the ID, everything can follow

385

$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}

386

_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'

387

_formats = {

388

'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

389

'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

390

'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},

391

'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},

392

'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},

393

'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

394

'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

395

'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

396

# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well

397

'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},

398

'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

399

'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

400

'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

401

'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

402

'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

403

'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

404

'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

405

'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

# 3D videos

'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

410

'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

411

'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

412

'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

413

'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},

414

'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

415

'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

416

417

# Apple HTTP Live Streaming

418

'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

419

'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

420

'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

421

'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

422

'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

423

'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

424

'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

425

'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

426

427

# DASH mp4 video

428

'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},

429

'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},

430

'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},

431

'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},

432

'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},

433

'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)

434

'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},

435

'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},

436

'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},

437

'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},

438

'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},

439

'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

440

441

# Dash mp4 audio

442

'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},

443

'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},

444

'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},

445

'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},

446

'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},

447

'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},

448

'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

449

450

# Dash webm

451

'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

452

'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

453

'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

454

'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

455

'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

456

'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

457

'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},

458

'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},

459

'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},

460

'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

461

'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

462

'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

463

'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},

464

'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},

465

'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},

466

# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)

467

'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},

468

'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

469

'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

470

'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

471

'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},

472

'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

473

474

# Dash webm audio

475

'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},

476

'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

477

478

# Dash webm audio with opus inside

479

'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},

480

'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},

481

'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

482

483

# RTMP (unnamed)

484

'_rtmp': {'protocol': 'rtmp'},

485

}

486

_SUBTITLE_FORMATS = ('ttml', 'vtt')

_GEO_BYPASS = False

IE_NAME = 'youtube'

_TESTS = [

{

'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

498

'uploader': 'Philipp Hagemeister',

499

'uploader_id': 'phihag',

500

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

501

'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',

502

'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',

503

'upload_date': '20121002',

504

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

505

'categories': ['Science & Technology'],

506

'tags': ['youtube-dl'],

'duration': 10,

'view_count': int,

'like_count': int,

'dislike_count': int,

'start_time': 1,

'end_time': 9,

}

},

{

'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',

517

'note': 'Test generic use_cipher_signature video (#897)',

'info_dict': {

'id': 'UxxajLWwzqY',

'ext': 'mp4',

'upload_date': '20120506',

522

'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',

523

'alt_title': 'I Love It (feat. Charli XCX)',

524

'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',

525

'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',

526

'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',

527

'iconic ep', 'iconic', 'love', 'it'],

528

'duration': 180,

529

'uploader': 'Icona Pop',

530

'uploader_id': 'IconaPop',

531

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',

532

'creator': 'Icona Pop',

533

'track': 'I Love It (feat. Charli XCX)',

534

'artist': 'Icona Pop',

}

},

{

'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',

539

'note': 'Test VEVO video with age protection (#956)',

'info_dict': {

'id': '07FYdnEawAQ',

'ext': 'mp4',

'upload_date': '20130703',

544

'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',

545

'alt_title': 'Tunnel Vision',

546

'description': 'md5:07dab3356cde4199048e4c7cd93471e1',

547

'duration': 419,

548

'uploader': 'justintimberlakeVEVO',

549

'uploader_id': 'justintimberlakeVEVO',

550

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',

551

'creator': 'Justin Timberlake',

552

'track': 'Tunnel Vision',

553

'artist': 'Justin Timberlake',

'age_limit': 18,

}

},

{

'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',

559

'note': 'Embed-only video (#1746)',

'info_dict': {

'id': 'yZIXLfi8CZQ',

'ext': 'mp4',

'upload_date': '20120608',

564

'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',

565

'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',

566

'uploader': 'SET India',

567

'uploader_id': 'setindia',

568

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',

'age_limit': 18,

}

},

{

'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',

574

'note': 'Use the first video ID in the URL',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

579

'uploader': 'Philipp Hagemeister',

580

'uploader_id': 'phihag',

581

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

582

'upload_date': '20121002',

583

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

584

'categories': ['Science & Technology'],

585

'tags': ['youtube-dl'],

'duration': 10,

'view_count': int,

'like_count': int,

'dislike_count': int,

590

},

591

'params': {

592

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',

597

'note': '256k DASH audio (format 141) via DASH manifest',

'info_dict': {

'id': 'a9LDPn-MO4I',

'ext': 'm4a',

'upload_date': '20121002',

602

'uploader_id': '8KVIDEO',

603

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',

604

'description': '',

605

'uploader': '8KVIDEO',

606

'title': 'UHDTV TEST 8K VIDEO.mp4'

607

},

608

'params': {

609

'youtube_include_dash_manifest': True,

610

'format': '141',

611

},

612

'skip': 'format 141 not served anymore',

613

},

614

# DASH manifest with encrypted signature

615

{

616

'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',

'info_dict': {

'id': 'IB3lcPjvWLA',

'ext': 'm4a',

'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',

621

'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',

622

'duration': 244,

623

'uploader': 'AfrojackVEVO',

624

'uploader_id': 'AfrojackVEVO',

625

'upload_date': '20131011',

626

},

627

'params': {

628

'youtube_include_dash_manifest': True,

629

'format': '141/bestaudio[ext=m4a]',

630

},

631

},

632

# JS player signature function name containing $

633

{

634

'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',

'info_dict': {

'id': 'nfWlot6h_JM',

'ext': 'm4a',

'title': 'Taylor Swift - Shake It Off',

639

'description': 'md5:bec2185232c05479482cb5a9b82719bf',

640

'duration': 242,

641

'uploader': 'TaylorSwiftVEVO',

642

'uploader_id': 'TaylorSwiftVEVO',

643

'upload_date': '20140818',

644

'creator': 'Taylor Swift',

645

},

646

'params': {

647

'youtube_include_dash_manifest': True,

648

'format': '141/bestaudio[ext=m4a]',

},

},

# Controversy video

{

'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',

'info_dict': {

'id': 'T4XJQO3qol8',

'ext': 'mp4',

'duration': 219,

'upload_date': '20100909',

659

'uploader': 'Amazing Atheist',

660

'uploader_id': 'TheAmazingAtheist',

661

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',

662

'title': 'Burning Everyone\'s Koran',

663

'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',

664

}

665

},

666

# Normal age-gate video (No vevo, embed allowed)

667

{

668

'url': 'https://youtube.com/watch?v=HtVdAasjOgU',

'info_dict': {

'id': 'HtVdAasjOgU',

'ext': 'mp4',

'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',

673

'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',

674

'duration': 142,

675

'uploader': 'The Witcher',

676

'uploader_id': 'WitcherGame',

677

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',

678

'upload_date': '20140605',

'age_limit': 18,

},

},

# Age-gate video with encrypted signature

683

{

684

'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',

'info_dict': {

'id': '6kLq3WMV1nU',

'ext': 'mp4',

'title': 'Dedication To My Ex (Miss That) (Lyric Video)',

689

'description': 'md5:33765bb339e1b47e7e72b5490139bb41',

690

'duration': 246,

691

'uploader': 'LloydVEVO',

692

'uploader_id': 'LloydVEVO',

693

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',

694

'upload_date': '20110629',

'age_limit': 18,

},

},

# video_info is None (https://github.com/rg3/youtube-dl/issues/4421)

699

# YouTube Red ad is not captured for creator

700

{

701

'url': '__2ABJjxzNo',

'info_dict': {

'id': '__2ABJjxzNo',

'ext': 'mp4',

'duration': 266,

'upload_date': '20100430',

707

'uploader_id': 'deadmau5',

708

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',

709

'creator': 'deadmau5',

710

'description': 'md5:12c56784b8032162bb936a5f76d55360',

711

'uploader': 'deadmau5',

712

'title': 'Deadmau5 - Some Chords (HD)',

713

'alt_title': 'Some Chords',

714

},

715

'expected_warnings': [

716

'DASH manifest missing',

717

]

718

},

719

# Olympics (https://github.com/rg3/youtube-dl/issues/4431)

720

{

721

'url': 'lqQg6PlCWgI',

'info_dict': {

'id': 'lqQg6PlCWgI',

'ext': 'mp4',

'duration': 6085,

'upload_date': '20150827',

727

'uploader_id': 'olympic',

728

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',

729

'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',

730

'uploader': 'Olympic',

731

'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',

732

},

733

'params': {

734

'skip_download': 'requires avconv',

}

},

# Non-square pixels

{

'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',

'info_dict': {

'id': '_b-2C3KPAM0',

'ext': 'mp4',

'stretched_ratio': 16 / 9.,

744

'duration': 85,

745

'upload_date': '20110310',

746

'uploader_id': 'AllenMeow',

747

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',

748

'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',

749

'uploader': '孫ᄋᄅ',

750

'title': '[A-made] 變態妍字幕版太妍我就是這樣的人',

751

},

752

},

753

# url_encoded_fmt_stream_map is empty string

754

{

755

'url': 'qEJwOuvDf7I',

'info_dict': {

'id': 'qEJwOuvDf7I',

'ext': 'webm',

'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',

760

'description': '',

761

'upload_date': '20150404',

762

'uploader_id': 'spbelect',

763

'uploader': 'Наблюдатели Петербурга',

764

},

765

'params': {

766

'skip_download': 'requires avconv',

767

},

768

'skip': 'This live event has ended.',

769

},

770

# Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)

771

{

772

'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',

'info_dict': {

'id': 'FIl7x6_3R5Y',

'ext': 'webm',

'title': 'md5:7b81415841e02ecd4313668cde88737a',

777

'description': 'md5:116377fd2963b81ec4ce64b542173306',

778

'duration': 220,

779

'upload_date': '20150625',

780

'uploader_id': 'dorappi2000',

781

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',

782

'uploader': 'dorappi2000',

783

'formats': 'mincount:31',

784

},

785

'skip': 'not actual anymore',

786

},

787

# DASH manifest with segment_list

788

{

789

'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',

790

'md5': '8ce563a1d667b599d21064e982ab9e31',

'info_dict': {

'id': 'CsmdDsKjzN8',

'ext': 'mp4',

'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510

795

'uploader': 'Airtek',

796

'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',

797

'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',

798

'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',

799

},

800

'params': {

801

'youtube_include_dash_manifest': True,

802

'format': '135', # bestvideo

803

},

804

'skip': 'This live event has ended.',

805

},

806

{

807

# Multifeed videos (multiple cameras), URL is for Main Camera

808

'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',

809

'info_dict': {

810

'id': 'jqWvoWXjCVs',

811

'title': 'teamPGP: Rocket League Noob Stream',

812

'description': 'md5:dc7872fb300e143831327f1bae3af010',

},

'playlist': [{

'info_dict': {

'id': 'jqWvoWXjCVs',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',

819

'description': 'md5:dc7872fb300e143831327f1bae3af010',

820

'duration': 7335,

821

'upload_date': '20150721',

822

'uploader': 'Beer Games Beer',

823

'uploader_id': 'beergamesbeer',

824

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

825

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': '6h8e8xoXJzg',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',

832

'description': 'md5:dc7872fb300e143831327f1bae3af010',

833

'duration': 7337,

834

'upload_date': '20150721',

835

'uploader': 'Beer Games Beer',

836

'uploader_id': 'beergamesbeer',

837

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

838

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'PUOgX5z9xZw',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (grizzle)',

845

'description': 'md5:dc7872fb300e143831327f1bae3af010',

846

'duration': 7337,

847

'upload_date': '20150721',

848

'uploader': 'Beer Games Beer',

849

'uploader_id': 'beergamesbeer',

850

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

851

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'teuwxikvS5k',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (zim)',

858

'description': 'md5:dc7872fb300e143831327f1bae3af010',

859

'duration': 7334,

860

'upload_date': '20150721',

861

'uploader': 'Beer Games Beer',

862

'uploader_id': 'beergamesbeer',

863

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

864

'license': 'Standard YouTube License',

},

}],

'params': {

'skip_download': True,

869

},

870

'skip': 'This video is not available.',

871

},

872

{

873

# Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)

874

'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',

875

'info_dict': {

876

'id': 'gVfLd0zydlo',

877

'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',

878

},

879

'playlist_count': 2,

880

'skip': 'Not multifeed anymore',

881

},

882

{

883

'url': 'https://vid.plus/FlRa-iH7PGw',

884

'only_matching': True,

885

},

886

{

887

'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',

888

'only_matching': True,

889

},

890

{

891

# Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)

892

# Also tests cut-off URL expansion in video description (see

893

# https://github.com/rg3/youtube-dl/issues/1892,

894

# https://github.com/rg3/youtube-dl/issues/8164)

895

'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',

'info_dict': {

'id': 'lsguqyKfVQg',

'ext': 'mp4',

'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',

900

'alt_title': 'Dark Walk - Position Music',

901

'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',

902

'duration': 133,

903

'upload_date': '20151119',

904

'uploader_id': 'IronSoulElf',

905

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',

906

'uploader': 'IronSoulElf',

907

'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',

908

'track': 'Dark Walk - Position Music',

909

'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',

910

},

911

'params': {

912

'skip_download': True,

},

},

{

# Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)

917

'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',

918

'only_matching': True,

919

},

920

{

921

# Video with yt:stretch=17:0

922

'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',

'info_dict': {

'id': 'Q39EVAstoRM',

'ext': 'mp4',

'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',

927

'description': 'md5:ee18a25c350637c8faff806845bddee9',

928

'upload_date': '20151107',

929

'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',

930

'uploader': 'CH GAMER DROID',

931

},

932

'params': {

933

'skip_download': True,

934

},

935

'skip': 'This video does not exist.',

936

},

937

{

938

# Video licensed under Creative Commons

939

'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',

'info_dict': {

'id': 'M4gD1WSo5mA',

'ext': 'mp4',

'title': 'md5:e41008789470fc2533a3252216f1c1d1',

944

'description': 'md5:a677553cf0840649b731a3024aeff4cc',

945

'duration': 721,

946

'upload_date': '20150127',

947

'uploader_id': 'BerkmanCenter',

948

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',

949

'uploader': 'The Berkman Klein Center for Internet & Society',

950

'license': 'Creative Commons Attribution license (reuse allowed)',

951

},

952

'params': {

953

'skip_download': True,

},

},

{

# Channel-like uploader_url

958

'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',

'info_dict': {

'id': 'eQcmzGIKrzg',

'ext': 'mp4',

'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',

963

'description': 'md5:dda0d780d5a6e120758d1711d062a867',

964

'duration': 4060,

965

'upload_date': '20151119',

966

'uploader': 'Bernie Sanders',

967

'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',

968

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',

969

'license': 'Creative Commons Attribution license (reuse allowed)',

970

},

971

'params': {

972

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',

977

'only_matching': True,

978

},

979

{

980

# YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)

981

'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',

982

'only_matching': True,

983

},

984

{

985

# Rental video preview

986

'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',

'info_dict': {

'id': 'uGpuVWrhIzE',

'ext': 'mp4',

'title': 'Piku - Trailer',

991

'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',

992

'upload_date': '20150811',

993

'uploader': 'FlixMatrix',

994

'uploader_id': 'FlixMatrixKaravan',

995

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',

996

'license': 'Standard YouTube License',

997

},

998

'params': {

999

'skip_download': True,

1000

},

1001

'skip': 'This video is not available.',

1002

},

1003

{

1004

# YouTube Red video with episode data

1005

'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',

'info_dict': {

'id': 'iqKdEhx-dD4',

'ext': 'mp4',

'title': 'Isolation - Mind Field (Ep 1)',

1010

'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',

1011

'duration': 2085,

1012

'upload_date': '20170118',

1013

'uploader': 'Vsauce',

1014

'uploader_id': 'Vsauce',

1015

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',

1016

'series': 'Mind Field',

'season_number': 1,

'episode_number': 1,

},

'params': {

'skip_download': True,

1022

},

1023

'expected_warnings': [

1024

'Skipping DASH manifest',

],

},

{

# The following content has been identified by the YouTube community

1029

# as inappropriate or offensive to some audiences.

1030

'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',

'info_dict': {

'id': '6SJNVb0GnPI',

'ext': 'mp4',

'title': 'Race Differences in Intelligence',

1035

'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',

1036

'duration': 965,

1037

'upload_date': '20140124',

1038

'uploader': 'New Century Foundation',

1039

'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',

1040

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',

1041

},

1042

'params': {

1043

'skip_download': True,

},

},

{

# itag 212

'url': '1t24XAntNCY',

1049

'only_matching': True,

1050

},

1051

{

1052

# geo restricted to JP

1053

'url': 'sJL6WA-aGkQ',

1054

'only_matching': True,

1055

},

1056

{

1057

'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',

1058

'only_matching': True,

1059

},

1060

{

1061

'url': 'https://invidio.us/watch?v=BaW_jenozKc',

1062

'only_matching': True,

},

{

# DRM protected

'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',

1067

'only_matching': True,

1068

},

1069

{

1070

# Video with unsupported adaptive stream type formats

1071

'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',

'info_dict': {

'id': 'Z4Vy8R84T1U',

'ext': 'mp4',

'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',

1076

'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',

1077

'duration': 433,

1078

'upload_date': '20130923',

1079

'uploader': 'Amelia Putri Harwita',

1080

'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',

1081

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',

1082

'formats': 'maxcount:10',

1083

},

1084

'params': {

1085

'skip_download': True,

1086

'youtube_include_dash_manifest': False,

},

}

]

def __init__(self, *args, **kwargs):
    """Set up the extractor and start with an empty player cache."""
    # All constructor arguments are handed to the base class untouched.
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # In-memory cache of signature functions; _decrypt_signature keys it by
    # (player URL, signature cache id).
    self._player_cache = dict()

1094

1095

def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)

1098

1099

def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)

1102

1103

def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)

1106

1107

def report_rtmp_download(self):
    """Tell the user that the download will go through the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)

1110

1111

def _signature_cache_id(self, example_sig):
    """Describe a signature by the lengths of its dot-separated parts.

    The lengths are joined with dots, so a signature shaped like
    'AAAA.BB' is described as '4.2'.
    """
    part_lengths = [
        compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)

1114

1115

def _extract_signature_function(self, video_id, player_url, example_sig):
    """Build the function that deciphers signatures for a given player.

    The player id and type (the URL's file extension, 'js' or 'swf') are
    parsed out of player_url. If the 'youtube-sigfuncs' cache already holds
    an index permutation for this player and signature shape, a function
    applying that permutation is returned. Otherwise the player is
    downloaded and parsed, the permutation it produces for a probe string
    is stored in the cache, and the parsed function is returned.

    video_id is passed through to the download helpers.
    example_sig is a sample signature; its length and the lengths of its
    dot-separated parts determine the probe string and the cache key.

    Raises ExtractorError if the player cannot be identified from its URL
    or if its type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Internal invariant: the cache key must be a plain file name.
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec lists, for each output position, the input index.
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # The extension comes from the player URL, so reject it explicitly:
        # an assert would vanish under -O and leave `res` unbound.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose i-th character has code point i;
    # the code points of the output then spell out the index permutation.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res

1160

1161

def _print_sig_code(self, func, example_sig):
    """Show Python code equivalent to the signature function func.

    func is run on a probe string as long as example_sig; the resulting
    index permutation is rendered as a sum of index and slice expressions
    over ``s`` and passed to to_screen.
    """
    def gen_sig_code(idxs):
        # Yield 's[...]' expressions that together reproduce idxs, folding
        # runs that move by a constant +1 / -1 into slices.
        def _genslice(start, end, step):
            first = '' if start == 0 else str(start)
            stop = end + step
            last = (':%d' % stop) if stop >= 0 else ':'
            stride = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (first, last, stride)

        step = None
        # Placeholder that keeps pyflakes quiet - start is always assigned
        # together with step before it is read.
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            delta = i - prev
            if step is not None:
                # Inside a run: close it as soon as the stride changes.
                if delta != step:
                    yield _genslice(start, prev, step)
                    step = None
                continue
            if delta in [-1, 1]:
                # A new run starts at the previous index.
                step = delta
                start = prev
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permuted = func(probe)
    index_spec = [ord(ch) for ch in permuted]
    expr_code = ' + '.join(gen_sig_code(index_spec))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % (part_lengths)
    template = (
        "if tuple(len(p) for p in s.split('.')) == %s:\n"
        " return %s\n")
    code = template % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)

1199

1200

def _parse_sig_js(self, jscode):
    """Locate the signature function in a JS player and wrap it.

    The name of the function is searched for in jscode with a set of
    candidate patterns, the function is then extracted with JSInterpreter.

    Returns a callable taking the encrypted signature string and returning
    the result of the extracted JS function called with that single
    argument.
    """
    funcname = self._search_regex(
        (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
         r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         # Matches `c && d.set(<key>, (<...>)(<sig>(`; the parentheses must
         # be escaped literals, a bare `$` here would make the pattern
         # unmatchable.
         r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
        jscode, 'Initial JS player signature function name', group='sig')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    # The extracted function takes its arguments as a list.
    return lambda s: initial_function([s])

1212

1213

def _parse_sig_swf(self, file_contents):
    """Wrap the 'decipher' function of a SWF player's SignatureDecipher class.

    Returns a callable taking the encrypted signature string and returning
    the result of the extracted function called with that single argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run_decipher(s):
        # The extracted function takes its arguments as a list.
        return decipher([s])

    return run_decipher

1219

1220

def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Decipher the encrypted signature s using the player at player_url.

    Protocol-relative and site-relative player URLs are made absolute
    first. Signature functions are kept in self._player_cache, keyed by
    player URL and signature shape. age_gate is accepted but not used here.

    Raises ExtractorError when player_url is None, or (with the traceback
    in the message) when anything fails while obtaining or applying the
    signature function.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        trace = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + trace, cause=e)

1246

1247

def _get_subtitles(self, video_id, webpage):
    """Return subtitle URLs for video_id, grouped by language code.

    The track list is downloaded from the timedtext endpoint; for every
    language (first track wins) one entry per format in
    self._SUBTITLE_FORMATS is built. webpage is not used here.

    Returns a dict mapping language code to a list of {'url', 'ext'}
    dicts, or an empty dict (after a warning) if the list cannot be
    downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Only the first track of each language is kept.
            continue
        formats = []
        for ext in self._SUBTITLE_FORMATS:
            query = compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            })
            formats.append({
                'url': 'https://www.youtube.com/api/timedtext?' + query,
                'ext': ext,
            })
        subtitles[lang] = formats

    if subtitles:
        return subtitles
    self._downloader.report_warning("video doesn't have subtitles")
    return {}

def _get_ytplayer_config(self, video_id, webpage):
    """Extract and parse the ``ytplayer.config`` JSON found in webpage.

    Returns the result of the non-fatal JSON parse, or None when no
    config is found in the page.
    """
    # User-supplied data inside the JSON may contain sequences such as '};'
    # that cut the looser (second) regex short, so the more specific regex
    # is listed first. This is a workaround until proper quoted-string
    # handling is implemented (see
    # https://github.com/rg3/youtube-dl/issues/7468,
    # https://github.com/rg3/youtube-dl/pull/7599)
    patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)

1296

1297

def _get_automatic_captions(self, video_id, webpage):
    """Return automatic caption URLs for video_id, grouped by language.

    The webpage is needed to obtain the captions URL; it is passed in so it
    does not have to be downloaded again.

    Three sources inside the player config args are tried in turn:
    'ttsurl', the JSON 'player_response', and the legacy 'caption_tracks' /
    'caption_translation_languages' pair. Returns a dict mapping language
    code to a list of {'url', 'ext'} dicts, or an empty dict after a
    warning when nothing usable is found.
    """
    def make_captions(sub_url, sub_langs):
        # Derive one URL per (language, format) from sub_url by overriding
        # its 'tlang' and 'fmt' query parameters.
        parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
        caption_qs = compat_parse_qs(parsed_sub_url.query)
        captions = {}
        for sub_lang in sub_langs:
            sub_formats = []
            for ext in self._SUBTITLE_FORMATS:
                caption_qs['tlang'] = [sub_lang]
                caption_qs['fmt'] = [ext]
                new_query = compat_urllib_parse_urlencode(caption_qs, True)
                sub_url = compat_urlparse.urlunparse(
                    parsed_sub_url._replace(query=new_query))
                sub_formats.append({
                    'url': sub_url,
                    'ext': ext,
                })
            captions[sub_lang] = sub_formats
        return captions

    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = "Couldn't find automatic captions for %s" % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}

    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # Fetch the list of available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(
                caption_url + '&' + list_params, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning("Video doesn't have automatic captions")
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = [
                    lang.get('languageCode')
                    for lang in renderer['translationLanguages']]
                return make_captions(
                    base_url, [code for code in lang_codes if code])

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}

1398

1399

def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback tracking URL so the video is marked as watched.

    The URL is taken from player_response
    (playbackTracking.videostatsPlaybackUrl.baseUrl) or, failing that, from
    video_info ('videostats_playback_base_url'). Nothing is done when no
    valid URL is found. The request itself is non-fatal.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
        video_info, lambda x: x['videostats_playback_base_url'][0]))
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # random.randint(0, 256) is inclusive on both ends and therefore skewed
    # the result towards the first character; pick uniformly instead.
    cpn = ''.join(random.choice(CPN_ALPHABET) for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)

1424

1425

@staticmethod
def _extract_urls(webpage):
    """Collect references to embedded YouTube videos found in webpage.

    Three kinds of embeds are recognised: regular player embeds (iframe,
    embed, object, data-video-url, SWFObject / embedSWF calls), lazyYT
    containers and the Wordpress "YouTube Video Importer" plugin. The first
    kind yields URLs, the other two yield the value of their video id
    attribute.

    Returns a list, empty when nothing is found.
    """
    # Embedded YouTube player
    # `embedSWF\(\s*` covers calls such as embedSWF("<url>", ...); the
    # older `embedSWF\(?:\s*` alternative (which needs a literal colon) is
    # kept so that nothing matched before stops matching.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            embedSWF\(?:\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(list(map(
        unescapeHTML,
        re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a tuple of all groups; the video id is the last one.
    entries.extend(m[-1] for m in matches)

    return entries

@staticmethod
def _extract_url(webpage):
    """Return the first embed found by _extract_urls, or None if there is none."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]

1461

1462

@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of the class's _VALID_URL match.

    Raises ExtractorError when url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)

1469

1470

def _extract_annotations(self, video_id):

1471

url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id

1472

return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')

1473

1474

@staticmethod

1475

def _extract_chapters(description, duration):

1476

if not description:

1477

return None

1478

chapter_lines = re.findall(

1479

r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',

1480

description)

1481

if not chapter_lines:

1482

return None

1483

chapters = []

1484

for next_num, (chapter_line, time_point) in enumerate(

1485

chapter_lines, start=1):

1486

start_time = parse_duration(time_point)

1487

if start_time is None:

1488

continue

1489

if start_time > duration:

1490

break

1491

end_time = (duration if next_num == len(chapter_lines)

1492

else parse_duration(chapter_lines[next_num][1]))

1493

if end_time is None:

1494

continue

1495

if end_time > duration:

1496

end_time = duration

1497

if start_time > end_time:

1498

break

1499

chapter_title = re.sub(

1500

r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')

1501

chapter_title = re.sub(r'\s+', ' ', chapter_title)

1502

chapters.append({

1503

'start_time': start_time,

1504

'end_time': end_time,

1505

'title': chapter_title,

})

return chapters

def _real_extract(self, url):

1510

url, smuggled_data = unsmuggle_url(url, {})

1511

1512

proto = (

1513

'http' if self._downloader.params.get('prefer_insecure', False)

else 'https')

start_time = None

end_time = None

parsed_url = compat_urllib_parse_urlparse(url)

1519

for component in [parsed_url.fragment, parsed_url.query]:

1520

query = compat_parse_qs(component)

1521

if start_time is None and 't' in query:

1522

start_time = parse_duration(query['t'][0])

1523

if start_time is None and 'start' in query:

1524

start_time = parse_duration(query['start'][0])

1525

if end_time is None and 'end' in query:

1526

end_time = parse_duration(query['end'][0])

1527

1528

# Extract original video URL from URL with redirection, like age verification, using next_url parameter

1529

mobj = re.search(self._NEXT_URL_RE, url)

1530

if mobj:

1531

url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')

1532

video_id = self.extract_id(url)

1533

1534

# Get video webpage

1535

url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id

1536

video_webpage = self._download_webpage(url, video_id)

1537

1538

# Attempt to extract SWF player URL

1539

mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)

1540

if mobj is not None:

1541

player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

else:

player_url = None

dash_mpds = []

def add_dash_mpd(video_info):

1548

dash_mpd = video_info.get('dashmpd')

1549

if dash_mpd and dash_mpd[0] not in dash_mpds:

1550

dash_mpds.append(dash_mpd[0])

1551

1552

def add_dash_mpd_pr(pl_response):

1553

dash_mpd = url_or_none(try_get(

1554

pl_response, lambda x: x['streamingData']['dashManifestUrl'],

1555

compat_str))

1556

if dash_mpd and dash_mpd not in dash_mpds:

1557

dash_mpds.append(dash_mpd)

is_live = None

view_count = None

def extract_view_count(v_info):

1563

return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))

player_response = {}

# Get video info

embed_webpage = None

if re.search(r'player-age-gate-content">', video_webpage) is not None:

1570

age_gate = True

1571

# We simulate the access to the video from www.youtube.com/v/{video_id}

1572

# this can be viewed without login into Youtube

1573

url = proto + '://www.youtube.com/embed/%s' % video_id

1574

embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')

1575

data = compat_urllib_parse_urlencode({

1576

'video_id': video_id,

1577

'eurl': 'https://youtube.googleapis.com/v/' + video_id,

1578

'sts': self._search_regex(

1579

r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),

1580

})

1581

video_info_url = proto + '://www.youtube.com/get_video_info?' + data

1582

video_info_webpage = self._download_webpage(

1583

video_info_url, video_id,

1584

note='Refetching age-gated info webpage',

1585

errnote='unable to download video info webpage')

1586

video_info = compat_parse_qs(video_info_webpage)

1587

add_dash_mpd(video_info)

else:

age_gate = False

video_info = None

sts = None

# Try looking directly into the video webpage

1593

ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)

1594

if ytplayer_config:

1595

args = ytplayer_config['args']

1596

if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):

1597

# Convert to the same format returned by compat_parse_qs

1598

video_info = dict((k, [v]) for k, v in args.items())

1599

add_dash_mpd(video_info)

1600

# Rental video is not rented but preview is available (e.g.

1601

# https://www.youtube.com/watch?v=yYr8q0y5Jfg,

1602

# https://github.com/rg3/youtube-dl/issues/10532)

1603

if not video_info and args.get('ypc_vid'):

1604

return self.url_result(

1605

args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])

1606

if args.get('livestream') == '1' or args.get('live_playback') == 1:

1607

is_live = True

1608

sts = ytplayer_config.get('sts')

1609

if not player_response:

1610

pl_response = str_or_none(args.get('player_response'))

1611

if pl_response:

1612

pl_response = self._parse_json(pl_response, video_id, fatal=False)

1613

if isinstance(pl_response, dict):

1614

player_response = pl_response

1615

if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):

1616

add_dash_mpd_pr(player_response)

1617

# We also try looking in get_video_info since it may contain different dashmpd

1618

# URL that points to a DASH manifest with possibly different itag set (some itags

1619

# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH

1620

# manifest pointed by get_video_info's dashmpd).

1621

# The general idea is to take a union of itags of both DASH manifests (for example

1622

# video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)

1623

self.report_video_info_webpage_download(video_id)

1624

for el in ('info', 'embedded', 'detailpage', 'vevo', ''):

1625

query = {

1626

'video_id': video_id,

'ps': 'default',

'eurl': '',

'gl': 'US',

'hl': 'en',

}

if el:

query['el'] = el

if sts:

query['sts'] = sts

video_info_webpage = self._download_webpage(

1637

'%s://www.youtube.com/get_video_info' % proto,

1638

video_id, note=False,

1639

errnote='unable to download video info webpage',

1640

fatal=False, query=query)

1641

if not video_info_webpage:

1642

continue

1643

get_video_info = compat_parse_qs(video_info_webpage)

1644

if not player_response:

1645

pl_response = get_video_info.get('player_response', [None])[0]

1646

if isinstance(pl_response, dict):

1647

player_response = pl_response

1648

add_dash_mpd_pr(player_response)

1649

add_dash_mpd(get_video_info)

1650

if view_count is None:

1651

view_count = extract_view_count(get_video_info)

1652

if not video_info:

1653

video_info = get_video_info

1654

if 'token' in get_video_info:

1655

# Different get_video_info requests may report different results, e.g.

1656

# some may report video unavailability, but some may serve it without

1657

# any complaint (see https://github.com/rg3/youtube-dl/issues/7362,

1658

# the original webpage as well as el=info and el=embedded get_video_info

1659

# requests report video unavailability due to geo restriction while

1660

# el=detailpage succeeds and returns valid data). This is probably

1661

# due to YouTube measures against IP ranges of hosting providers.

1662

# Working around by preferring the first succeeded video_info containing

1663

# the token if no such video_info yet was found.

1664

if 'token' not in video_info:

1665

video_info = get_video_info

1666

break

1667

1668

def extract_unavailable_message():

1669

return self._html_search_regex(

1670

r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',

1671

video_webpage, 'unavailable message', default=None)

1672

1673

if 'token' not in video_info:

1674

if 'reason' in video_info:

1675

if 'The uploader has not made this video available in your country.' in video_info['reason']:

1676

regions_allowed = self._html_search_meta(

1677

'regionsAllowed', video_webpage, default=None)

1678

countries = regions_allowed.split(',') if regions_allowed else None

1679

self.raise_geo_restricted(

1680

msg=video_info['reason'][0], countries=countries)

1681

reason = video_info['reason'][0]

1682

if 'Invalid parameters' in reason:

1683

unavailable_message = extract_unavailable_message()

1684

if unavailable_message:

1685

reason = unavailable_message

1686

raise ExtractorError(

1687

'YouTube said: %s' % reason,

1688

expected=True, video_id=video_id)

1689

else:

1690

raise ExtractorError(

1691

'"token" parameter not in video info for unknown reason',

1692

video_id=video_id)

1693

1694

if video_info.get('license_info'):

1695

raise ExtractorError('This video is DRM protected.', expected=True)

1696

1697

video_details = try_get(

1698

player_response, lambda x: x['videoDetails'], dict) or {}

1699

1700

# title

1701

if 'title' in video_info:

1702

video_title = video_info['title'][0]

1703

elif 'title' in player_response:

1704

video_title = video_details['title']

1705

else:

1706

self._downloader.report_warning('Unable to extract video title')

video_title = '_'

# description

description_original = video_description = get_element_by_id("eow-description", video_webpage)

1711

if video_description:

1712

1713

def replace_url(m):

1714

redir_url = compat_urlparse.urljoin(url, m.group(1))

1715

parsed_redir_url = compat_urllib_parse_urlparse(redir_url)

1716

if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':

1717

qs = compat_parse_qs(parsed_redir_url.query)

q = qs.get('q')

if q and q[0]:

return q[0]

return redir_url

description_original = video_description = re.sub(r'''(?x)

1724

<a\s+

1725

(?:[a-zA-Z-]+="[^"]*"\s+)*?

1726

(?:title|href)="([^"]+)"\s+

1727

(?:[a-zA-Z-]+="[^"]*"\s+)*?

class="[^"]*"[^>]*>

[^<]+\.{3}\s*

</a>

''', replace_url, video_description)

1732

video_description = clean_html(video_description)

1733

else:

1734

fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)

1735

if fd_mobj:

1736

video_description = unescapeHTML(fd_mobj.group(1))

1737

else:

1738

video_description = ''

1739

1740

if not smuggled_data.get('force_singlefeed', False):

1741

if not self._downloader.params.get('noplaylist'):

1742

multifeed_metadata_list = try_get(

1743

player_response,

1744

lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],

1745

compat_str) or try_get(

1746

video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)

1747

if multifeed_metadata_list:

1748

entries = []

1749

feed_ids = []

1750

for feed in multifeed_metadata_list.split(','):

1751

# Unquote should take place before split on comma (,) since textual

1752

# fields may contain comma as well (see

1753

# https://github.com/rg3/youtube-dl/issues/8536)

1754

feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))

1755

entries.append({

1756

'_type': 'url_transparent',

1757

'ie_key': 'Youtube',

1758

'url': smuggle_url(

1759

'%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),

1760

{'force_singlefeed': True}),

1761

'title': '%s (%s)' % (video_title, feed_data['title'][0]),

1762

})

1763

feed_ids.append(feed_data['id'][0])

1764

self.to_screen(

1765

'Downloading multifeed video (%s) - add --no-playlist to just download video %s'

1766

% (', '.join(feed_ids), video_id))

1767

return self.playlist_result(entries, video_id, video_title, video_description)

1768

else:

1769

self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

1770

1771

if view_count is None:

1772

view_count = extract_view_count(video_info)

1773

if view_count is None and video_details:

1774

view_count = int_or_none(video_details.get('viewCount'))

1775

1776

# Check for "rental" videos

1777

if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:

1778

raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)

1779

1780

def _extract_filesize(media_url):

1781

return int_or_none(self._search_regex(

1782

r'\bclen[=/](\d+)', media_url, 'filesize', default=None))

1783

1784

if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):

1785

self.report_rtmp_download()

1786

formats = [{

1787

'format_id': '_rtmp',

1788

'protocol': 'rtmp',

1789

'url': video_info['conn'][0],

1790

'player_url': player_url,

1791

}]

1792

elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):

1793

encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]

1794

if 'rtmpe%3Dyes' in encoded_url_map:

1795

raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)

1796

formats_spec = {}

1797

fmt_list = video_info.get('fmt_list', [''])[0]

1798

if fmt_list:

1799

for fmt in fmt_list.split(','):

1800

spec = fmt.split('/')

1801

if len(spec) > 1:

1802

width_height = spec[1].split('x')

1803

if len(width_height) == 2:

1804

formats_spec[spec[0]] = {

1805

'resolution': spec[1],

1806

'width': int_or_none(width_height[0]),

1807

'height': int_or_none(width_height[1]),

1808

}

1809

q = qualities(['small', 'medium', 'hd720'])

1810

streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)

1811

if streaming_formats:

1812

for fmt in streaming_formats:

1813

itag = str_or_none(fmt.get('itag'))

1814

if not itag:

1815

continue

1816

quality = fmt.get('quality')

1817

quality_label = fmt.get('qualityLabel') or quality

1818

formats_spec[itag] = {

1819

'asr': int_or_none(fmt.get('audioSampleRate')),

1820

'filesize': int_or_none(fmt.get('contentLength')),

1821

'format_note': quality_label,

1822

'fps': int_or_none(fmt.get('fps')),

1823

'height': int_or_none(fmt.get('height')),

1824

'quality': q(quality),

1825

# bitrate for itag 43 is always 2147483647

1826

'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,

1827

'width': int_or_none(fmt.get('width')),

1828

}

1829

formats = []

1830

for url_data_str in encoded_url_map.split(','):

1831

url_data = compat_parse_qs(url_data_str)

1832

if 'itag' not in url_data or 'url' not in url_data:

1833

continue

1834

stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))

1835

# Unsupported FORMAT_STREAM_TYPE_OTF

1836

if stream_type == 3:

1837

continue

1838

format_id = url_data['itag'][0]

1839

url = url_data['url'][0]

1840

1841

if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):

1842

ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

1843

jsplayer_url_json = self._search_regex(

1844

ASSETS_RE,

1845

embed_webpage if age_gate else video_webpage,

1846

'JS player URL (1)', default=None)

1847

if not jsplayer_url_json and not age_gate:

1848

# We need the embed website after all

1849

if embed_webpage is None:

1850

embed_url = proto + '://www.youtube.com/embed/%s' % video_id

1851

embed_webpage = self._download_webpage(

1852

embed_url, video_id, 'Downloading embed webpage')

1853

jsplayer_url_json = self._search_regex(

1854

ASSETS_RE, embed_webpage, 'JS player URL')

1855

1856

player_url = json.loads(jsplayer_url_json)

1857

if player_url is None:

1858

player_url_json = self._search_regex(

1859

r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',

1860

video_webpage, 'age gate player URL')

1861

player_url = json.loads(player_url_json)

1862

1863

if 'sig' in url_data:

1864

url += '&signature=' + url_data['sig'][0]

1865

elif 's' in url_data:

1866

encrypted_sig = url_data['s'][0]

1867

1868

if self._downloader.params.get('verbose'):

1869

if player_url is None:

1870

player_version = 'unknown'

1871

player_desc = 'unknown'

1872

else:

1873

if player_url.endswith('swf'):

1874

player_version = self._search_regex(

1875

r'-(.+?)(?:/watch_as3)?\.swf$', player_url,

1876

'flash player', fatal=False)

1877

player_desc = 'flash player %s' % player_version

1878

else:

1879

player_version = self._search_regex(

1880

[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',

1881

r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],

1882

player_url,

1883

'html5 player', fatal=False)

1884

player_desc = 'html5 player %s' % player_version

1885

1886

parts_sizes = self._signature_cache_id(encrypted_sig)

1887

self.to_screen('{%s} signature length %s, %s' %

1888

(format_id, parts_sizes, player_desc))

1889

1890

signature = self._decrypt_signature(

1891

encrypted_sig, video_id, player_url, age_gate)

1892

url += '&signature=' + signature

1893

if 'ratebypass' not in url:

1894

url += '&ratebypass=yes'

1895

1896

dct = {

1897

'format_id': format_id,

1898

'url': url,

1899

'player_url': player_url,

1900

}

1901

if format_id in self._formats:

1902

dct.update(self._formats[format_id])

1903

if format_id in formats_spec:

1904

dct.update(formats_spec[format_id])

1905

1906

# Some itags are not included in DASH manifest thus corresponding formats will

1907

# lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).

1908

# Trying to extract metadata from url_encoded_fmt_stream_map entry.

1909

mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])

1910

width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

1911

1912

filesize = int_or_none(url_data.get(

1913

'clen', [None])[0]) or _extract_filesize(url)

1914

1915

quality = url_data.get('quality', [None])[0]

1916

1917

more_fields = {

1918

'filesize': filesize,

1919

'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),

1920

'width': width,

1921

'height': height,

1922

'fps': int_or_none(url_data.get('fps', [None])[0]),

1923

'format_note': url_data.get('quality_label', [None])[0] or quality,

1924

'quality': q(quality),

1925

}

1926

for key, value in more_fields.items():

1927

if value:

1928

dct[key] = value

1929

type_ = url_data.get('type', [None])[0]

1930

if type_:

1931

type_split = type_.split(';')

1932

kind_ext = type_split[0].split('/')

1933

if len(kind_ext) == 2:

1934

kind, _ = kind_ext

1935

dct['ext'] = mimetype2ext(type_split[0])

1936

if kind in ('audio', 'video'):

1937

codecs = None

1938

for mobj in re.finditer(

1939

r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):

1940

if mobj.group('key') == 'codecs':

1941

codecs = mobj.group('val')

1942

break

1943

if codecs:

1944

dct.update(parse_codecs(codecs))

1945

if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':

1946

dct['downloader_options'] = {

1947

# Youtube throttles chunks >~10M

1948

'http_chunk_size': 10485760,

}

formats.append(dct)

else:

manifest_url = (

url_or_none(try_get(

player_response,

lambda x: x['streamingData']['hlsManifestUrl'],

1956

compat_str)) or

1957

url_or_none(try_get(

1958

video_info, lambda x: x['hlsvp'][0], compat_str)))

1959

if manifest_url:

1960

formats = []

1961

m3u8_formats = self._extract_m3u8_formats(

1962

manifest_url, video_id, 'mp4', fatal=False)

1963

for a_format in m3u8_formats:

1964

itag = self._search_regex(

1965

r'/itag/(\d+)/', a_format['url'], 'itag', default=None)

1966

if itag:

1967

a_format['format_id'] = itag

1968

if itag in self._formats:

1969

dct = self._formats[itag].copy()

1970

dct.update(a_format)

1971

a_format = dct

1972

a_format['player_url'] = player_url

1973

# Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming

1974

a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'

1975

formats.append(a_format)

1976

else:

1977

error_message = clean_html(video_info.get('reason', [None])[0])

1978

if not error_message:

1979

error_message = extract_unavailable_message()

1980

if error_message:

1981

raise ExtractorError(error_message, expected=True)

1982

raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')

1983

1984

# uploader

1985

video_uploader = try_get(

1986

video_info, lambda x: x['author'][0],

1987

compat_str) or str_or_none(video_details.get('author'))

1988

if video_uploader:

1989

video_uploader = compat_urllib_parse_unquote_plus(video_uploader)

1990

else:

1991

self._downloader.report_warning('unable to extract uploader name')

1992

1993

# uploader_id

1994

video_uploader_id = None

1995

video_uploader_url = None

1996

mobj = re.search(

1997

r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',

1998

video_webpage)

1999

if mobj is not None:

2000

video_uploader_id = mobj.group('uploader_id')

2001

video_uploader_url = mobj.group('uploader_url')

2002

else:

2003

self._downloader.report_warning('unable to extract uploader nickname')

2004

2005

channel_id = self._html_search_meta(

2006

'channelId', video_webpage, 'channel id')

2007

channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None

2008

2009

# thumbnail image

2010

# We try first to get a high quality image:

2011

m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',

2012

video_webpage, re.DOTALL)

2013

if m_thumb is not None:

2014

video_thumbnail = m_thumb.group(1)

2015

elif 'thumbnail_url' not in video_info:

2016

self._downloader.report_warning('unable to extract video thumbnail')

2017

video_thumbnail = None

2018

else: # don't panic if we can't find it

2019

video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

2020

2021

# upload date

2022

upload_date = self._html_search_meta(

2023

'datePublished', video_webpage, 'upload date', default=None)

2024

if not upload_date:

2025

upload_date = self._search_regex(

2026

[r'(?s)id="eow-date.*?>(.*?)</span>',

2027

r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],

2028

video_webpage, 'upload date', default=None)

2029

upload_date = unified_strdate(upload_date)

2030

2031

video_license = self._html_search_regex(

2032

r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',

2033

video_webpage, 'license', default=None)

m_music = re.search(

r'''(?x)

<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*

<ul[^>]*>\s*

<li>(?P<title>.+?)

by (?P<creator>.+?)

(?:

$.+?$|

<a[^>]*

(?:

\bhref=["\']/red[^>]*>| # drop possible

2046

>\s*Listen ad-free with YouTube Red # YouTube Red ad

)

.*?

)?</li

''',

video_webpage)

if m_music:

video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))

2054

video_creator = clean_html(m_music.group('creator'))

2055

else:

2056

video_alt_title = video_creator = None

2057

2058

def extract_meta(field):

2059

return self._html_search_regex(

2060

r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,

2061

video_webpage, field, default=None)

2062

2063

track = extract_meta('Song')

2064

artist = extract_meta('Artist')

2065

2066

m_episode = re.search(

2067

r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',

2068

video_webpage)

2069

if m_episode:

2070

series = unescapeHTML(m_episode.group('series'))

2071

season_number = int(m_episode.group('season'))

2072

episode_number = int(m_episode.group('episode'))

2073

else:

2074

series = season_number = episode_number = None

2075

2076

m_cat_container = self._search_regex(

2077

r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',

2078

video_webpage, 'categories', default=None)

2079

if m_cat_container:

2080

category = self._html_search_regex(

2081

r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',

2082

default=None)

2083

video_categories = None if category is None else [category]

2084

else:

2085

video_categories = None

2086

2087

video_tags = [

2088

unescapeHTML(m.group('content'))

2089

for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

2090

2091

def _extract_count(count_name):

2092

return str_to_int(self._search_regex(

2093

r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'

2094

% re.escape(count_name),

2095

video_webpage, count_name, default=None))

2096

2097

like_count = _extract_count('like')

2098

dislike_count = _extract_count('dislike')

2099

2100

if view_count is None:

2101

view_count = str_to_int(self._search_regex(

2102

r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,

2103

'view count', default=None))

2104

2105

# subtitles

2106

video_subtitles = self.extract_subtitles(video_id, video_webpage)

2107

automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

2108

2109

video_duration = try_get(

2110

video_info, lambda x: int_or_none(x['length_seconds'][0]))

2111

if not video_duration:

2112

video_duration = int_or_none(video_details.get('lengthSeconds'))

2113

if not video_duration:

2114

video_duration = parse_duration(self._html_search_meta(

2115

'duration', video_webpage, 'video duration'))

2116

2117

# annotations

2118

video_annotations = None

2119

if self._downloader.params.get('writeannotations', False):

2120

video_annotations = self._extract_annotations(video_id)

2121

2122

chapters = self._extract_chapters(description_original, video_duration)

2123

2124

# Look for the DASH manifest

2125

if self._downloader.params.get('youtube_include_dash_manifest', True):

2126

dash_mpd_fatal = True

2127

for mpd_url in dash_mpds:

2128

dash_formats = {}

2129

try:

2130

def decrypt_sig(mobj):

2131

s = mobj.group(1)

2132

dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)

2133

return '/signature/%s' % dec_s

2134

2135

mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

2136

2137

for df in self._extract_mpd_formats(

2138

mpd_url, video_id, fatal=dash_mpd_fatal,

2139

formats_dict=self._formats):

2140

if not df.get('filesize'):

2141

df['filesize'] = _extract_filesize(df['url'])

2142

# Do not overwrite DASH format found in some previous DASH manifest

2143

if df['format_id'] not in dash_formats:

2144

dash_formats[df['format_id']] = df

2145

# Additional DASH manifests may end up in HTTP Error 403 therefore

2146

# allow them to fail without bug report message if we already have

2147

# some DASH manifest succeeded. This is temporary workaround to reduce

2148

# burst of bug reports until we figure out the reason and whether it

2149

# can be fixed at all.

2150

dash_mpd_fatal = False

2151

except (ExtractorError, KeyError) as e:

2152

self.report_warning(

2153

'Skipping DASH manifest: %r' % e, video_id)

2154

if dash_formats:

2155

# Remove the formats we found through non-DASH, they

2156

# contain less info and it can be wrong, because we use

2157

# fixed values (for example the resolution). See

2158

# https://github.com/rg3/youtube-dl/issues/5774 for an

2159

# example.

2160

formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]

2161

formats.extend(dash_formats.values())

2162

2163

# Check for malformed aspect ratio

2164

stretched_m = re.search(

2165

r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',

2166

video_webpage)

2167

if stretched_m:

2168

w = float(stretched_m.group('w'))

2169

h = float(stretched_m.group('h'))

2170

# yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).

2171

# We will only process correct ratios.

if w > 0 and h > 0:

ratio = w / h

for f in formats:

if f.get('vcodec') != 'none':

2176

f['stretched_ratio'] = ratio

2177

2178

self._sort_formats(formats)

2179

2180

self.mark_watched(video_id, video_info, player_response)

return {

'id': video_id,

'uploader': video_uploader,

2185

'uploader_id': video_uploader_id,

2186

'uploader_url': video_uploader_url,

2187

'channel_id': channel_id,

2188

'channel_url': channel_url,

2189

'upload_date': upload_date,

2190

'license': video_license,

2191

'creator': video_creator or artist,

2192

'title': video_title,

2193

'alt_title': video_alt_title or track,

2194

'thumbnail': video_thumbnail,

2195

'description': video_description,

2196

'categories': video_categories,

2197

'tags': video_tags,

2198

'subtitles': video_subtitles,

2199

'automatic_captions': automatic_captions,

2200

'duration': video_duration,

2201

'age_limit': 18 if age_gate else 0,

2202

'annotations': video_annotations,

2203

'chapters': chapters,

2204

'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,

2205

'view_count': view_count,

2206

'like_count': like_count,

2207

'dislike_count': dislike_count,

2208

'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),

2209

'formats': formats,

2210

'is_live': is_live,

2211

'start_time': start_time,

2212

'end_time': end_time,

2213

'series': series,

2214

'season_number': season_number,

2215

'episode_number': episode_number,

'track': track,

'artist': artist,

}

class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists, given as a URL or as a bare playlist id."""

    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
        'skip': 'This playlist is private',
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2017 華語最新單曲 (2/24更新)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'license': 'Standard YouTube License',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Run the login routine before any extraction takes place."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Collect the videos of a mix and return them as a playlist result.

        Watch pages are downloaded one after another, each one seeded with
        the last video id collected so far, until a page contributes no video
        id that has not been seen before. The title is scraped from the last
        page downloaded.
        """
        # The mixes are generated from a single video; the id of the playlist
        # is just 'RD' + video_id, hence the initial seed below.
        seed_id = playlist_id[-11:]
        # Loop-invariant, so it is built once up front.
        entry_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id)

        collected = []
        for page_num in itertools.count(1):
            watch_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
            webpage = self._download_webpage(
                watch_url, playlist_id,
                'Downloading page {0} of Youtube mix'.format(page_num))
            # Fetch new pages until all the videos are repeated, it seems
            # that there are always 51 unique videos.
            fresh = []
            for candidate in orderedSet(re.findall(entry_re, webpage)):
                if candidate not in collected:
                    fresh.append(candidate)
            if not fresh:
                break
            collected.extend(fresh)
            seed_id = collected[-1]

        url_results = self._ids_to_results(collected)

        # Try the known title containers in order; keep the first non-empty
        # hit, or whatever the last lookup returned when none matches.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break

        return self.playlist_result(url_results, playlist_id, clean_html(title_span))

    def _extract_playlist(self, playlist_id):
        """Download the playlist page and build the playlist result.

        Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is False
        only when the page has no playlist title and yields no entries.

        Raises ExtractorError (expected) when an alert on the page says the
        playlist does not exist, is private, or the parameters are invalid.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
        for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            alert = alert.strip()
            # Check if the playlist exists or is private
            unavailable = re.match(
                r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
            if unavailable:
                reason = unavailable.group('reason')
                hint = ', use --username or --netrc to access it' if 'private' in reason else ''
                raise ExtractorError('This playlist %s%s.' % (reason, hint), expected=True)
            if re.match(r'[^<]*Invalid parameters[^<]*', alert):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            if re.match(r'[^<]*Choose your language[^<]*', alert):
                continue
            self.report_warning('Youtube gives an alert message: ' + alert)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        uploader_id = uploader_url = None
        uploader_link = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if uploader_link:
            uploader_id = uploader_link.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, uploader_link.group('path'))

        has_videos = True
        if not playlist_title:
            # Some playlist URLs don't actually serve a playlist (e.g.
            # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
            try:
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist['uploader'] = uploader
        playlist['uploader_id'] = uploader_id
        playlist['uploader_url'] = uploader_url

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Decide whether only the single video named by ``url`` is wanted.

        Returns ``(video_id, result)``: ``video_id`` is the id found in the
        URL or None; ``result`` is a url_result for that video when the
        ``noplaylist`` parameter is set, otherwise None.
        """
        # Check if it's a video-specific URL
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)

        if not video_id:
            return None, None

        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    def _real_extract(self, url):
        """Dispatch ``url`` to single-video, mix, or regular playlist extraction."""
        # Extract playlist id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = url_match.group(1) or url_match.group(2)

        video_id, single_video = self._check_download_just_video(url, playlist_id)
        if single_video:
            return single_video

        # Mixes require a custom extraction process
        if playlist_id.startswith(('RD', 'UL', 'PU')):
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if not has_videos and video_id:
            # Some playlist URLs don't actually serve a playlist (see
            # https://github.com/rg3/youtube-dl/issues/10537).
            # Fallback to plain video extraction if there is a video id
            # along with playlist id.
            return self.url_result(video_id, 'Youtube', video_id=video_id)

        return playlist

2524

2525

2526

class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for the video listing of a YouTube channel URL."""

    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return False for URLs that YoutubePlaylistsIE or YoutubeLiveIE accept."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos-page URL for ``channel_id`` (``url`` is unused here)."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Return the channel's videos, preferably via its uploads playlist.

        Falls back to scraping the channel page itself when no ``UC...``
        channel id can be obtained. Raises ExtractorError (expected) when the
        page has no entries but carries an alert message.
        """
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        probe_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)

        channel_playlist_id = False
        if probe_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not channel_playlist_id:
                # No channelId meta tag: try the app deep-link meta tags.
                app_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    probe_page, 'channel url', default=None)
                if app_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_url, 'channel id', default=None)

        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' and hand over to the playlist extractor.
            uploads_path = '/playlist?list=%s' % ('UU' + channel_playlist_id[2:])
            return self.url_result(
                compat_urlparse.urljoin(url, uploads_path), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_re = r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"'''

        if re.search(autogenerated_re, channel_page) is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id, video_title in self.extract_videos_from_page(channel_page):
                entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(entries, channel_id)

        # Probe for at least one entry; when there is none, surface the alert
        # shown on the page (if any) instead of returning an empty playlist.
        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)

2618

2619

2620

class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for the videos of a YouTube user (URL or "ytuser" keyword)."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'

    # NOTE(review): _build_template_url() reads the 'user' and 'id' groups of
    # self._VALID_URL, but no _VALID_URL assignment is visible in this class
    # as shown here - confirm that a pattern providing both groups is defined.
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other Youtube*IE class in this module accepts the URL.

        The pattern of this extractor is too permissive and would otherwise
        also match URLs that another YouTube extractor can handle.
        """
        def is_other_youtube_ie(name, klass):
            # Module-level names of the form Youtube...IE, excluding this class
            return name.startswith('Youtube') and name.endswith('IE') and klass is not cls

        claimed_elsewhere = any(
            klass.suitable(url)
            for name, klass in globals().items()
            if is_other_youtube_ie(name, klass))
        if claimed_elsewhere:
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Fill _TEMPLATE_URL with the path kind and id matched from the URL.

        The path kind falls back to 'user' when the 'user' group did not
        match. The channel_id argument is not used here.
        """
        match = re.match(self._VALID_URL, url)
        path_kind = match.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, match.group('id'))

2670

2671

2672

class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for the /live URL of a YouTube user or channel."""
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to a single video, or fall back to the base URL.

        The page is downloaded non-fatally. When it reports an og:type
        starting with 'video' and an 11-character 'videoId' meta value, the
        result points at that video; in every other case the result points
        at the URL with the trailing /live removed.
        """
        match = re.match(self._VALID_URL, url)
        channel_id, base_url = match.group('id'), match.group('base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        is_video_page = page_type.startswith('video')
        if is_video_page and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)

2722

2723

2724

class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the /playlists page of a YouTube user or channel."""
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]

class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base of the search extractors; only overrides the video link pattern."""
    # Captures the 11-character video id of a /watch link and, when a title
    # attribute follows in the same tag, the title as well.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'

2756

2757

2758

class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for "ytsearch" keyword queries against YouTube search."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Additional query-string parameters merged into the search request
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are requested one after another, following the "Next"
        link of each page, until at least n videos are collected, a page
        yields no videos, or no next link is found. The collected list is cut
        down to n entries and returned as a playlist result titled with the
        query.

        Raises ExtractorError (expected) when a page contains a search
        message block instead of results.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as enough videos are available: with a strict '>'
            # comparison an extra page was downloaded whenever exactly n
            # videos had been collected, only to be discarded below.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # The last page may overshoot the requested amount
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)

2806

2807

2808

class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that adds a sort-by-upload-date query argument."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}

2813

2814

2815

class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for YouTube /results search URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the search URL and return the videos found on that page.

        The decoded search query is used both as the download id and as the
        title of the resulting playlist.
        """
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)

2835

2836

2837

class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for YouTube /show/ URLs."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the parent extractor using the show's /playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)

2854

2855

2856

class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name of the subclass."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _entries(self, page):
        """Yield results for every not-yet-seen video id on the feed pages.

        Starts with the given page HTML and keeps following the "load more"
        link for as long as one is present and the latest portion still
        contributed new video ids.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = []
        content_html = more_widget_html = page
        for page_num in itertools.count(1):
            found = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            fresh_ids = [vid for vid in orderedSet(found) if vid not in seen_ids]
            if not fresh_ids:
                break

            seen_ids.extend(fresh_ids)

            for entry in self._ids_to_results(fresh_ids):
                yield entry

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more_url = 'https://youtube.com/%s' % more_match.group('more')
            more = self._download_json(
                more_url, self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist.

        The feed is selected by _FEED_NAME; the url argument is not used
        for the download.
        """
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)

2907

2908

2909

class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the watch later list (playlist id 'WL')."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single-video result if one is selected, else the 'WL' playlist."""
        _unused, single_video = self._check_download_just_video(url, 'WL')
        if single_video:
            return single_video
        _unused, watch_later = self._extract_playlist('WL')
        return watch_later

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and hand it to YoutubePlaylist.

        The page fetched is always /my_favorites; the url argument is not
        used for the download.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # First "list=" value on the page, terminated by a quote or ampersand
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')

2940

2941

2942

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'

2947

2948

2949

class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

2954

2955

2956

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'

2961

2962

2963

class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches YouTube URLs that lack a video id and reports them as truncated."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose-mode pattern: a watch URL carrying at most one of the listed
    # parameters, or an attribution_link URL with only its "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError hinting at a missing shell quote."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)

class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose video id is only 1 to 10 characters long."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id and URL."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)